In [6]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-win_amd64.whl (11.0 MB)
     ---------------------------------------- 11.0/11.0 MB 6.6 MB/s eta 0:00:00
Collecting scipy>=1.6.0
  Downloading scipy-1.14.1-cp311-cp311-win_amd64.whl (44.8 MB)
     ---------------------------------------- 44.8/44.8 MB 3.9 MB/s eta 0:00:00
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
     -------------------------------------- 301.8/301.8 kB 9.1 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 scipy-1.14.1 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  accuracy_score


In [None]:
# Loading the dataset
df = pd.read_csv("data.csv")

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# Converting Text to numeric for further processing
# City_Tier is integer-encoded into a NEW column named 'City Tier' (with a space);
# the modelling cells further down select the feature by that exact name.
label_encoder = LabelEncoder()
df['City Tier'] = label_encoder.fit_transform(df['City_Tier'])
# One-hot encode Occupation; drop_first=True omits the dummy column of the first category.
df = pd.get_dummies(df,columns = ['Occupation'], drop_first = True)

In [None]:
df.info()

In [None]:
# Cast the boolean Occupation dummy columns to 0/1 integers for further processing
occupation_dummy_cols = ['Occupation_Retired', 'Occupation_Self_Employed', 'Occupation_Student']
df[occupation_dummy_cols] = df[occupation_dummy_cols].astype(int)

In [None]:
# The raw City_Tier text column is no longer needed: its encoded copy lives in 'City Tier'
df = df.drop(columns=['City_Tier'])

In [None]:
# Removing outliers in the dataset

def remove_outliers(df, columns=None, iqr_multiplier=1.5):
    """Drop rows lying outside the IQR fences of the checked columns.

    Columns are processed one after another, and the quartiles of each column
    are computed on the rows that survived the previous columns. The result
    therefore depends on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, never modified.
    columns : list of str, optional
        Columns to check. Defaults to the income, demographic, expense and
        potential-savings columns used in this notebook.
    iqr_multiplier : float, default 1.5
        Fence width ``m``: rows are kept when the value lies inside
        ``[Q1 - m * IQR, Q3 + m * IQR]`` (bounds included).

    Returns
    -------
    pandas.DataFrame
        The filtered copy with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns is None:
        columns = ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment', 'Insurance',
                   'Groceries', 'Transport', 'Eating_Out', 'Entertainment', 'Utilities',
                   'Healthcare', 'Education', 'Miscellaneous',
                   'Savings',  'Potential_Savings_Groceries',
                   'Potential_Savings_Transport', 'Potential_Savings_Eating_Out',
                   'Potential_Savings_Entertainment', 'Potential_Savings_Utilities',
                   'Potential_Savings_Healthcare', 'Potential_Savings_Education',
                   'Potential_Savings_Miscellaneous']

    # Fail before any filtering happens instead of half-way through the loop.
    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in dataframe: {missing}")

    df_clean = df.copy()

    for column in columns:
        Q1 = df_clean[column].quantile(0.25)
        Q3 = df_clean[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - iqr_multiplier * IQR
        upper_bound = Q3 + iqr_multiplier * IQR

        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        df_clean = df_clean[df_clean[column].between(lower_bound, upper_bound)]

    df_clean = df_clean.reset_index(drop=True)

    print(f"Number of rows in original dataframe: {len(df)}")
    print(f"Number of rows in cleaned dataframe: {len(df_clean)}")
    print(f"Number of rows removed: {len(df) - len(df_clean)}")

    return df_clean

# Overwrite df with the outlier-free rows (re-running this cell filters the already filtered data again).
df= remove_outliers(df)

# Persist the cleaned data; the final cell of the notebook reloads 'file.csv'.
df.to_csv('file.csv', index=False)

In [None]:
# Removing any duplicates in my data
df_cleaned = df.drop_duplicates().reset_index(drop=True)

# Shape before vs. after de-duplication
print(df.shape)
print(df_cleaned.shape)

# Every later cell reads `df`, so rebind it; otherwise the de-duplicated
# frame is discarded and the duplicates stay in the modelling data.
df = df_cleaned


In [None]:
df.info()

In [None]:
df.to_csv('file.csv',index= False)

## Finding the Optimal Number of Clusters

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors

def find_optimal_clusters(df):
    """Explore how many KMeans clusters suit the spending-ratio features.

    Builds three income-normalised ratios, standardises them, projects them
    onto two principal components and draws five diagnostic panels: KMeans
    inertia (elbow), silhouette score per k, a Ward dendrogram, a sorted
    nearest-neighbour distance curve and a scatter of the selected clustering.
    Per-cluster statistics are printed afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Eating_Out', 'Entertainment', 'Groceries', 'Rent',
        'Transport', 'Savings' and 'Income'.

    Returns
    -------
    tuple
        ``(optimal_k, pca_features, optimal_labels)``: the selected number of
        clusters, the 2-D PCA coordinates and the KMeans label of every row.
    """
    # Feature engineering for ratios
    # NOTE(review): family_ratio includes Rent here, whereas the other clustering
    # functions in this notebook use Groceries + Transport only -- confirm which
    # definition is intended before comparing their results.
    feature_data = pd.DataFrame({
        'entertainment_ratio': (df['Eating_Out'] + df['Entertainment']) / df['Income'],
        'family_ratio': (df['Groceries'] + df['Rent'] + df['Transport']) / df['Income'],
        'savings_ratio': df['Savings'] / df['Income'],
    })

    # Standardise the ratios, then reduce them to two principal components
    scaled_features = StandardScaler().fit_transform(feature_data)
    pca_features = PCA(n_components=2).fit_transform(scaled_features)

    fig = plt.figure(figsize=(20, 10))

    # Sweep k = 2..10 once and record both inertia and silhouette score
    K = range(2, 11)
    inertias = []
    silhouette_scores = []
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(pca_features)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(pca_features, kmeans.labels_))

    best_silhouette = max(silhouette_scores)
    best_silhouette_k = K[silhouette_scores.index(best_silhouette)]

    # Find the elbow point; KneeLocator leaves `elbow` as None when it detects
    # none, in which case the best-silhouette k is used instead of crashing.
    kl = KneeLocator(K, inertias, curve='convex', direction='decreasing')
    elbow_found = kl.elbow is not None
    optimal_k = int(kl.elbow) if elbow_found else best_silhouette_k

    # 1. Elbow Method
    ax1 = fig.add_subplot(231)
    ax1.plot(K, inertias, 'bx-')
    ax1.set_xlabel('k')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Elbow Method')
    if elbow_found:
        ax1.axvline(x=kl.elbow, color='r', linestyle='--')

    # 2. Silhouette Analysis: one score per k, so plot the full, aligned series
    ax2 = fig.add_subplot(232)
    ax2.plot(K, silhouette_scores, 'rx-')
    ax2.set_xlabel('k')
    ax2.set_ylabel('Silhouette Score')
    ax2.set_title('Silhouette Analysis')

    # 3. Hierarchical Clustering Dendrogram (drawn explicitly on its own axes)
    ax3 = fig.add_subplot(233)
    linkage_matrix = linkage(pca_features, method='ward')
    dendrogram(linkage_matrix, truncate_mode='lastp', p=10, ax=ax3)
    ax3.set_title('Hierarchical Clustering Dendrogram')

    # 4. DBSCAN Optimal Epsilon: sorted distance to the nearest other point
    neighbors_fit = NearestNeighbors(n_neighbors=2).fit(pca_features)
    distances, indices = neighbors_fit.kneighbors(pca_features)
    distances = np.sort(distances, axis=0)[:, 1]

    ax4 = fig.add_subplot(234)
    ax4.plot(np.arange(len(distances)), distances)
    ax4.set_xlabel('Points')
    ax4.set_ylabel('Distance')
    ax4.set_title('DBSCAN: k-distance graph')

    # 5. Visualization of best clustering (fitted once, reused for the statistics)
    optimal_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    optimal_labels = optimal_kmeans.fit_predict(pca_features)

    ax5 = fig.add_subplot(235)
    scatter = ax5.scatter(pca_features[:, 0], pca_features[:, 1],
                          c=optimal_labels, cmap='viridis')
    ax5.set_title(f'Clustering with Optimal k={optimal_k}')
    fig.colorbar(scatter, ax=ax5)

    plt.tight_layout()
    plt.show()

    # Print analysis results
    print("\nOptimal Number of Clusters Analysis:")
    if elbow_found:
        print(f"1. Elbow Method suggests {kl.elbow} clusters")
    else:
        print(f"1. Elbow Method found no clear elbow; using best-silhouette k={optimal_k}")
    print(f"2. Best Silhouette Score: {best_silhouette:.3f} "
          f"at k={best_silhouette_k}")

    # Calculate cluster characteristics
    for i in range(optimal_k):
        cluster_points = pca_features[optimal_labels == i]
        cluster_center = np.mean(cluster_points, axis=0)
        cluster_std = np.std(cluster_points, axis=0)
        cluster_size = len(cluster_points)

        print(f"\nCluster {i} Statistics:")
        print(f"Size: {cluster_size} points ({cluster_size/len(pca_features)*100:.1f}%)")
        print(f"Compactness (avg distance to center): "
              f"{np.mean(np.linalg.norm(cluster_points - cluster_center, axis=1)):.3f}")
        print(f"Standard deviation: {np.mean(cluster_std):.3f}")

    return optimal_k, pca_features, optimal_labels

# Run the analysis
optimal_k, pca_features, optimal_labels = find_optimal_clusters(df)

Interpretation of the results above:

Cluster Count Recommendations:


Elbow Method suggests 4 clusters
Silhouette Score suggests 3 clusters (score of 0.338)
The relatively low silhouette score (0.338) indicates some overlap between clusters


Cluster Statistics Analysis:


The clusters are fairly well-balanced in size (22-28% each)
Very similar compactness metrics (0.759-0.808)
Similar standard deviations (0.593-0.633)
This uniformity suggests natural groupings in the data

Let's optimize the clustering for 3 clusters since:

It has the best silhouette score
The 4-cluster solution shows very similar compactness metrics, suggesting it might be over-segmenting

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_optimized_clusters(df, n_clusters=3):
    """Fit KMeans on the PCA-projected spending ratios and profile the clusters.

    Draws the clustering scatter (with centroids) next to a per-cluster
    silhouette plot, then prints size, mean ratios and the dominant ratio of
    every cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Eating_Out', 'Entertainment', 'Groceries', 'Transport',
        'Savings' and 'Income'.
    n_clusters : int, default 3
        Number of KMeans clusters; silhouette values need at least 2.

    Returns
    -------
    tuple
        ``(cluster_labels, pca_features)``.
    """
    # Feature engineering for ratios
    feature_data = pd.DataFrame({
        'entertainment_ratio': (df['Eating_Out'] + df['Entertainment']) / df['Income'],
        'family_ratio': (df['Groceries'] + df['Transport']) / df['Income'],
        'savings_ratio': df['Savings'] / df['Income'],
    })

    # Standardise, then reduce to two principal components
    scaled_features = StandardScaler().fit_transform(feature_data)
    pca_features = PCA(n_components=2).fit_transform(scaled_features)

    # Clustering with the requested k
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
    cluster_labels = kmeans.fit_predict(pca_features)

    # Silhouette coefficient of every individual point
    silhouette_vals = silhouette_samples(pca_features, cluster_labels)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    # 1. Main clustering plot
    scatter = ax1.scatter(pca_features[:, 0], pca_features[:, 1],
                          c=cluster_labels, cmap='viridis')

    centers = kmeans.cluster_centers_
    ax1.scatter(centers[:, 0], centers[:, 1], c='red', marker='x',
                s=200, linewidths=3, label='Centroids')

    ax1.set_title(f'Optimized {n_clusters}-Cluster Solution')
    ax1.legend()
    # Bind the colorbar to the scatter's axes explicitly
    fig.colorbar(scatter, ax=ax1)

    # 2. Silhouette analysis plot: one horizontal band per cluster
    y_lower = 10
    for i in range(n_clusters):
        cluster_silhouette_vals = np.sort(silhouette_vals[cluster_labels == i])
        y_upper = y_lower + cluster_silhouette_vals.shape[0]

        # The scatter maps labels 0..k-1 onto the full colormap, so divide by
        # k-1 to give each band the same colour as its cluster in the scatter.
        color = plt.cm.viridis(i / max(n_clusters - 1, 1))
        ax2.fill_betweenx(np.arange(y_lower, y_upper),
                          0, cluster_silhouette_vals,
                          facecolor=color, alpha=0.7)

        # Leave a 10-row gap between consecutive bands
        y_lower = y_upper + 10

    ax2.set_title('Silhouette Analysis')
    ax2.set_xlabel('Silhouette Coefficient')
    ax2.axvline(x=np.mean(silhouette_vals), color="red", linestyle="--")

    plt.tight_layout()
    plt.show()

    # Profile the clusters on the unscaled ratios
    original_features = feature_data.values
    # Order matches the columns of feature_data
    pattern_names = ("Entertainment-focused", "Family-focused", "Savings-focused")

    print("\nDetailed Cluster Analysis:")
    for i in range(n_clusters):
        cluster_mask = cluster_labels == i
        cluster_means = np.mean(original_features[cluster_mask], axis=0)

        print(f"\nCluster {i} Profile:")
        print(f"Size: {np.sum(cluster_mask)} points ({np.sum(cluster_mask)/len(cluster_labels)*100:.1f}%)")
        print("\nAverage Ratios:")
        print(f"Entertainment: {cluster_means[0]:.3f}")
        print(f"Family: {cluster_means[1]:.3f}")
        print(f"Savings: {cluster_means[2]:.3f}")

        print("\nSpending Pattern:")
        # The pattern is simply the ratio with the largest raw mean
        pattern = pattern_names[int(np.argmax(cluster_means))]
        print(f"Primary spending pattern: {pattern}")

    return cluster_labels, pca_features

# Run the analysis
cluster_labels, pca_features = analyze_optimized_clusters(df)

Clear Segmentation:


The three clusters show distinct spending patterns
Major differentiation is in savings behavior
Family spending is a secondary differentiator
Entertainment spending is relatively consistent


Behavioral Insights:


Two distinct saving behaviors (high vs. moderate)
Family spending inversely correlates with savings
Entertainment spending is relatively stable across segments

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: share of income (%) per cluster, one list per spending category
clusters = ["Family-Focused (C0)", "High Savers (C1)", "Balanced Savers (C2)"]
entertainment = [7.5, 5.8, 7.6]
family = [20.2, 19.3, 17.5]
savings = [17.5, 30.3, 29.1]

# Grouped-bar geometry
bar_width = 0.25
index = np.arange(len(clusters))

fig, ax = plt.subplots(figsize=(10, 6))

# (shift in bar widths, values, legend label, colour) for each series
series_specs = [
    (-1, entertainment, 'Entertainment', '#8884d8'),
    (0, family, 'Family', '#82ca9d'),
    (1, savings, 'Savings', '#ffc658'),
]
bar1, bar2, bar3 = [
    ax.bar(index + shift * bar_width, values, bar_width, label=series_label, color=colour)
    for shift, values, series_label, colour in series_specs
]

# Axis labels, title, ticks and legend
ax.set_xlabel('Clusters')
ax.set_ylabel('Percentage of Income')
ax.set_title('Cluster Spending Patterns (% of Income)')
ax.set_xticks(index)
ax.set_xticklabels(clusters)
ax.legend()


def add_labels(bars):
    """Write each bar's height, as a percentage, just above the bar."""
    for rect in bars:
        height = rect.get_height()
        x_centre = rect.get_x() + rect.get_width() / 2
        ax.text(x_centre, height + 0.2, f'{height}%', ha='center', va='bottom')


for bar_group in (bar1, bar2, bar3):
    add_labels(bar_group)

# Display the plot
plt.tight_layout()
plt.show()

# Key Insights
insights = [
    "Cluster sizes are very balanced (32-34% each)",
    "Entertainment spending is consistently low across all clusters (5.8-7.6%)",
    "Family spending shows moderate variation (17.5-20.2%)",
    "Savings shows the highest variation between clusters (17.5-30.3%)"
]

print("\nKey Insights:")
print("\n".join(f"- {insight}" for insight in insights))


### Comparison of clustering algorithms

Based on the data characteristics and the clustering results above, the choice of algorithm is assessed below:

K-means is the recommended choice because:

Your clusters are fairly balanced (32-34%)
The spending ratios show clear centroids
The clusters have similar variances
The data shows good spherical separation in the transformed space
Computationally efficient for your dataset size


Advantages of K-means for your case:

Produces clean, interpretable clusters
Handles the scale of your data well (thousands of points)
Creates balanced clusters which match your data's natural distribution
The centroids have clear business interpretation (spending patterns)

Why not other algorithms:

DBSCAN:

Cluster Shape: The clusters you identified are well-separated and likely spherical or near-spherical, which K-means can handle effectively. DBSCAN is more suited to data with irregular shapes and varying densities, which doesn't seem to apply to your dataset.
Density-Based Assumptions: Your clusters have relatively similar densities. DBSCAN works best when there are clusters with significant density differences (dense areas vs. sparse areas), which doesn’t appear to be the case here.


Spectral Clustering:

Computationally expensive for your dataset size
Overkill since your clusters are well-separated


Hierarchical Clustering:

Too computationally intensive for your dataset size
Doesn't add value given your clear cluster structure


Gaussian Mixture Models:

More complex than needed
K-means gives similarly good results with simpler interpretation

The recommended KMeans configuration is:

kmeans = KMeans(
    n_clusters=3,
    init='k-means++',    # Use k-means++ initialization
    n_init=20,           # Multiple initializations for stability
    random_state=42,     # For reproducibility
    max_iter=300         # Allow enough iterations for convergence
)

# Training the model

In [None]:
# Function to train the clustering model
def train_clustering_model(df):
    """Fit the ratio -> StandardScaler -> PCA(2) -> KMeans(3) clustering pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Eating_Out', 'Entertainment', 'Groceries', 'Transport',
        'Savings' and 'Income'.

    Returns
    -------
    tuple
        ``(kmeans, scaler, pca)``, all fitted, so that new observations can be
        pushed through the same transformations before prediction.
    """
    income = df['Income']

    # Income-normalised spending / saving ratios
    ratios = pd.DataFrame({
        'entertainment_ratio': (df['Eating_Out'] + df['Entertainment']) / income,
        'family_ratio': (df['Groceries'] + df['Transport']) / income,
        'savings_ratio': df['Savings'] / income,
    })

    # Standardise the ratios
    scaler = StandardScaler()
    standardized = scaler.fit_transform(ratios)

    # Project onto two principal components
    pca = PCA(n_components=2)
    projected = pca.fit_transform(standardized)

    # Three clusters, 20 restarts, fixed seed
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=20).fit(projected)

    return kmeans, scaler, pca


### Explanation of the cluster formation

"High Savers": If the savings percentage is higher than 25% and the family spending is also high (above 19%), this cluster is labeled as "High Savers".

"Balanced Savers": If the savings percentage is higher than 25% but family spending is relatively moderate (below 19%), it’s labeled as "Balanced Savers".

"Family-Focused": If the savings percentage is lower (below 25%) and family spending is relatively high, this cluster is considered "Family-Focused".

# Predicting with the model

In [None]:

# Function to predict the cluster for a new user's input
def predict_user_cluster(kmeans, scaler, pca, new_user_data):
    """Return the spending-profile name predicted for one user.

    Parameters
    ----------
    kmeans, scaler, pca
        Fitted objects as returned by ``train_clustering_model``; only
        ``scaler.transform``, ``pca.transform`` and ``kmeans.predict`` are used.
    new_user_data : mapping
        Scalar values for 'Eating_Out', 'Entertainment', 'Groceries',
        'Transport', 'Savings' and 'Income'.

    Returns
    -------
    str
        'Family Focused', 'High Savers' or 'Balanced Savers'.
    """
    # Feature engineering for the new user's input (same ratios as in training)
    feature_data = pd.DataFrame({
        'entertainment_ratio': (new_user_data['Eating_Out'] + new_user_data['Entertainment']) / new_user_data['Income'],
        'family_ratio': (new_user_data['Groceries'] + new_user_data['Transport']) / new_user_data['Income'],
        'savings_ratio': new_user_data['Savings'] / new_user_data['Income'],
    }, index=[0])  # Ensure it's a DataFrame with one row

    # Apply the fitted scaling and PCA projection, then assign the cluster
    scaled_features = scaler.transform(feature_data)
    pca_features = pca.transform(scaled_features)
    cluster_label = int(kmeans.predict(pca_features)[0])

    # Index = KMeans label. The order follows the cluster profile used in the
    # rest of the notebook (C0 = Family Focused, C1 = High Savers,
    # C2 = Balanced Savers) so both prediction paths name clusters identically.
    cluster_names = ['Family Focused', 'High Savers', 'Balanced Savers']
    predicted_cluster = cluster_names[cluster_label]

    return predicted_cluster

# Testing
kmeans, scaler, pca = train_clustering_model(df)

# New user's data to predict (example)
new_user_data = {
    'Eating_Out': 400,
    'Entertainment': 150,
    'Groceries': 500,
    'Transport': 200,
    'Savings': 800,
    'Income': 3000
}

# Predict the cluster for the new user
predicted_cluster = predict_user_cluster(kmeans, scaler, pca, new_user_data)
print(f"The new user belongs to the cluster: {predicted_cluster}")

In [None]:
df.head()

# Random Forest with KMeans

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Assuming you have already run the KMeans model and have these variables:
# scaler, pca, kmeans

# Manually define the cluster names based on KMeans output:
cluster_names = {
    0: "Family Focused",    # For cluster 0
    1: "High Savers",       # For cluster 1
    2: "Balanced Savers",   # For cluster 2
}

def predict_and_print_results(df, kmeans, scaler, pca, cluster_names, new_user):
    """Predict savings, per-category potential savings and the spending profile of a user.

    The random forests are trained from ``df`` on every call; nothing is cached.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with the feature columns listed below, 'Savings' and the
        'Potential_Savings_*' target columns.
    kmeans, scaler, pca
        Fitted clustering objects (see ``train_clustering_model``).
    cluster_names : dict
        Maps a KMeans label to a readable profile name.
    new_user : pandas.DataFrame
        A single-row frame holding the savings features plus 'Savings'.

    Returns
    -------
    dict
        Keys 'predicted_savings', 'expense_predictions', 'user_cluster' and
        'user_category'.
    """
    # Features for savings prediction
    features_savings = ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment',
                        'Groceries', 'Transport', 'Eating_Out', 'Entertainment',
                        'City Tier', 'Occupation_Retired', 'Occupation_Self_Employed',
                        'Occupation_Student']

    # Train a Random Forest for savings prediction
    rf_savings = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_savings.fit(df[features_savings], df['Savings'])
    predicted_savings = rf_savings.predict(new_user[features_savings])[0]

    # Predict the potential savings of every 'Potential_Savings_*' category
    savings_categories = [col for col in df.columns if col.startswith('Potential_Savings_')]
    features_expenses = ['Income', 'Age', 'Dependents', 'City Tier',
                         'Occupation_Retired', 'Occupation_Self_Employed',
                         'Occupation_Student']

    expense_predictions = {}
    for category in savings_categories:
        rf_expense = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_expense.fit(df[features_expenses], df[category])
        expense_predictions[category] = rf_expense.predict(new_user[features_expenses])[0]

    # Scalar ratios for KMeans prediction (entertainment_ratio, family_ratio, savings_ratio).
    # Every term is reduced to a scalar with .values[0]; a bare column would put
    # a whole Series into the one-row frame built below.
    income = new_user['Income'].values[0]
    entertainment_ratio = (new_user['Eating_Out'].values[0] + new_user['Entertainment'].values[0]) / income
    family_ratio = (new_user['Groceries'].values[0] + new_user['Transport'].values[0]) / income
    savings_ratio = new_user['Savings'].values[0] / income

    # Prepare the input for KMeans prediction
    cluster_input = pd.DataFrame({
        'entertainment_ratio': [entertainment_ratio],
        'family_ratio': [family_ratio],
        'savings_ratio': [savings_ratio],
    })

    # Scale the input for KMeans and apply PCA transformation
    scaled_input = scaler.transform(cluster_input)
    pca_input = pca.transform(scaled_input)

    # Predict the cluster using KMeans
    predicted_cluster = kmeans.predict(pca_input)[0]
    predicted_category = cluster_names.get(predicted_cluster, "Unknown Category")

    # Print the results
    print("\n=== Financial Prediction Results ===")
    print("\n1. User Profile Analysis:")
    print(f"Spending Profile Type: {predicted_category}")

    print("\n2. Savings Prediction:")
    print(f"Recommended Monthly Savings: {predicted_savings:,.2f} Rs")

    print("\n3. Potential Monthly Savings by Category:")
    for category, amount in expense_predictions.items():
        category_name = category.replace('Potential_Savings_', '').replace('_', ' ')
        print(f"{category_name:<15} {amount:,.2f} Rs")

    return {
        'predicted_savings': predicted_savings,
        'expense_predictions': expense_predictions,
        'user_cluster': predicted_cluster,
        'user_category': predicted_category,
    }

## Predicting Model

In [None]:
# Example usage
if __name__ == "__main__":
   
    new_user = pd.DataFrame({
        'Income': [4000],
        'Age': [30],
        'Dependents': [0],
        'Rent': [750],
        'Loan_Repayment': [1000],
        'Groceries': [300],
        'Transport': [100],
        'Eating_Out': [100],
        'Entertainment': [100],
        'City Tier': [0],
        'Occupation_Retired': [0],
        'Occupation_Self_Employed': [1],
        'Occupation_Student': [0],
        'Savings': [750]  
    })
    
   
    results = predict_and_print_results(df, kmeans, scaler, pca, cluster_names, new_user)

# Validating the model

### Measuring MAE

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

def calculate_and_print_mae_scores(df):
    """Print and return the hold-out MAE of the savings and potential-savings forests.

    Every target is evaluated on the same kind of split: 80/20 with
    ``random_state=42``, using a 100-tree random forest.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the feature columns listed below, 'Savings' and the
        'Potential_Savings_*' columns.

    Returns
    -------
    dict
        Maps each target column name to its mean absolute error.
    """
    # Features for savings prediction
    features_savings = ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment',
                        'Groceries', 'Transport', 'Eating_Out', 'Entertainment',
                        'City Tier', 'Occupation_Retired', 'Occupation_Self_Employed',
                        'Occupation_Student']

    # Features for expense predictions
    features_expenses = ['Income', 'Age', 'Dependents', 'City Tier',
                         'Occupation_Retired', 'Occupation_Self_Employed',
                         'Occupation_Student']

    def _holdout_mae(X, y):
        # Split, fit and score one target
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        return mean_absolute_error(y_test, model.predict(X_test))

    mae_scores = {}

    # Savings prediction MAE
    mae_scores['Savings'] = _holdout_mae(df[features_savings], df['Savings'])

    # Amounts are reported in Rs, the unit used by the prediction report of this notebook
    print("\n=== Model Performance Evaluation ===")
    print(f"MAE for Savings Prediction: {mae_scores['Savings']:.2f} Rs")

    # Expense predictions MAE; the feature matrix is the same for every category
    X_expenses = df[features_expenses]
    savings_categories = [col for col in df.columns if col.startswith('Potential_Savings_')]
    for category in savings_categories:
        mae_scores[category] = _holdout_mae(X_expenses, df[category])

        category_name = category.replace('Potential_Savings_', '').replace('_', ' ')
        print(f"MAE for {category_name} Prediction: {mae_scores[category]:.2f} Rs")

    return mae_scores

# Example usage
if __name__ == "__main__":
    # Assuming df is your DataFrame with all the necessary data
    calculate_and_print_mae_scores(df)
    
    

### Model Fitting

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Feature selection for savings prediction
features_savings = ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment', 
                    'Groceries', 'Transport', 'Eating_Out', 'Entertainment', 
                    'City Tier', 'Occupation_Retired', 'Occupation_Self_Employed', 
                    'Occupation_Student']

# Function to plot underfitting/overfitting analysis
def plot_fitting_analysis(df):
    """Plot training and hold-out RMSE of the savings forest against its tree count.

    Uses the module-level ``features_savings`` list as predictors and
    'Savings' as the target, with an 80/20 split (``random_state=42``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``features_savings`` columns and 'Savings'.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df[features_savings], df['Savings'], test_size=0.2, random_state=42
    )

    # Forest sizes to evaluate
    n_estimators_range = [1, 2, 5, 10, 20, 50, 100, 200, 500]

    def rmse(y_true, y_hat):
        return np.sqrt(mean_squared_error(y_true, y_hat))

    train_errors, test_errors = [], []
    for tree_count in n_estimators_range:
        forest = RandomForestRegressor(n_estimators=tree_count, random_state=42)
        forest.fit(X_train, y_train)

        train_errors.append(rmse(y_train, forest.predict(X_train)))
        test_errors.append(rmse(y_test, forest.predict(X_test)))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(n_estimators_range, train_errors, label='Training Error', marker='o', color='blue')
    ax.plot(n_estimators_range, test_errors, label='Validation Error', marker='s', color='red')

    # The guide lines and region labels sit at fixed x positions;
    # they are not derived from the error curves.
    for boundary in (5, 100):
        ax.axvline(x=boundary, color='gray', linestyle='--', alpha=0.5)
    label_height = max(train_errors)
    ax.text(3, label_height, 'Underfitting\nRegion', ha='right')
    ax.text(50, label_height, 'Good\nFit', ha='center')
    ax.text(350, label_height, 'Overfitting\nRegion', ha='left')

    ax.set_xscale('log')
    ax.set_xlabel('Number of Trees (n_estimators)')
    ax.set_ylabel('Root Mean Squared Error (RMSE)')
    ax.set_title('Model Complexity vs. Error\n(Underfitting-Overfitting Analysis)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()


plot_fitting_analysis(df)


## Understanding the Graph

This graph illustrates the concept of underfitting and overfitting in machine learning, specifically showing how model complexity (measured by the number of trees/estimators) affects both training and validation error.
Let's break it down:

Axes:


X-axis: Number of trees (n_estimators) on a logarithmic scale, covering the evaluated forest sizes from 1 to 500 trees
Y-axis: Root Mean Square Error (RMSE) - lower values indicate better performance


Two Lines:


Blue line: Training Error
Red line: Validation Error


Three Regions:


Underfitting Region (left): High error on both training and validation sets
Good Fit (middle): Optimal balance between training and validation error
Overfitting Region (right): Low training error but validation error plateaus


Key Observations:


As more trees are added, both errors initially decrease
Training error consistently decreases
Validation error eventually plateaus
The optimal model complexity appears to be around where the validation error levels off


Best Practice:


Choose the number of trees where validation error stabilizes (around 10-20 trees in this case)
Adding more trees beyond this point doesn't significantly improve model performance
This helps avoid both underfitting (too simple) and overfitting (too complex)

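This relates to the bias-variance tradeoff in machine learning: model complexity needs to be balanced to achieve optimal performance. Note, however, that adding trees to a random forest mainly averages out variance; a validation error that plateaus indicates diminishing returns from extra trees rather than overfitting caused by them.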

# Saving the models for Streamlit

In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Function to train and save the models
def train_and_save_models(df):
    """Train the clustering pipeline and the random forests, then dump them with joblib.

    Files written to the working directory: 'kmeans_model.pkl', 'scaler.pkl',
    'pca_model.pkl', 'rf_savings_model.pkl' and one
    '<Potential_Savings_column>_model.pkl' per potential-savings column.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data with the ratio inputs, the savings features, 'Savings'
        and the 'Potential_Savings_*' columns.
    """
    income = df['Income']

    # Income-normalised ratios used for clustering
    ratios = pd.DataFrame({
        'entertainment_ratio': (df['Eating_Out'] + df['Entertainment']) / income,
        'family_ratio': (df['Groceries'] + df['Transport']) / income,
        'savings_ratio': df['Savings'] / income,
    })

    # Standardise -> 2 principal components -> 3 KMeans clusters
    scaler = StandardScaler()
    standardized = scaler.fit_transform(ratios)

    pca = PCA(n_components=2)
    projected = pca.fit_transform(standardized)

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=20)
    kmeans.fit(projected)

    # Save models and preprocessors
    for filename, artifact in (('kmeans_model.pkl', kmeans),
                               ('scaler.pkl', scaler),
                               ('pca_model.pkl', pca)):
        joblib.dump(artifact, filename)

    # Random Forest for savings prediction
    features_savings = ['Income', 'Age', 'Dependents', 'Rent', 'Loan_Repayment', 'Groceries', 'Transport',
                        'Eating_Out', 'Entertainment', 'City Tier', 'Occupation_Retired',
                        'Occupation_Self_Employed', 'Occupation_Student']
    X_savings = df[features_savings]

    rf_savings = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_savings.fit(X_savings, df['Savings'])
    joblib.dump(rf_savings, 'rf_savings_model.pkl')

    # One Random Forest per potential-savings column.
    # NOTE(review): these models are fitted on the 13 savings features, while the
    # prediction and MAE functions of this notebook fit the potential-savings
    # models on the 7-column features_expenses list -- confirm which feature set
    # the consumer of the saved files passes in.
    for category in (col for col in df.columns if col.startswith('Potential_Savings_')):
        rf_expense = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_expense.fit(X_savings, df[category])
        joblib.dump(rf_expense, f'{category}_model.pkl')

    print("Models have been saved.")

# Reload the cleaned data that an earlier cell wrote to 'file.csv', then train
# and save all models from it.
df = pd.read_csv("file.csv")
train_and_save_models(df)
