In [16]:
# Load cleaned dataset
import pandas as pd
import numpy as np

# Location of the cleaned dataset (absolute, machine-specific path; edit here when
# running the notebook on another machine).
CLEANED_CSV = r"D:\project 2\Expenses and Income Prediction\data\anz_cleaned.csv"

df = pd.read_csv(CLEANED_CSV)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,status,card_present_flag,bpay_biller_code,account,currency,long_lat,txn_description,merchant_id,merchant_code,first_name,...,age,merchant_suburb,merchant_state,extraction,amount,transaction_id,country,customer_id,merchant_long_lat,movement
0,authorized,1.0,,ACC-1598451071,AUD,153.41 -27.95,POS,81c48296-73be-44a7-befa-d053f48ce7cd,,Diana,...,26,Ashmore,QLD,2018-08-01T01:01:15.000+0000,16.25,a623070bfead4541a6b0fff8a09e706c,Australia,CUS-2487424745,153.38 -27.99,debit
1,authorized,0.0,,ACC-1598451071,AUD,153.41 -27.95,SALES-POS,830a451c-316e-4a6a-bf25-e37caedca49e,,Diana,...,26,Sydney,NSW,2018-08-01T01:13:45.000+0000,14.19,13270a2a902145da9db4c951e04b51b9,Australia,CUS-2487424745,151.21 -33.87,debit
2,authorized,1.0,,ACC-1222300524,AUD,151.23 -33.94,POS,835c231d-8cdf-4e96-859d-e9d571760cf0,,Michael,...,38,Sydney,NSW,2018-08-01T01:26:15.000+0000,6.42,feb79e7ecd7048a5a36ec889d1a94270,Australia,CUS-2142601169,151.21 -33.87,debit
3,authorized,1.0,,ACC-1037050564,AUD,153.10 -27.66,SALES-POS,48514682-c78a-4a88-b0da-2d6302e64673,,Rhonda,...,40,Buderim,QLD,2018-08-01T01:38:45.000+0000,40.9,2698170da3704fd981b15e64a006079e,Australia,CUS-1614226872,153.05 -26.68,debit
4,authorized,1.0,,ACC-1598451071,AUD,153.41 -27.95,SALES-POS,b4e02c10-0852-4273-b8fd-7b3395e32eb0,,Diana,...,26,Mermaid Beach,QLD,2018-08-01T01:51:15.000+0000,3.25,329adf79878c4cf0aeb4188b4691c266,Australia,CUS-2487424745,153.44 -28.06,debit


In [17]:
# Create Income and Expenses Targets

# Split the transactions by direction before aggregating.
credit_rows = df.loc[df['movement'] == 'credit']
debit_rows = df.loc[df['movement'] == 'debit']

# Per-customer sum of `amount` for each direction; these are the two regression targets.
income = credit_rows.groupby('customer_id')['amount'].sum().rename('total_income')
expenses = debit_rows.groupby('customer_id')['amount'].sum().rename('total_expenses')

# Align both series on customer_id; a customer missing from one side gets 0 there.
targets = (
    pd.concat([income, expenses], axis=1)
    .fillna(0)
    .reset_index()
)

targets.head()


Unnamed: 0,customer_id,total_income,total_expenses
0,CUS-1005756958,12616.11,3652.86
1,CUS-1117979751,25050.55,8933.82
2,CUS-1140341822,11499.06,5511.54
3,CUS-1147642491,22248.07,6732.75
4,CUS-1196156254,27326.11,8724.61


In [18]:
# Create Base Features 

# One grouped view of `amount` per customer, reused for every all-transaction statistic.
amount_by_customer = df.groupby('customer_id')['amount']

total_txn = amount_by_customer.count().rename('total_transactions')
avg_amount = amount_by_customer.mean().rename('avg_amount')
max_amount = amount_by_customer.max().rename('max_amount')
min_amount = amount_by_customer.min().rename('min_amount')

# Transaction counts split by direction.
debit_rows = df.loc[df['movement'] == 'debit']
credit_rows = df.loc[df['movement'] == 'credit']
debit_txn = debit_rows.groupby('customer_id')['amount'].count().rename('debit_count')
credit_txn = credit_rows.groupby('customer_id')['amount'].count().rename('credit_count')

# NOTE(review): avg/max/min are computed over credit and debit rows together, i.e. from
# the same `amount` values that are summed into total_income / total_expenses -- confirm
# this does not leak target information into the features.

# Column order matters for the model input, so it is spelled out explicitly.
feature_columns = [total_txn, debit_txn, credit_txn, avg_amount, max_amount, min_amount]

# Customers absent from one of the series end up as NaN after alignment; store 0 instead.
features = pd.concat(feature_columns, axis=1).fillna(0)

features.head()


Unnamed: 0_level_0,total_transactions,debit_count,credit_count,avg_amount,max_amount,min_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUS-1005756958,73,60,13,222.862603,970.47,6.48
CUS-1117979751,100,93,7,339.8437,3578.65,5.64
CUS-1140341822,80,74,6,212.6325,1916.51,4.09
CUS-1147642491,118,105,13,245.600169,1711.39,8.78
CUS-1196156254,245,238,7,147.145796,3903.73,3.66


In [19]:
# Merge features with targets

# Left join: keep every row of `targets` and attach the per-customer features
# (matched on `customer_id`, which is the index name of `features`).
data = pd.merge(targets, features, how='left', on='customer_id')

data.head()


Unnamed: 0,customer_id,total_income,total_expenses,total_transactions,debit_count,credit_count,avg_amount,max_amount,min_amount
0,CUS-1005756958,12616.11,3652.86,73,60,13,222.862603,970.47,6.48
1,CUS-1117979751,25050.55,8933.82,100,93,7,339.8437,3578.65,5.64
2,CUS-1140341822,11499.06,5511.54,80,74,6,212.6325,1916.51,4.09
3,CUS-1147642491,22248.07,6732.75,118,105,13,245.600169,1711.39,8.78
4,CUS-1196156254,27326.11,8724.61,245,238,7,147.145796,3903.73,3.66


In [20]:
# Prepare X and y

# Everything except the identifier and the two targets is used as model input.
non_feature_columns = ['customer_id', 'total_income', 'total_expenses']
X = data.drop(columns=non_feature_columns)

# One target series per model family.
y_income, y_expenses = data['total_income'], data['total_expenses']


In [21]:
# Train/Test split

from sklearn.model_selection import train_test_split

# Both targets are split with the same options (20% held out, fixed seed) over the same X.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_inc, X_test_inc, y_train_inc, y_test_inc = train_test_split(
    X, y_income, **split_options
)
X_train_exp, X_test_exp, y_train_exp, y_test_exp = train_test_split(
    X, y_expenses, **split_options
)


In [22]:
# Linear Regression Models 

from sklearn.linear_model import LinearRegression

# `fit` returns the estimator itself, so construction and training are chained.
lr_income = LinearRegression().fit(X_train_inc, y_train_inc)
lr_expenses = LinearRegression().fit(X_train_exp, y_train_exp)

# Keep the fitted expense model as the cell's displayed value.
lr_expenses


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [23]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the forests are reproducible; `fit` returns the estimator itself.
rf_income = RandomForestRegressor(random_state=42).fit(X_train_inc, y_train_inc)
rf_expenses = RandomForestRegressor(random_state=42).fit(X_train_exp, y_train_exp)

# Keep the fitted expense model as the cell's displayed value.
rf_expenses


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
# Evaluate Models 

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature rows passed to ``model.predict``.
    y_test
        True target values aligned with ``X_test``.

    Returns
    -------
    tuple of float
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error and
        R^2 of the predictions against ``y_test``.
    """
    preds = model.predict(X_test)

    # Cast every metric to a plain float: np.sqrt returns np.float64, which made the
    # returned tuple mix types and print as `np.float64(...)` next to bare numbers.
    mae = float(mean_absolute_error(y_test, preds))
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = float(r2_score(y_test, preds))

    return mae, rmse, r2

# Each entry: heading, held-out features, held-out target, and the labelled models to score.
reports = [
    ("Income Predictions:", X_test_inc, y_test_inc,
     [("Linear Regression:", lr_income), ("Random Forest:", rf_income)]),
    ("\nExpenses Predictions:", X_test_exp, y_test_exp,
     [("Linear Regression:", lr_expenses), ("Random Forest:", rf_expenses)]),
]

# Print the (mae, rmse, r2) tuple of every model under its target's heading.
for heading, X_eval, y_eval, labelled_models in reports:
    print(heading)
    for label, fitted_model in labelled_models:
        print(label, evaluate(fitted_model, X_eval, y_eval))


Income Predictions:
Linear Regression: (3055.154974905402, np.float64(3953.093376369646), 0.618385155786825)
Random Forest: (2423.4490999999994, np.float64(3355.0317211655183), 0.7251192195787508)

Expenses Predictions:
Linear Regression: (1085.2498329008642, np.float64(1359.2527147244155), 0.6548446398759242)
Random Forest: (934.533325, np.float64(1358.5652276940557), 0.6551936990646721)


In [25]:
# Save Best Models 

from pathlib import Path

import joblib

# Folder that receives the serialized models (absolute, machine-specific path).
MODELS_DIR = Path(r"D:\project 2\Expenses and Income Prediction\models")

# joblib.dump does not create missing folders, so make sure the target exists
# instead of failing with FileNotFoundError on a fresh checkout.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the random forests are saved as "best" unconditionally; for expenses the
# printed RMSE / R^2 of the linear model and the forest are nearly equal -- confirm the
# choice, or select the model from the evaluation metrics.
joblib.dump(rf_income, MODELS_DIR / "best_income_model.pkl")
joblib.dump(rf_expenses, MODELS_DIR / "best_expense_model.pkl")

print("Models saved successfully!")



Models saved successfully!
