<a href="https://colab.research.google.com/github/nadzirarifqi/machinelearning/blob/main/ML_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import pandas and Load the Dataset

In [1]:
import pandas as pd

# The CSV is referenced by bare file name, so it must sit in the notebook's
# current working directory.
spending_file_path = 'monthly_spending_dataset_2020_2025.csv'

# Read the whole file into a DataFrame and preview its first five rows.
spending_data = pd.read_csv(spending_file_path)
spending_data.head(5)

Unnamed: 0,Month,Groceries (₹),Rent (₹),Transportation (₹),Gym (₹),Utilities (₹),Healthcare (₹),Investments (₹),Savings (₹),EMI/Loans (₹),Dining & Entertainment (₹),Shopping & Wants (₹),Total Expenditure (₹),Income (₹)
0,2020-01-01,4860,10000,2595,888,1520,1930,4311,4232,0,3138,1121,30363,40000
1,2020-02-01,6135,10000,2371,851,1630,1923,5939,7329,0,3185,2332,34366,40000
2,2020-03-01,6853,10000,2715,1143,1776,1185,4700,3625,0,2684,1459,32515,36000
3,2020-04-01,6904,10000,2582,869,1975,1274,4420,6426,0,2475,2806,33305,36000
4,2020-05-01,4562,10000,3028,830,1984,1631,4410,3647,0,2146,1020,29611,36000


Drop Missing Values

In [2]:
# Discard every row that contains at least one missing value, then show the
# column labels as the cell output.
spending_data = spending_data.dropna(axis='index', how='any')
spending_data.columns

Index(['Month', 'Groceries (₹)', 'Rent (₹)', 'Transportation (₹)', 'Gym (₹)',
       'Utilities (₹)', 'Healthcare (₹)', 'Investments (₹)', 'Savings (₹)',
       'EMI/Loans (₹)', 'Dining & Entertainment (₹)', 'Shopping & Wants (₹)',
       'Total Expenditure (₹)', 'Income (₹)'],
      dtype='object')

Select the Prediction Target and Choose Features

In [6]:
# Prediction target: the 'Savings (₹)' column.
target_column_name = 'Savings (₹)'
y = spending_data[target_column_name]

# Features: every column except the target itself and the 'Month' date column.
drop_features = ['Savings (₹)', 'Month']
X = spending_data.drop(columns=drop_features)


Review Data

In [10]:
# Plain-text dump of the summary statistics, followed by the first five
# feature rows (each printed on its own line block).
print(X.describe(), X.head(), sep='\n')

       Groceries (₹)     Rent (₹)  Transportation (₹)      Gym (₹)  \
count      69.000000     69.00000           69.000000    69.000000   
mean     6286.913043  11000.00000         2569.260870   967.768116   
std      1233.330400   1424.57424          577.704373   149.508523   
min      4154.000000  10000.00000         1501.000000   700.000000   
25%      5158.000000  10000.00000         2095.000000   851.000000   
50%      6368.000000  10000.00000         2604.000000  1001.000000   
75%      6931.000000  13000.00000         3035.000000  1084.000000   
max      8983.000000  13000.00000         3482.000000  1196.000000   

       Utilities (₹)  Healthcare (₹)  Investments (₹)  EMI/Loans (₹)  \
count      69.000000       69.000000        69.000000      69.000000   
mean     1978.898551     1750.420290      6317.000000    1130.434783   
std       304.688228      577.078173      1459.367671    2363.403848   
min      1514.000000      837.000000      3952.000000       0.000000   
25%      

Import DecisionTreeRegressor

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Baseline model: a decision tree fitted on the *entire* dataset (no hold-out).
# random_state=1 fixes the estimator's seed so reruns are reproducible.
spending_model = DecisionTreeRegressor(random_state=1)
spending_model.fit(X, y)

# Sanity check only: these are rows the model was fitted on, so agreement with
# the actual values says nothing about accuracy on unseen months.
print('First in-sample predictions:', spending_model.predict(X.head()))
# Each row of this dataset is one month, so label the actual values accordingly.
print('Actual target values for those months:', y.head().tolist())

First in-sample predictions: [4232. 7329. 3625. 6426. 3647.]
Actual target values for those homes: [4232, 7329, 3625, 6426, 3647]


Split Your Data

In [12]:
# Hold out 25% of the rows for validation; random_state=0 keeps the split
# identical from run to run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Size of the full dataset before splitting
print(f"Total number of samples: {len(X)}")

# Feature rows in each partition
print(f"Total number of training samples: {len(train_X)}")
print(f"Total number of validation samples: {len(val_X)}")

# Target values in each partition (should match the feature row counts above)
print(f"Total number of target training samples: {len(train_y)}")
print(f"Total number of target validation samples: {len(val_y)}")

Total number of samples: 69
Total number of training samples: 51
Total number of validation samples: 18
Total number of target training samples: 51
Total number of target validation samples: 18


Specify and Fit the Model

In [13]:
# Build a fresh decision tree and train it on the training split only, so the
# validation rows stay unseen by the model.
split_spending_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Echo the fitted estimator as the cell output.
split_spending_model

Make Predictions with Validation Data

In [17]:
# Predict the target for every held-out validation row.
val_predictions = split_spending_model.predict(val_X)

# Show the first five predictions next to the first five actual values.
print(val_predictions[:5])
print(val_y.iloc[:5].tolist())

# The predictions differ from the actual values, so the next step is to
# calculate the size of the error.

[ 7059.  7608. 11924.  7059.  7833.]
[10731, 6072, 10596, 11756, 6624]


Calculate the MAE on the Validation Data

In [18]:
# Mean absolute error between the actual validation targets and the model's
# predictions for the same rows.
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)

print(val_mae)

# Interpretation: averaged across all validation predictions, the model misses
# the actual 'Savings (₹)' value by about 1947.78 (in the target's own units).

1947.7777777777778
