# Import Libraries

In [1]:
import joblib
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Reading Data Files

In [2]:
df = pd.read_csv('./Data/df_cleaned.csv', index_col='index')
df.head()

Unnamed: 0_level_0,number_of_siblings,attendance_rate,sleep_duration,direct_admission_Yes,learning_style_Visual,CCA_None,tuition_Y,final_test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,91.0,8.0,1,1,0,0,69.0
1,2,94.0,8.0,0,0,0,0,47.0
2,0,92.0,8.0,1,1,1,0,85.0
3,1,95.0,8.0,0,0,0,1,64.0
4,0,95.0,8.0,0,0,0,0,66.0


# Train Test Split

1) In the EDA notebook I perform train test split with a test_size of 0.33 and random_state 42
2) I will do likewise here

In [3]:
X = df.drop(columns='final_test')
y = df['final_test']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)  

# Normalization

1) I will use min max scaler to normalize the data
2) The reason for this is because, I do not need to separate out the numerical features and categorical columns
3) The categorical columns are 0s and 1s therefore MinMaxScaler will have no effect on them

In [5]:
mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

# Model Building

1. In this model building, I will build 3 models, Linear Regression, Random Forest and Artificial Neural Network
2. The metric I will use for all 3 models will be mean squared error as it can be imported from sklearn without further calculation

### Linear Regression

In [6]:
lr = LinearRegression(n_jobs=-1)
lr.fit(X_train, y_train)
joblib.dump(value=lr, filename='./Data/linear_regression.pkl')

['./Data/linear_regression.pkl']

### Random Forest

In [7]:
rfr = RandomForestRegressor(n_jobs=-1, random_state=42)

param_grid = {'max_depth':[50,100,150,200], 
              'max_features':[1,2,3,4,5,6], 
              'min_samples_split':[30,40,50], 
              'min_samples_leaf':[30,40,50]
             }


gs_ss_rfr = GridSearchCV(estimator=rfr, 
                          param_grid=param_grid, 
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          n_jobs=-1,
                          verbose=10) 

gs_ss_rfr.fit(X_train,y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [8]:
joblib.dump(value=gs_ss_rfr, filename='./Data/random_forest.pkl')

['./Data/random_forest.pkl']

### Artificial neural network

In [9]:
model = Sequential()
model.add(layer=Dense(units=7, activation='relu'))
model.add(layer=Dense(units=20, activation='relu'))
model.add(layer=Dense(units=50, activation='relu'))
model.add(layer=Dense(units=20, activation='relu'))
model.add(layer=Dense(units=1))


model.compile(optimizer='adam', loss='mean_squared_error')

In [10]:
model.fit(x=X_train, 
          y=y_train, 
          epochs=20, 
          validation_data=(X_test, y_test))

Epoch 1/20


[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 2616.7029 - val_loss: 108.5751
Epoch 2/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 107.8126 - val_loss: 101.5079
Epoch 3/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step - loss: 102.1475 - val_loss: 98.8867
Epoch 4/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 986us/step - loss: 98.5157 - val_loss: 96.9131
Epoch 5/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 965us/step - loss: 95.4836 - val_loss: 94.4604
Epoch 6/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 981us/step - loss: 95.6380 - val_loss: 94.3490
Epoch 7/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 995us/step - loss: 97.0306 - val_loss: 94.2643
Epoch 8/20
[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - loss: 92.8041 - val_loss: 92.7820
Epoch 9/20
[1m32

<keras.src.callbacks.history.History at 0x23db0a63850>

In [11]:
model.save(filepath='./Data/ANN.keras')