##### Homework #6
##### Nazmul Rabbi
##### 11/01/2024

In [42]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import numpy as np
import xgboost as xgb

# set print options: NumPy's legacy='1.25' mode approximates NumPy 1.25 printing,
# which (per the NumPy docs) mainly means numeric scalars print without their
# type wrapper, e.g. 3.0 rather than np.float64(3.0)
np.set_printoptions(legacy='1.25')

In [2]:
# load the exam results CSV (path is relative to the notebook's working directory)
csv_path = 'jamb_exam_results.csv'
df = pd.read_csv(csv_path)

# preview the first rows of the loaded frame
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
# normalize the column names in two steps: lowercase first, then spaces -> underscores
lowercased_columns = df.columns.str.lower()
df.columns = lowercased_columns.str.replace(' ', '_')

# preview the frame with the renamed columns
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
# drop the student_id column and fill missing values with 0.
# The result is rebound to df instead of using inplace=True, and
# errors='ignore' tolerates student_id already being gone, so this cell
# can be re-run without raising a KeyError.
df = df.drop(columns='student_id', errors='ignore').fillna(0)

# see the first few rows of the data
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,0,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


In [5]:
# 60/20/20 split: hold out 40% first, then halve the hold-out into validation and test
X_train, test_val_split = train_test_split(df, test_size=0.4, random_state=1)
X_val, X_test = train_test_split(test_val_split, test_size=0.5, random_state=1)

# pull the target out of every split; pop() returns the column and removes it
# from the frame, so the feature frames no longer contain jamb_score
y_train = X_train.pop('jamb_score').values
y_val = X_val.pop('jamb_score').values
y_test = X_test.pop('jamb_score').values

# one dict per row, which is the input format DictVectorizer consumes
X_train_dicts = X_train.to_dict(orient='records')
X_val_dicts = X_val.to_dict(orient='records')
X_test_dicts = X_test.to_dict(orient='records')

# learn the feature mapping on the training rows only, then reuse it for the other splits
dv = DictVectorizer(sparse=True)
X_train_encoded = dv.fit_transform(X_train_dicts)
X_val_encoded = dv.transform(X_val_dicts)
X_test_encoded = dv.transform(X_test_dicts)

# fit a depth-1 regression tree (fit() returns the fitted estimator itself)
model = DecisionTreeRegressor(max_depth=1).fit(X_train_encoded, y_train)

# predictions for the validation split
y_pred = model.predict(X_val_encoded)


In [6]:
# Feature names from the vectorizer, and the fitted tree's importance for each of them
feature_names = dv.get_feature_names_out()
feature_importances = model.feature_importances_

# Position of the largest importance, mapped back to its feature name
top_position = np.argmax(feature_importances)
most_important_feature = feature_names[top_position]
print(f"Q1: Most important feature is {most_important_feature}")

Q1: Most important feature is study_hours_per_week


In [7]:
# build a 10-tree random forest (fixed seed, all cores) and fit it on the training split
model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(
    X_train_encoded, y_train
)

# score the forest on the validation split
y_pred = model.predict(X_val_encoded)
rmse = root_mean_squared_error(y_val, y_pred)
print(f"Q2: Random forest RMSE is {rmse:.2f}")

Q2: Random forest RMSE is 43.16


In [8]:
# forest sizes to try: 10, 20, ..., 200
estimator_values = range(10, 201, 10)

# (n_estimators, validation RMSE) pairs, one per forest size
rmse_values = []

for n in estimator_values:
    # refit from scratch with n trees, then score on the validation split
    model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(
        X_train_encoded, y_train
    )
    y_pred = model.predict(X_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)
    rmse_values.append((n, rmse))

In [9]:
# turn the (n_estimators, RMSE) pairs into a two-column table
rmse_values = pd.DataFrame(rmse_values, columns=['n_estimators', 'RMSE'])

# keep two decimals of RMSE for display
rmse_values['RMSE'] = rmse_values['RMSE'].round(2)

# show the table
rmse_values

Unnamed: 0,n_estimators,RMSE
0,10,43.16
1,20,41.79
2,30,41.56
3,40,41.08
4,50,40.96
5,60,40.77
6,70,40.59
7,80,40.5
8,90,40.43
9,100,40.36


In [10]:
# report the Q3 answer (plain string: there is nothing to interpolate)
print("Q3: RMSE values stops improving after estimator value of 80")

Q3: RMSE values stops improving after estimator value of 80


In [11]:
# tree depths to try: 10, 20, ..., 200
max_depth_values = range(10, 201, 10)

# (max_depth, validation RMSE) pairs, one per depth
rmse_values = []

for depth in max_depth_values:
    # 80-tree forest capped at this depth, scored on the validation split
    model = RandomForestRegressor(
        n_estimators=80, max_depth=depth, random_state=1, n_jobs=-1
    ).fit(X_train_encoded, y_train)
    y_pred = model.predict(X_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)
    rmse_values.append((depth, rmse))

In [12]:
# convert the list to a DataFrame; the first element of every pair collected
# by the sweep above is the max_depth that was tried, so label the column
# accordingly (it was previously mislabelled 'n_estimators')
rmse_values = pd.DataFrame(rmse_values, columns=['max_depth', 'RMSE'])

# round the RMSE values to 2 decimal places
rmse_values['RMSE'] = rmse_values['RMSE'].round(2)

# see the output
rmse_values

Unnamed: 0,n_estimators,RMSE
0,10,40.06
1,20,40.41
2,30,40.5
3,40,40.5
4,50,40.5
5,60,40.5
6,70,40.5
7,80,40.5
8,90,40.5
9,100,40.5


In [13]:
# derive the answer from the sweep table instead of hard-coding it: take the
# row with the lowest RMSE (first one on ties) and read the depth from the
# table's first column (addressed by position, so the column label is irrelevant)
best_row = rmse_values['RMSE'].to_numpy().argmin()
best_max_depth = rmse_values.iloc[best_row, 0]
print(f"Q4: Best max_depth value is {best_max_depth}")

Q4: Best max_depth value is 10


In [23]:
# refit the forest with 80 trees and max_depth 10 (fixed seed, all cores)
best_rf_params = dict(n_estimators=80, max_depth=10, random_state=1, n_jobs=-1)
model = RandomForestRegressor(**best_rf_params)
model.fit(X_train_encoded, y_train)

In [41]:
# position of the largest importance in the current model, mapped back to the
# DictVectorizer feature name at that position
top_feature_idx = np.argmax(model.feature_importances_)
top_feature = dv.feature_names_[top_feature_idx]
print("Q5: Most important feature is", top_feature)

Q5: Most important feature is study_hours_per_week


In [43]:
# booster settings shared by the XGBoost runs
xgb_params = {
    # tree-growing hyperparameters
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    # learning task and threading
    'objective': 'reg:squarederror',
    'nthread': 8,

    # reproducibility and logging
    'seed': 1,
    'verbosity': 1,
}

# wrap the encoded train/validation matrices for XGBoost, labelling the columns
# with the DictVectorizer feature names
xgb_feature_names = dv.feature_names_
dtrain = xgb.DMatrix(X_train_encoded, label=y_train, feature_names=xgb_feature_names)
dval = xgb.DMatrix(X_val_encoded, label=y_val, feature_names=xgb_feature_names)

In [44]:
# Both runs report on the same train/validation pair
watchlist = [(dtrain, 'train'), (dval, 'validation')]

# Train for 100 rounds with eta=0.3, then again with eta=0.1.
# Each run gets its own copy of xgb_params with eta overridden, so the shared
# dict is never mutated and re-running this cell always trains the first model
# with eta=0.3 (previously a re-run reused the eta=0.1 left behind in xgb_params).
model_eta_03 = xgb.train({**xgb_params, 'eta': 0.3}, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)
model_eta_01 = xgb.train({**xgb_params, 'eta': 0.1}, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

# Get the RMSE from the validation set for both models: eval() returns a
# string, and the text after the first ':' is the metric value
rmse_eta_03 = model_eta_03.eval(dval).split(':')[1]
rmse_eta_01 = model_eta_01.eval(dval).split(':')[1]
print("RMSE for eta=0.3:", rmse_eta_03)
print("RMSE for eta=0.1:", rmse_eta_01)

RMSE for eta=0.3: 43.34290581580823698
RMSE for eta=0.1: 40.83187502240670597


In [47]:
# derive the answer from the two validation RMSEs instead of hard-coding it;
# the RMSEs were kept as strings, so convert them before comparing
rmse_by_eta = {0.3: float(rmse_eta_03), 0.1: float(rmse_eta_01)}
best_eta = min(rmse_by_eta, key=rmse_by_eta.get)
print(f"Q6: Best ETA for XGBoost is {best_eta}")

Q6: Best ETA for XGBoost is 0.1
