In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv('jamb_exam_results.csv')
data.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
data.isnull().sum()

JAMB_Score                      0
Study_Hours_Per_Week            0
Attendance_Rate                 0
Teacher_Quality                 0
Distance_To_School              0
School_Type                     0
School_Location                 0
Extra_Tutorials                 0
Access_To_Learning_Materials    0
Parent_Involvement              0
IT_Knowledge                    0
Student_ID                      0
Age                             0
Gender                          0
Socioeconomic_Status            0
Parent_Education_Level          0
Assignments_Completed           0
dtype: int64

In [5]:
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [6]:
data.drop('student_id', axis=1, inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  age                           5000 non-null   int64  
 12  gender                        5000 non-null   object 
 13  soc

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [11]:
df_train_full, df_test = train_test_split(data, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['jamb_score']
y_val = df_val['jamb_score']
y_test = df_test['jamb_score']

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [12]:
dv = DictVectorizer(sparse=True)
x_train_matrix= dv.fit_transform(df_train.to_dict(orient='records'))
x_val_matrix= dv.transform(df_val.to_dict(orient='records'))
x_test_matrix= dv.transform(df_test.to_dict(orient='records'))


## Q1

In [24]:
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [25]:
dt_regressor = DecisionTreeRegressor(max_depth=1)
dt_regressor.fit(x_train_matrix, y_train)

In [26]:
tree = export_text(dt_regressor, feature_names=list(dv.get_feature_names_out()))
print(tree)

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



## Q2

In [27]:
model_2 = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model_2.fit(x_train_matrix, y_train)

#rmse
y_val_preds = model_2.predict(x_val_matrix)
rmse = np.sqrt(mean_squared_error(y_val, y_val_preds))
print(f"{rmse:.2f}")


41.83


In [29]:

rmse_results = []
for n in range(10, 201, 10):
    rf_regressor = RandomForestRegressor(n_estimators=n,
                                         random_state=1,
                                         n_jobs=-1)
    rf_regressor.fit(x_train_matrix, y_train)
    y_val_preds = rf_regressor.predict(x_val_matrix)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_preds))
    rmse_results.append((n, rmse))

In [32]:
for n, rmse in rmse_results:
    print(f"n_estimators:{n}, RMSE:{rmse:.3f}")

n_estimators:10, RMSE:41.825
n_estimators:20, RMSE:41.196
n_estimators:30, RMSE:40.857
n_estimators:40, RMSE:40.666
n_estimators:50, RMSE:40.678
n_estimators:60, RMSE:40.604
n_estimators:70, RMSE:40.535
n_estimators:80, RMSE:40.404
n_estimators:90, RMSE:40.377
n_estimators:100, RMSE:40.430
n_estimators:110, RMSE:40.472
n_estimators:120, RMSE:40.522
n_estimators:130, RMSE:40.530
n_estimators:140, RMSE:40.465
n_estimators:150, RMSE:40.436
n_estimators:160, RMSE:40.462
n_estimators:170, RMSE:40.479
n_estimators:180, RMSE:40.482
n_estimators:190, RMSE:40.468
n_estimators:200, RMSE:40.454


In [35]:

mean_rmse = {}
for d in [10, 15, 20, 25]:
    rmse_list = []
    for n in range(10, 201, 10):
        rf_regressor = RandomForestRegressor(n_estimators=n,
                                             max_depth=d,
                                            random_state=1,
                                            n_jobs=-1)
        rf_regressor.fit(x_train_matrix, y_train)
        y_val_preds = rf_regressor.predict(x_val_matrix)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_preds))
        rmse_list.append(rmse)
    avg_rmse = np.mean(rmse_list)
mean_rmse[d] = avg_rmse

print(f"Mean RMSE for max_depth{d}: {avg_rmse:.3f}")

Mean RMSE for max_depth25: 40.611


## Q5

In [37]:
rfr = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rfr.fit(x_train_matrix, y_train)
importances = rfr.feature_importances_
feature_names = dv.get_feature_names_out()

importance_df = pd.DataFrame({'feature': feature_names,
                              'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)
most_important = importance_df.iloc[0]
print(most_important)

feature       study_hours_per_week
importance                0.246331
Name: 27, dtype: object
