In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import pickle
import json

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas / scikit-learn deprecation
# notices raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw student-performance data.
# Forward slashes are used on purpose: the previous backslash path relied on
# the invalid escape sequences "\c" and "\S" (a SyntaxWarning on recent Python
# versions) and could not be resolved on non-Windows systems. Forward slashes
# are accepted on Windows as well.
df = pd.read_csv('static/csv_files/Student_Performance.csv')
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [3]:
# Switch the CSV's human-readable headers to snake_case identifiers so the
# columns are easier to reference in the cells below.
df = df.rename(
    columns={
        "Hours Studied": "hours_studied",
        "Previous Scores": "previous_scores",
        "Extracurricular Activities": "extracurricular_activities",
        "Sleep Hours": "sleep_hours",
        "Sample Question Papers Practiced": "sample_question_papers_practiced",
        "Performance Index": "performance_index",
    }
)

In [4]:
# Print the column names, non-null counts and dtypes of the renamed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   hours_studied                     10000 non-null  int64  
 1   previous_scores                   10000 non-null  int64  
 2   extracurricular_activities        10000 non-null  object 
 3   sleep_hours                       10000 non-null  int64  
 4   sample_question_papers_practiced  10000 non-null  int64  
 5   performance_index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


## EDA

In [5]:
# Display the extracurricular_activities column before it is encoded.
df['extracurricular_activities']

0       Yes
1        No
2       Yes
3       Yes
4        No
       ... 
9995    Yes
9996    Yes
9997    Yes
9998    Yes
9999     No
Name: extracurricular_activities, Length: 10000, dtype: object

In [6]:
# Count how often each distinct value occurs in extracurricular_activities.
df['extracurricular_activities'].value_counts()

extracurricular_activities
No     5052
Yes    4948
Name: count, dtype: int64

In [7]:
# Encode the Yes/No answers as 1/0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection: the chained in-place form acts
# on an intermediate object and stops updating `df` once pandas Copy-on-Write
# is active. The explicit astype pins an integer dtype regardless of how
# replace() infers the result, and re-running the cell stays a no-op.
df['extracurricular_activities'] = (
    df['extracurricular_activities']
    .replace({'Yes': 1, 'No': 0})
    .astype('int64')
)

In [8]:
# Lookup table translating a Yes/No answer into its numeric code.
extracurricular_activities_data = dict(Yes=1, No=0)

In [9]:
# Re-check the dtypes now that extracurricular_activities has been encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   hours_studied                     10000 non-null  int64  
 1   previous_scores                   10000 non-null  int64  
 2   extracurricular_activities        10000 non-null  int64  
 3   sleep_hours                       10000 non-null  int64  
 4   sample_question_papers_practiced  10000 non-null  int64  
 5   performance_index                 10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


### Model Training

In [10]:
# Target is the performance index; every remaining column is a feature.
y = df['performance_index']
x = df.drop(columns='performance_index')

# Hold out 20% of the rows for evaluation, with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=13,
)

In [11]:
# Fit the linear regression on the training split; fit() hands back the
# estimator itself, so construction and training can be chained.
linear_reg = LinearRegression().fit(x_train, y_train)
linear_reg

### Evaluation

In [12]:
# Score the fitted model on the held-out test split.
y_pred = linear_reg.predict(x_test)

# Compute all metrics first, then report them together.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

print("RMSE :", rmse)
print("MAE :", mae)
print("R-Squared :", r2_value)

RMSE : 2.035802552869104
MAE : 1.6215037182212297
R-Squared : 0.9891710796460919


In [13]:
# Score the model on the rows it was trained on, for comparison with the
# test-split metrics.
# Note: mse / rmse / mae / r2_value are rebound here to the training values.
y_pred_train = linear_reg.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train)
r2_value = r2_score(y_train, y_pred_train)

print("RMSE :", rmse)
print("MAE :", mae)
print("R-Squared :", r2_value)

RMSE : 2.0381062516712043
MAE : 1.6165238326322477
R-Squared : 0.9886271538431827


## Single Row Testing

In [35]:
# Pick the row at position 55 of the test split, kept as a one-row DataFrame.
x_test.iloc[55:56]

Unnamed: 0,hours_studied,previous_scores,extracurricular_activities,sleep_hours,sample_question_papers_practiced
19,5,75,0,7,0


In [36]:
# Display the frame after renaming and encoding.
# NOTE(review): df.head() would keep the rendered output smaller.
df


Unnamed: 0,hours_studied,previous_scores,extracurricular_activities,sleep_hours,sample_question_papers_practiced,performance_index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,1,4,2,23.0
9996,7,64,1,8,5,58.0
9997,6,83,1,8,5,74.0
9998,9,97,1,7,0,95.0


In [37]:
# Predict for the row at position 55 of the test split and unwrap the single
# value from the returned array.
linear_reg.predict(x_test.iloc[55:56])[0]

59.93295397114955

In [39]:
# Capture the feature names, in training order, as a plain list.
column_names = list(x.columns)
column_names

['hours_studied',
 'previous_scores',
 'extracurricular_activities',
 'sleep_hours',
 'sample_question_papers_practiced']

In [40]:
# Number of feature columns in the design matrix.
x.shape[1]

5

In [41]:
# Number of features the estimator saw during fit.
linear_reg.n_features_in_

5

In [47]:
# Hand-written example input for a single prediction.
hours_studied = 4
previous_scores = 82
extracurricular_activities = "Yes"
sleep_hours = 7
sample_question_papers_practiced = 3

# Encode the Yes/No answer with the same lookup table used for the training data.
extracurricular_activities = extracurricular_activities_data[extracurricular_activities]

# Build a one-row frame keyed by feature name and then order it like the
# training matrix. The model was fitted on a named DataFrame; passing a bare
# ndarray filled by hard-coded positions silently depends on column order and
# makes scikit-learn warn about missing feature names. Selecting with
# x.columns raises a KeyError if a training feature is missing here.
test_row = pd.DataFrame(
    [
        {
            "hours_studied": hours_studied,
            "previous_scores": previous_scores,
            "extracurricular_activities": extracurricular_activities,
            "sleep_hours": sleep_hours,
            "sample_question_papers_practiced": sample_question_papers_practiced,
        }
    ]
)[x.columns.tolist()]

# The variable name is kept unchanged; the value is the predicted
# performance_index, rounded to three decimals.
predicted_charges = np.around(linear_reg.predict(test_row)[0], 3)
predicted_charges

65.394

In [49]:
# Serialise the fitted estimator to disk in binary mode.
with open('linear_regression.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(linear_reg))

In [50]:
# Bundle the Yes/No lookup table and the ordered feature names into one dict
# and write it out as JSON.
project_data = {
    "Extracurricular_Activities": extracurricular_activities_data,
    "Column Names": column_names,
}

with open('proj_data.json', 'w') as json_file:
    json_file.write(json.dumps(project_data))