## Project: student performance

In [1]:
# modules/libraries:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# load data:
df = pd.read_csv("Student_Performance.csv")
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [3]:
# explore I:
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [4]:
# explore II:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [5]:
# one hot encoding leaving one category as Yes:
df = pd.get_dummies(df, columns = ['Extracurricular Activities'], drop_first = True)
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_Yes
0,7,99,9,1,91.0,1
1,4,82,4,2,65.0,0
2,8,51,7,2,45.0,1
3,5,52,5,2,36.0,1
4,7,75,8,5,66.0,0


In [6]:
# rearange columns:
cols = ['Hours Studied',
 'Previous Scores',
 'Sleep Hours',
 'Sample Question Papers Practiced',
 'Extracurricular Activities_Yes',
 'Performance Index']
df = df[cols]
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Extracurricular Activities_Yes,Performance Index
0,7,99,9,1,1,91.0
1,4,82,4,2,0,65.0
2,8,51,7,2,1,45.0
3,5,52,5,2,1,36.0
4,7,75,8,5,0,66.0


In [None]:
# machine learning modules/libraries:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# setup data:
X = df.iloc[:, :-1]
y = df[["Performance Index"]]
# split data:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [None]:
# scale/normalize the data X_train and X_test:
scaler = StandardScaler()
X_norm_train = scaler.fit_transform(X_train)
X_norm_test = scaler.fit_transform(X_test)

In [None]:
# fit/train the model:
model = LinearRegression()
model.fit(X_norm_train, y_train)

In [None]:
# checking parameters or weights:
w_norm = model.coef_ # coefficints or weights
b_norm = model.intercept_ # b parameter
print(f"w parameters: {w_norm}", f" and b parameter: {b_norm}")

In [None]:
# prediction:
y_pred = model.predict(X_norm_test)
y_pred = np.round_(y_pred, decimals = 2)

In [None]:
# assess the performance and accuracy of the model (y_test and y_pred)
# we shoud get low MSE closer to 0 and high RS closer to 1
# MSE is a cost function/loss function!
mean_er = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error is: ', mean_er) # from 0 to infinity and lower the better
print('R Square is: ', r2) # between 0 and 1 and closer to 1 is better

In [None]:
# convert y_test into a numpy array:
real_results = y_test['Performance Index'].values
predicted_results = y_pred.ravel()

In [None]:
# final visualisations and comparison:
plt.scatter(real_results, predicted_results, marker = 'o', c = 'orange')
plt.show()

In [None]:
# more detailed comparison:
plt.scatter(range(len(real_results)),
            real_results, marker = 'o',
            c = 'orange',
            alpha = 0.5,
            label = 'Real Results')
plt.scatter(range(len(predicted_results)),
            predicted_results,
            marker = 'o',
            c = 'purple',
            alpha = 0.5,
            label = 'Predicted Results')
plt.legend()
plt.show()

In [None]:
# another comparison:
final_df = pd.DataFrame({'Actual performance': real_results, 'Predicted Performance': predicted_results})
final_df.head()

In [None]:
# function to get a difference between actual and predicted values:
def compute_difference(col):
    x = col['Actual performance']
    y = col['Predicted Performance']
    if x > y:
        return x - y
    else:
        return y - x
final_df['Difference'] = final_df.apply(compute_difference, axis = 1)
final_df.head()

In [None]:
# average difference between actual and predicted values:
mn = round(final_df['Difference'].mean(), 2)
mx = round(final_df['Difference'].max(), 2)
mi = round(final_df['Difference'].min(), 2)
print(f"Average difference value: {mn}")
print(f"Maximum difference value: {mx}")
print(f"Minimum difference value: {mi}")

In [None]:
# percentage of difference between 0 and 2:
print("Percentage of difference between 0 and 2 is: " 
      + str(round((len(final_df[(final_df['Difference'] >= 0.0)
              & (final_df['Difference'] <= 2.0)]) / len(final_df)) * 100, 2)) + str("%"))

## SVM

In [None]:
# machine learning modules/libraries:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# setup data:
X = df.iloc[:, :-1]
y = df[["Performance Index"]]
# split data:
X_train, X_test, y_train, y_test = (
    train_test_split(X, y, test_size = 0.3, random_state = 100))

In [None]:
# scale/normalize the data X_train and X_test:
scaler = StandardScaler()
X_norm_train = scaler.fit_transform(X_train)
X_norm_test = scaler.fit_transform(X_test)

# SVR as LinearSVR():

In [None]:
# model:
svm_model = LinearSVR()
svm_model.fit(X_norm_train, y_train)

In [None]:
# predict:
y_pred = svm_model.predict(X_norm_test)
y_pred = np.round_(y_pred, decimals = 2)

In [None]:
# test and validation:
mean_er = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error is: ', mean_er) # from 0 to infinity and lower the better
print('R Square is: ', r2) # between 0 and 1 and closer to 1 is better

## SVR with kernel = "linear" as SVR(kernel = "linear"):

In [None]:
# using kernel:
regressor = SVR(kernel = "linear")
regressor.fit(X_norm_train, y_train)

In [None]:
# predict:
y_pred = regressor.predict(X_norm_test)
y_pred = np.round_(y_pred, decimals = 2)

In [None]:
# test and validation:
mean_er = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error is: ', mean_er) # from 0 to infinity and lower the better
print('R Square is: ', r2) # between 0 and 1 and closer to 1 is better

## Comparing models accuracy:

In [7]:
# machine learning modules/libraries:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [8]:
# setup data:
X = df.iloc[:, :-1]
y = df[["Performance Index"]]
# split data:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [9]:
# scale/normalize the data X_train and X_test:
scaler = StandardScaler()
X_norm_train = scaler.fit_transform(X_train)
X_norm_test = scaler.fit_transform(X_test)

In [10]:
# models:
lr_model = LinearRegression()
lr_model.fit(X_norm_train, y_train)

lsvr_model = LinearSVR()
lsvr_model.fit(X_norm_train, y_train)

svr_l_model = SVR(kernel = "linear")
svr_l_model.fit(X_norm_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [11]:
# prediction:
y_pred_lr = lr_model.predict(X_norm_test)
y_pred_lr = np.round_(y_pred_lr, decimals = 2)

y_pred_lsvr = lsvr_model.predict(X_norm_test)
y_pred_lsvr = np.round_(y_pred_lsvr, decimals = 2)

y_pred_svr_l = svr_l_model.predict(X_norm_test)
y_pred_svr_l = np.round_(y_pred_svr_l, decimals = 2)

In [12]:
models = [y_pred_lr, y_pred_lsvr, y_pred_svr_l]
models

[array([[65.28],
        [48.1 ],
        [35.89],
        ...,
        [70.36],
        [32.98],
        [14.91]]),
 array([65.23, 48.08, 35.83, ..., 70.29, 32.94, 14.85]),
 array([65.25, 48.11, 35.87, ..., 70.32, 32.99, 14.9 ])]

In [13]:
for x in models:
    mean_er = mean_squared_error(y_test, x)
    print(mean_er)

4.437524233333334
4.468414166666666
4.4491128


In [18]:
for x in models:
    r2 = r2_score(y_test, x)
    print(r2)

0.9880777729964348
0.9879947814953274
0.9880466381754894
