# **Student Performance Analysis using Multiple Linear Regression**

> **1. Import Important Libraries :**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

> **2. Import Dataset :**

In [4]:
# Location of the Student Performance CSV inside the Kaggle input directory.
# Kept in one named constant so the path is easy to change outside Kaggle.
DATA_PATH = '/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv'

# Load the raw dataset into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)


In [8]:
# Preview the first five rows to sanity-check that the columns loaded as expected.
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [9]:
# Preview the last five rows; the final index also confirms the row count.
df.tail()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0
9999,7,74,No,8,1,64.0


In [5]:
# Column dtypes and non-null counts: shows whether any values are missing
# and which columns are non-numeric and therefore need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


> **EDA :**

In [10]:
# List the distinct labels in the only non-numeric column before encoding it.
df['Extracurricular Activities'].unique()

array(['Yes', 'No'], dtype=object)

In [32]:
# Encoding categorical values
#
# 'Extracurricular Activities' holds the labels 'Yes' / 'No' and must be numeric
# before it can be fed to the regression.
# NOTE: the coding is Yes -> 0 and No -> 1, so a *negative* coefficient on this
# column means that NOT taking part lowers the predicted Performance Index.

d = {'Yes':0, 'No':1}
activity = df['Extracurricular Activities']

# Series.map() turns every value that is not a key of `d` into NaN, so running
# this cell a second time on the already-encoded 0/1 column would wipe it out.
# Only map when the column is not yet fully encoded, which makes the cell
# safe to re-run.
if not activity.isin(list(d.values())).all():
    encoded = activity.map(d)
    # Fail fast here instead of passing NaNs on to the model-fitting step.
    if encoded.isna().any():
        raise ValueError(
            "'Extracurricular Activities' contains values other than 'Yes'/'No'"
        )
    df['Extracurricular Activities'] = encoded


In [13]:
# Re-inspect the first rows: 'Extracurricular Activities' should now show 0/1
# (Yes -> 0, No -> 1) instead of the original text labels.
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,0,9,1,91.0
1,4,82,1,4,2,65.0
2,8,51,0,7,2,45.0
3,5,52,0,5,2,36.0
4,7,75,1,8,5,66.0


In [14]:
# Same check on the last rows of the encoded frame.
df.tail()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
9995,1,49,0,4,2,23.0
9996,7,64,0,8,5,58.0
9997,6,83,0,8,5,74.0
9998,9,97,0,7,0,95.0
9999,7,74,1,8,1,64.0


In [15]:
# Confirm the encoding: every column should now be numeric and the
# non-null counts should be unchanged (no NaNs introduced by the mapping).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int64  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


> **3. Splitting the Dataset :**

In [16]:
# Separate the response variable from the predictors.
target_column = 'Performance Index'

Y = df[target_column]                 # response: the value to predict
X = df.drop(columns=[target_column])  # predictors: every remaining column

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore every metric reported below, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


> **4. Training the Model :**

In [19]:
from sklearn.linear_model import LinearRegression

sklearn.linear_model._base.LinearRegression

In [20]:
# Fit an ordinary least-squares linear model on the training split only.
# fit() returns the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(x_train, y_train)

# Leave the fitted estimator as the last expression so the cell still displays it.
regressor

> **5. Evaluating Results :**

In [21]:
# Predict the Performance Index for the held-out test rows (not used in fitting).
y_pred = regressor.predict(x_test)

In [22]:
# Side-by-side table of true vs. predicted values, keeping the original row
# index of the test split so individual students can be traced back to `df`.
out = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
out

Unnamed: 0,Actual,Predicted
6252,51.0,54.711854
4684,20.0,22.615513
1731,46.0,47.903145
4742,28.0,31.289767
4521,41.0,43.004570
...,...,...
6412,45.0,46.886280
8285,66.0,62.698025
7853,16.0,16.793420
1095,65.0,63.343274


> **Model Evaluation :**

In [23]:
from sklearn.metrics import r2_score, mean_squared_error

In [26]:
# Coefficient of determination (R^2) measured on the held-out test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

# On this run R^2 is about 0.989, i.e. close to 1, so the linear model
# explains almost all of the variance in the test-set Performance Index.

0.9889832909573145

In [27]:
# Mean squared error on the test split. It is expressed in squared
# Performance Index points; take its square root for an error in index points.
sq_err = mean_squared_error(y_true=y_test, y_pred=y_pred)
sq_err

4.0826283985218526

In [28]:
# Fitted slope for each predictor, in the column order of X:
# Hours Studied, Previous Scores, Extracurricular Activities (Yes=0, No=1),
# Sleep Hours, Sample Question Papers Practiced.
regressor.coef_

array([ 2.85248393,  1.0169882 , -0.60861668,  0.47694148,  0.19183144])

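* So, the fitted multiple linear regression equation can be written as follows (coefficients rounded to two decimals; the constant term is the value of `regressor.intercept_` shown in the next cell):

> 'Performance Index' = -33.31 + 2.85 × 'Hours Studied' + 1.02 × 'Previous Scores' + (-0.61) × 'Extracurricular Activities' + 0.48 × 'Sleep Hours' + 0.19 × 'Sample Question Papers Practiced'

* Note that 'Extracurricular Activities' was encoded as Yes = 0 and No = 1, so its negative coefficient means that *not* taking part in extracurricular activities lowers the predicted Performance Index by about 0.61 points.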

In [29]:
# Constant term of the fitted equation: the prediction when every predictor is 0.
# It must be added to the weighted sum of the features to obtain a prediction.
regressor.intercept_

-33.31332953597997