In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [13]:
# Read the student-performance CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Student_Performance.csv")
df.head(n=5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


### Data Preprocessing

In [14]:
# Count the missing values in each column (isna is the canonical name; isnull is its alias)
df.isna().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [15]:
# Convert the categorical 'Extracurricular Activities' column to integer codes.
# The encoder is fitted first and applied second; the preview below shows 'No' -> 0, 'Yes' -> 1.
le = LabelEncoder()
le.fit(df['Extracurricular Activities'])
df['Extracurricular Activities'] = le.transform(df['Extracurricular Activities'])
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


### Data Analysis

In [16]:
# Retrieve a summary of the DataFrame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int32  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int32(1), int64(4)
memory usage: 429.8 KB


In [27]:
# Retrieve a statistical summary (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,0.4948,6.5306,4.5833,55.2248
std,2.589309,17.343152,0.499998,1.695863,2.867348,19.212558
min,1.0,40.0,0.0,4.0,0.0,10.0
25%,3.0,54.0,0.0,5.0,2.0,40.0
50%,5.0,69.0,0.0,7.0,5.0,55.0
75%,7.0,85.0,1.0,8.0,7.0,71.0
max,9.0,99.0,1.0,9.0,9.0,100.0


In [28]:
# Number of rows and columns in our dataset, as a (rows, columns) tuple
df.shape

(10000, 6)

### Model Training

In [17]:
# Build the feature matrix X and the target vector y as NumPy arrays.
# The feature order here is the order in which the fitted coefficients are later reported.
feature_columns = [
    'Hours Studied',
    'Previous Scores',
    'Extracurricular Activities',
    'Sleep Hours',
    'Sample Question Papers Practiced',
]
X = df[feature_columns].to_numpy()
y = df['Performance Index'].to_numpy()
X

array([[ 7, 99,  1,  9,  1],
       [ 4, 82,  0,  4,  2],
       [ 8, 51,  1,  7,  2],
       ...,
       [ 6, 83,  1,  8,  5],
       [ 9, 97,  1,  7,  0],
       [ 7, 74,  0,  8,  1]], dtype=int64)

In [30]:
# Preview the target vector (the 'Performance Index' values)
y

array([91., 65., 45., ..., 74., 95., 64.])

In [18]:
# Splitting our dataset for testing and training (80% train / 20% test).
# train_test_split shuffles the rows randomly; without a fixed seed every
# Restart & Run All yields a different split and therefore different
# coefficients, intercept and score. Pinning the seed makes the notebook reproducible.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [19]:
# Creating the Linear Regression object with default settings (it is not fitted yet)
model = LinearRegression()

In [27]:
# Training our model on the training split only; the test split is kept aside for evaluation
model.fit(X_train,y_train)

In [28]:
# Learned coefficients, one per feature, in the same column order used to build X
model.coef_

array([2.84983168, 1.01782832, 0.62449167, 0.48227479, 0.19772818])

In [29]:
# Learned intercept (constant term) of the fitted linear model
model.intercept_

-34.056612281025345

### Evaluating our Model on Test Data

In [30]:
# Coefficient of determination (R^2) of the model's predictions on the held-out test split
model.score(X_test,y_test)

0.9892639217277599