In [98]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv


In [99]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "Student_Performance.csv"

# Load the latest version
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "nikhil7280/student-performance-multiple-linear-regression",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [100]:
def rename_col(cols):
    col_list = []
    for x in list(cols):
        x = x.replace(" ","_")
        col_list.append(x)
    return col_list

df.columns = rename_col(df.columns)
df.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [101]:
df.isna().sum()

Hours_Studied                       0
Previous_Scores                     0
Extracurricular_Activities          0
Sleep_Hours                         0
Sample_Question_Papers_Practiced    0
Performance_Index                   0
dtype: int64

In [102]:
dummies = pd.get_dummies(df[['Extracurricular_Activities']], dtype='int')
dummies

Unnamed: 0,Extracurricular_Activities_No,Extracurricular_Activities_Yes
0,0,1
1,1,0
2,0,1
3,0,1
4,1,0
...,...,...
9995,0,1
9996,0,1
9997,0,1
9998,0,1


In [103]:
df = pd.concat([df.drop(columns=['Extracurricular_Activities'], axis=1), dummies], axis=1)
df

Unnamed: 0,Hours_Studied,Previous_Scores,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index,Extracurricular_Activities_No,Extracurricular_Activities_Yes
0,7,99,9,1,91.0,0,1
1,4,82,4,2,65.0,1,0
2,8,51,7,2,45.0,0,1
3,5,52,5,2,36.0,0,1
4,7,75,8,5,66.0,1,0
...,...,...,...,...,...,...,...
9995,1,49,4,2,23.0,0,1
9996,7,64,8,5,58.0,0,1
9997,6,83,8,5,74.0,0,1
9998,9,97,7,0,95.0,0,1


In [104]:
from sklearn.model_selection import train_test_split
x,y = df.drop(columns=['Performance_Index'], axis=1), df[['Performance_Index']]
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [105]:
print(x_train.shape)
print(y_train.shape)

(7000, 6)
(7000, 1)


In [106]:
print(x_test.shape)
print(y_test.shape)

(3000, 6)
(3000, 1)


In [107]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_scaled = scaler.fit_transform(x_train)
test_scaled = scaler.fit_transform(x_test)

In [108]:
x_train = pd.DataFrame(train_scaled,columns = x_train.columns, index = x_train.index)
x_test = pd.DataFrame(test_scaled, columns = x_test.columns, index = x_test.index) 

In [109]:
train_df = pd.concat([x_train, y_train], axis=1)
test_df = pd.concat([x_test, y_test], axis=1)

In [110]:
train_df

Unnamed: 0,Hours_Studied,Previous_Scores,Sleep_Hours,Sample_Question_Papers_Practiced,Extracurricular_Activities_No,Extracurricular_Activities_Yes,Performance_Index
2560,-0.390775,-0.323451,1.457261,0.483329,0.985815,-0.985815,49.0
1663,-1.162949,0.310880,-0.317722,-1.609437,0.985815,-0.985815,49.0
709,0.767486,-0.784782,0.865600,-1.609437,0.985815,-0.985815,45.0
5159,0.767486,1.233543,-1.501044,1.529713,-1.014389,1.014389,82.0
191,-0.390775,1.348876,1.457261,-0.214259,0.985815,-0.985815,77.0
...,...,...,...,...,...,...,...
7195,1.539660,-1.707445,-0.909383,-1.609437,-1.014389,1.014389,38.0
6818,0.767486,-0.035119,-1.501044,-0.911848,-1.014389,1.014389,59.0
6581,-0.004688,-0.554117,1.457261,-1.609437,0.985815,-0.985815,45.0
76,1.153573,-0.092785,0.273939,-0.563054,0.985815,-0.985815,61.0


In [111]:
test_df

Unnamed: 0,Hours_Studied,Previous_Scores,Sleep_Hours,Sample_Question_Papers_Practiced,Extracurricular_Activities_No,Extracurricular_Activities_Yes,Performance_Index
28,-0.753142,1.438103,-0.301784,0.170601,-1.001334,1.001334,74.0
6480,-0.366519,-0.234423,-0.887014,1.566111,0.998668,-0.998668,49.0
5572,0.406728,-0.580463,0.868677,-1.573786,0.998668,-0.998668,45.0
1295,0.406728,1.553450,0.868677,1.566111,0.998668,-0.998668,84.0
5326,-1.139766,-1.676255,-1.472244,-0.178276,0.998668,-0.998668,20.0
...,...,...,...,...,...,...,...
5722,-1.526389,-0.349769,-1.472244,-0.178276,-1.001334,1.001334,35.0
700,-0.753142,-0.176749,-1.472244,0.519479,0.998668,-0.998668,45.0
3784,1.179975,-1.560909,0.283447,0.170601,0.998668,-0.998668,37.0
640,-1.526389,-0.234423,0.868677,-0.876031,0.998668,-0.998668,40.0


In [112]:
dependant_var = train_df.columns[-1]
independant_var = train_df.drop(columns=[train_df.columns[-1]]).columns

In [113]:
formula = f"{dependant_var} ~ {' + '.join(independant_var)}"
formula

'Performance_Index ~ Hours_Studied + Previous_Scores + Sleep_Hours + Sample_Question_Papers_Practiced + Extracurricular_Activities_No + Extracurricular_Activities_Yes'

In [114]:
import statsmodels.formula.api as sm

model = sm.ols(formula, data=train_df)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:      Performance_Index   R-squared:                       0.989
Model:                            OLS   Adj. R-squared:                  0.989
Method:                 Least Squares   F-statistic:                 1.226e+05
Date:                Wed, 26 Nov 2025   Prob (F-statistic):               0.00
Time:                        09:37:11   Log-Likelihood:                -14924.
No. Observations:                7000   AIC:                         2.986e+04
Df Residuals:                    6994   BIC:                         2.990e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [115]:
test_results = result.predict(test_df)
test_results

28      75.483312
6480    48.438397
5572    47.744585
1295    87.169267
5326    15.815223
          ...    
5722    37.034040
700     45.555434
3784    36.615873
640     39.943876
3144    38.638486
Length: 3000, dtype: float64

In [116]:
from sklearn import metrics

print(f"MAE: {metrics.mean_absolute_error(test_df['Performance_Index'], test_results)}")
print(f"MSE: {metrics.mean_squared_error(test_df['Performance_Index'], test_results)}")
print(f"RMSE: {np.sqrt(metrics.mean_squared_error(test_df['Performance_Index'], test_results))}")
print(f"r2_score: {metrics.r2_score(test_df['Performance_Index'], test_results)}")

MAE: 1.7579816691908554
MSE: 4.794481395532994
RMSE: 2.1896304244170963
r2_score: 0.987006481433952
