In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder

In [2]:
df=pd.read_csv('Student_Performance.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9998 non-null   float64
 1   Previous Scores                   9996 non-null   float64
 2   Extracurricular Activities        9997 non-null   object 
 3   Sleep Hours                       9997 non-null   float64
 4   Sample Question Papers Practiced  9995 non-null   float64
 5   Performance Index                 9996 non-null   object 
dtypes: float64(4), object(2)
memory usage: 468.9+ KB


In [7]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7.0,99.0,Yes,9.0,1.0,91
1,4.0,82.0,No,4.0,2.0,65
2,8.0,51.0,Yes,7.0,2.0,45
3,5.0,52.0,Yes,5.0,2.0,36
4,7.0,75.0,No,8.0,5.0,66


# Bush kataklar sonini topish

In [10]:
df.isnull().sum()

Hours Studied                       2
Previous Scores                     4
Extracurricular Activities          3
Sleep Hours                         3
Sample Question Papers Practiced    5
Performance Index                   4
dtype: int64

In [12]:
df['Performance Index'].unique()

array(['91', '65', '45', '36', '66', '61', '63', '42', '69', '84', '73',
       '27', '33', '68', '43', '67', '70', '30', '71', '85', '57', '35',
       '49', '83', '74', '39', '58', '47', '60', '32', '64', '54', '17',
       '53', '75', '52', '78', '38', '98', '87', '41', '81', '15', '88',
       '95', '29', '21', '76', '25', '34', '50', '56', '82', '23', '46',
       '92', '77', '86', '44', '94', '40', '100', '31', '26', '18', '51',
       '72', '16', '28', '89', '48', '37', '62', '59', nan, '19', 'haha',
       'hehe', '79', '22', '10', '90', '80', '24', '20', '96', '55', '97',
       '12', '93', '14', '99', '11', '13'], dtype=object)

# Performance Index ustunidagi begona indekslarni raqamga aylantirish

In [15]:
df['Performance Index']=pd.to_numeric(df['Performance Index'], errors='coerce')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9998 non-null   float64
 1   Previous Scores                   9996 non-null   float64
 2   Extracurricular Activities        9997 non-null   object 
 3   Sleep Hours                       9997 non-null   float64
 4   Sample Question Papers Practiced  9995 non-null   float64
 5   Performance Index                 9994 non-null   float64
dtypes: float64(5), object(1)
memory usage: 468.9+ KB


In [19]:
df['Performance Index'].unique()

array([ 91.,  65.,  45.,  36.,  66.,  61.,  63.,  42.,  69.,  84.,  73.,
        27.,  33.,  68.,  43.,  67.,  70.,  30.,  71.,  85.,  57.,  35.,
        49.,  83.,  74.,  39.,  58.,  47.,  60.,  32.,  64.,  54.,  17.,
        53.,  75.,  52.,  78.,  38.,  98.,  87.,  41.,  81.,  15.,  88.,
        95.,  29.,  21.,  76.,  25.,  34.,  50.,  56.,  82.,  23.,  46.,
        92.,  77.,  86.,  44.,  94.,  40., 100.,  31.,  26.,  18.,  51.,
        72.,  16.,  28.,  89.,  48.,  37.,  62.,  59.,  nan,  19.,  79.,
        22.,  10.,  90.,  80.,  24.,  20.,  96.,  55.,  97.,  12.,  93.,
        14.,  99.,  11.,  13.])

# Performance Index ustundgi nan suzini tashlab yuborish

In [22]:
df = df.dropna(subset=['Performance Index'])

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9992 non-null   float64
 1   Previous Scores                   9990 non-null   float64
 2   Extracurricular Activities        9991 non-null   object 
 3   Sleep Hours                       9991 non-null   float64
 4   Sample Question Papers Practiced  9989 non-null   float64
 5   Performance Index                 9994 non-null   float64
dtypes: float64(5), object(1)
memory usage: 546.5+ KB


In [26]:
df['Performance Index'].unique()

array([ 91.,  65.,  45.,  36.,  66.,  61.,  63.,  42.,  69.,  84.,  73.,
        27.,  33.,  68.,  43.,  67.,  70.,  30.,  71.,  85.,  57.,  35.,
        49.,  83.,  74.,  39.,  58.,  47.,  60.,  32.,  64.,  54.,  17.,
        53.,  75.,  52.,  78.,  38.,  98.,  87.,  41.,  81.,  15.,  88.,
        95.,  29.,  21.,  76.,  25.,  34.,  50.,  56.,  82.,  23.,  46.,
        92.,  77.,  86.,  44.,  94.,  40., 100.,  31.,  26.,  18.,  51.,
        72.,  16.,  28.,  89.,  48.,  37.,  62.,  59.,  19.,  79.,  22.,
        10.,  90.,  80.,  24.,  20.,  96.,  55.,  97.,  12.,  93.,  14.,
        99.,  11.,  13.])

# Bush kataklarni tuldirish

In [29]:
bush_katak=df.isnull().sum()>0

In [31]:
bush_katak

Hours Studied                        True
Previous Scores                      True
Extracurricular Activities           True
Sleep Hours                          True
Sample Question Papers Practiced     True
Performance Index                   False
dtype: bool

In [33]:
for col in df.columns[bush_katak]:
    if df[col].dtype=='object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9994 non-null   float64
 1   Previous Scores                   9994 non-null   float64
 2   Extracurricular Activities        9994 non-null   object 
 3   Sleep Hours                       9994 non-null   float64
 4   Sample Question Papers Practiced  9994 non-null   float64
 5   Performance Index                 9994 non-null   float64
dtypes: float64(5), object(1)
memory usage: 546.5+ KB


In [37]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7.0,99.0,Yes,9.0,1.0,91.0
1,4.0,82.0,No,4.0,2.0,65.0
2,8.0,51.0,Yes,7.0,2.0,45.0
3,5.0,52.0,Yes,5.0,2.0,36.0
4,7.0,75.0,No,8.0,5.0,66.0


In [39]:
bush_katak

Hours Studied                        True
Previous Scores                      True
Extracurricular Activities           True
Sleep Hours                          True
Sample Question Papers Practiced     True
Performance Index                   False
dtype: bool

In [41]:
df.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

# Yozuvli ustunlarni topamiz

In [44]:
yozuvli_ustun=df.select_dtypes(include=['object','category']).columns

In [46]:
yozuvli_ustun

Index(['Extracurricular Activities'], dtype='object')

In [48]:
for col in yozuvli_ustun:
    print(f"column {col}: {df[col].nunique()}")

column Extracurricular Activities: 2


In [50]:
df['Extracurricular Activities'].value_counts()

Extracurricular Activities
No     5049
Yes    4945
Name: count, dtype: int64

# Yozuvli ustunimizda categorylar soni bor yugi 2ta bulganligi uchun one hot encodingdan foydalanamiz

In [53]:
df = pd.get_dummies(df, columns=['Extracurricular Activities'])

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9994 non-null   float64
 1   Previous Scores                   9994 non-null   float64
 2   Sleep Hours                       9994 non-null   float64
 3   Sample Question Papers Practiced  9994 non-null   float64
 4   Performance Index                 9994 non-null   float64
 5   Extracurricular Activities_No     9994 non-null   bool   
 6   Extracurricular Activities_Yes    9994 non-null   bool   
dtypes: bool(2), float64(5)
memory usage: 488.0 KB


In [57]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_No,Extracurricular Activities_Yes
0,7.0,99.0,9.0,1.0,91.0,False,True
1,4.0,82.0,4.0,2.0,65.0,True,False
2,8.0,51.0,7.0,2.0,45.0,False,True
3,5.0,52.0,5.0,2.0,36.0,False,True
4,7.0,75.0,8.0,5.0,66.0,True,False


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9994 non-null   float64
 1   Previous Scores                   9994 non-null   float64
 2   Sleep Hours                       9994 non-null   float64
 3   Sample Question Papers Practiced  9994 non-null   float64
 4   Performance Index                 9994 non-null   float64
 5   Extracurricular Activities_No     9994 non-null   bool   
 6   Extracurricular Activities_Yes    9994 non-null   bool   
dtypes: bool(2), float64(5)
memory usage: 488.0 KB


# booliandan integerga utib olamiz

In [62]:
df[df.columns]=df[df.columns].astype(int)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours Studied                     9994 non-null   int32
 1   Previous Scores                   9994 non-null   int32
 2   Sleep Hours                       9994 non-null   int32
 3   Sample Question Papers Practiced  9994 non-null   int32
 4   Performance Index                 9994 non-null   int32
 5   Extracurricular Activities_No     9994 non-null   int32
 6   Extracurricular Activities_Yes    9994 non-null   int32
dtypes: int32(7)
memory usage: 351.4 KB


In [66]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_No,Extracurricular Activities_Yes
0,7,99,9,1,91,0,1
1,4,82,4,2,65,1,0
2,8,51,7,2,45,0,1
3,5,52,5,2,36,0,1
4,7,75,8,5,66,1,0


# Scaling qilamiz

In [71]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns=df.columns)


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9994 non-null   float64
 1   Previous Scores                   9994 non-null   float64
 2   Sleep Hours                       9994 non-null   float64
 3   Sample Question Papers Practiced  9994 non-null   float64
 4   Performance Index                 9994 non-null   float64
 5   Extracurricular Activities_No     9994 non-null   float64
 6   Extracurricular Activities_Yes    9994 non-null   float64
dtypes: float64(7)
memory usage: 546.7 KB


In [75]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_No,Extracurricular Activities_Yes
0,0.775255,1.704352,1.456136,-1.250138,1.862232,-1.010461,1.010461
1,-0.3838,0.723956,-1.492485,-0.901342,0.508842,0.989647,-0.989647
2,1.161607,-1.063827,0.276688,-0.901342,-0.532228,-1.010461,1.010461
3,0.002551,-1.006156,-0.902761,-0.901342,-1.000709,-1.010461,1.010461
4,0.775255,0.320263,0.866412,0.145047,0.560895,0.989647,-0.989647


# Training qilamiz

In [78]:
X = df.drop(columns=['Performance Index'])  # Features
y = df['Performance Index']  # Target variable

In [80]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test,X_val,y_test,y_val=train_test_split(X_temp,y_temp,test_size=0.5,random_state=42)

# Modelni tanlaymiz

In [83]:
model=LinearRegression()

# Modelni Training qildiramiz

In [85]:
model.fit(X_train,y_train)

# Model predict 

In [89]:
y_pred = model.predict(X_test)

In [91]:
y

0       1.862232
1       0.508842
2      -0.532228
3      -1.000709
4       0.560895
          ...   
9989   -1.677405
9990    0.144467
9991    0.977323
9992    2.070446
9993    0.456788
Name: Performance Index, Length: 9994, dtype: float64

In [93]:
y_pred[0]

1.6103233746062484

In [103]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9994 non-null   float64
 1   Previous Scores                   9994 non-null   float64
 2   Sleep Hours                       9994 non-null   float64
 3   Sample Question Papers Practiced  9994 non-null   float64
 4   Extracurricular Activities_No     9994 non-null   float64
 5   Extracurricular Activities_Yes    9994 non-null   float64
dtypes: float64(6)
memory usage: 468.6 KB


In [106]:
X.head()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Extracurricular Activities_No,Extracurricular Activities_Yes
0,0.775255,1.704352,1.456136,-1.250138,-1.010461,1.010461
1,-0.3838,0.723956,-1.492485,-0.901342,0.989647,-0.989647
2,1.161607,-1.063827,0.276688,-0.901342,-1.010461,1.010461
3,0.002551,-1.006156,-0.902761,-0.901342,-1.010461,1.010461
4,0.775255,0.320263,0.866412,0.145047,0.989647,-0.989647


# Modelni baholaymiz

In [108]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [110]:
print(f"\nModel Evaluation Metrics:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")


Model Evaluation Metrics:
Mean Squared Error (MSE): 0.01
Mean Absolute Error (MAE): 0.09
R² Score: 0.99
