In [39]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

In [40]:
# Source of dataset: https://www.kaggle.com/datasets/lainguyn123/student-performance-factors
# The CSV location can be overridden with the STUDENT_PERFORMANCE_CSV
# environment variable so the notebook is not tied to a single machine.
# When the variable is unset, the original local path is used unchanged.
csv_path = os.environ.get(
    'STUDENT_PERFORMANCE_CSV',
    'C:\\Users\\sulej\\OneDrive\\Рабочий стол\\7 semester\\Data Mining\\StudentPerformanceFactors.csv',
)
pdDf = pd.read_csv(csv_path)

In [44]:
# Preview the loaded frame.
print(pdDf)
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
pdDf.info()

      Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0                23          84                  Low                High   
1                19          64                  Low              Medium   
2                24          98               Medium              Medium   
3                29          89                  Low              Medium   
4                19          92               Medium              Medium   
...             ...         ...                  ...                 ...   
6602             25          69                 High              Medium   
6603             23          76                 High              Medium   
6604             20          90               Medium                 Low   
6605             10          86                 High                High   
6606             15          67               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours  Previous_Scores  \
0                     

In [45]:
# As we saw in Lecture #2, we use Label Encoding for columns with binary data,
# i.e. when there are only two outcomes: Yes or No.

In [46]:
# MinMaxScaler formula: transformed value = (value - min) / (max - min)

In [47]:
# Iterating a DataFrame yields its column labels; print one label per line.
for col in pdDf:
    print(col)

Hours_Studied
Attendance
Parental_Involvement
Access_to_Resources
Extracurricular_Activities
Sleep_Hours
Previous_Scores
Motivation_Level
Internet_Access
Tutoring_Sessions
Family_Income
Teacher_Quality
School_Type
Peer_Influence
Physical_Activity
Learning_Disabilities
Parental_Education_Level
Distance_from_Home
Gender
Exam_Score


In [48]:
from sklearn.preprocessing import MinMaxScaler

In [49]:
# Min-max scaler with its default target range spelled out explicitly.
scaler = MinMaxScaler(feature_range=(0, 1))

In [50]:
# Fit the scaler on the two numeric columns and write the scaled values back
# into the same columns of pdDf.
numeric_cols = ['Hours_Studied', 'Attendance']
pdDf[numeric_cols] = scaler.fit_transform(pdDf[numeric_cols])

In [51]:
# Show the frame after Hours_Studied and Attendance were min-max scaled in place.
print(pdDf)

      Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0          0.511628       0.600                  Low                High   
1          0.418605       0.100                  Low              Medium   
2          0.534884       0.950               Medium              Medium   
3          0.651163       0.725                  Low              Medium   
4          0.418605       0.800               Medium              Medium   
...             ...         ...                  ...                 ...   
6602       0.558140       0.225                 High              Medium   
6603       0.511628       0.400                 High              Medium   
6604       0.441860       0.750               Medium                 Low   
6605       0.209302       0.650                 High                High   
6606       0.325581       0.175               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours  Previous_Scores  \
0                     

In [52]:
# Encode categorical variables using one-hot encoding with pd.get_dummies()
# or sklearn.preprocessing.OneHotEncoder.

In [53]:
# First, I want to try pd.get_dummies().

In [54]:
# One-hot encode Gender into a new frame; drop_first=True drops the first
# category's indicator column so only one column is kept for a binary feature.
pdDf_encoded = pd.get_dummies(
    data=pdDf,
    columns=['Gender'],
    drop_first=True,
)

In [55]:
# Show the frame returned by pd.get_dummies() for the Gender column.
print(pdDf_encoded)

      Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0          0.511628       0.600                  Low                High   
1          0.418605       0.100                  Low              Medium   
2          0.534884       0.950               Medium              Medium   
3          0.651163       0.725                  Low              Medium   
4          0.418605       0.800               Medium              Medium   
...             ...         ...                  ...                 ...   
6602       0.558140       0.225                 High              Medium   
6603       0.511628       0.400                 High              Medium   
6604       0.441860       0.750               Medium                 Low   
6605       0.209302       0.650                 High                High   
6606       0.325581       0.175               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours  Previous_Scores  \
0                     

In [56]:
# Inspect the indicator column that get_dummies created for Gender.
print(pdDf_encoded['Gender_Male'])

0       1
1       0
2       1
3       1
4       0
       ..
6602    0
6603    0
6604    0
6605    0
6606    1
Name: Gender_Male, Length: 6607, dtype: uint8


In [57]:
# pd.get_dummies() is simpler to use but doesn't offer as much customization.
# OneHotEncoder is more powerful and flexible, making it better suited for integration with machine learning pipelines.

In [58]:
# Now I want to try sklearn's OneHotEncoder.

In [59]:
from sklearn.preprocessing import OneHotEncoder

In [60]:
# Build an encoder that returns a dense array instead of a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    encoder = OneHotEncoder(sparse=False)

In [61]:
# Double brackets keep School_Type as a one-column frame, which is the 2-D
# input the encoder is fitted on and then transforms.
school_type_frame = pdDf[['School_Type']]
encoded_data = encoder.fit_transform(school_type_frame)

In [62]:
# Wrap the encoded array in a DataFrame, naming its columns with the feature
# names the fitted encoder reports for School_Type.
school_type_columns = encoder.get_feature_names_out(['School_Type'])
pdDf_encoded2 = pd.DataFrame(encoded_data, columns=school_type_columns)

In [63]:
# Display the one-hot indicator columns built from School_Type.
print(pdDf_encoded2)

      School_Type_Private  School_Type_Public
0                     0.0                 1.0
1                     0.0                 1.0
2                     0.0                 1.0
3                     0.0                 1.0
4                     0.0                 1.0
...                   ...                 ...
6602                  0.0                 1.0
6603                  0.0                 1.0
6604                  0.0                 1.0
6605                  1.0                 0.0
6606                  0.0                 1.0

[6607 rows x 2 columns]


In [64]:
# Same dense encoder, now with drop='first' so the first category's indicator
# column is omitted. scikit-learn 1.2 renamed the `sparse` keyword to
# `sparse_output` and 1.4 removed the old name, so try the current keyword
# first and fall back to the legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    encoder = OneHotEncoder(sparse=False, drop='first')

In [65]:
# Re-fit the current encoder on School_Type, passed as a one-column frame,
# and keep the transformed array.
school_type_frame = pdDf[['School_Type']]
encoded_data = encoder.fit_transform(school_type_frame)

In [66]:
# Rebuild the encoded frame, taking the column names from the fitted encoder.
school_type_columns = encoder.get_feature_names_out(['School_Type'])
pdDf_encoded2 = pd.DataFrame(encoded_data, columns=school_type_columns)

In [67]:
# Display the School_Type encoding produced by the encoder created with drop='first'.
print(pdDf_encoded2)

      School_Type_Public
0                    1.0
1                    1.0
2                    1.0
3                    1.0
4                    1.0
...                  ...
6602                 1.0
6603                 1.0
6604                 1.0
6605                 0.0
6606                 1.0

[6607 rows x 1 columns]


In [68]:
# Use pd.cut() to bin continuous variables into discrete intervals.

In [69]:
# We use pd.cut() when we have continuous data and need to categorize it by defining ranges or intervals.

In [70]:
# Each grade band is (label, upper edge); the lowest band starts at 0.
grade_bands = (
    ('Fail', 50),
    ('Pass', 70),
    ('Not Bad', 80),
    ('Good', 90),
    ('Excellent', 100),
)
# Bin edges: the shared lower bound followed by every band's upper edge.
bins = [0] + [upper_edge for _, upper_edge in grade_bands]
# One label per interval between consecutive edges.
labels = [band_name for band_name, _ in grade_bands]

In [71]:
# Replace the numeric Previous_Scores column with its grade band.
# pd.cut uses right-closed intervals by default, so the first bin would be
# (0, 50] and a score of exactly 0 would turn into NaN; include_lowest=True
# closes the left edge of the first bin so that 0 is labelled too.
pdDf['Previous_Scores'] = pd.cut(
    pdDf['Previous_Scores'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [72]:
# Show the frame after Previous_Scores was replaced by its binned labels.
print(pdDf)

      Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0          0.511628       0.600                  Low                High   
1          0.418605       0.100                  Low              Medium   
2          0.534884       0.950               Medium              Medium   
3          0.651163       0.725                  Low              Medium   
4          0.418605       0.800               Medium              Medium   
...             ...         ...                  ...                 ...   
6602       0.558140       0.225                 High              Medium   
6603       0.511628       0.400                 High              Medium   
6604       0.441860       0.750               Medium                 Low   
6605       0.209302       0.650                 High                High   
6606       0.325581       0.175               Medium                 Low   

     Extracurricular_Activities  Sleep_Hours Previous_Scores Motivation_Level  \
0     