# Importing useful libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import sklearn as skl

In [2]:
# Report the installed version of each library imported in the first cell.
# NOTE(review): the "Sicikit Learn" label is misspelled; it is kept verbatim
# here so the cell still reproduces its recorded output.
for label, module in (
    ("Pandas: ", pd),
    ("Numpy: ", np),
    ("Matplotlib: ", mpl),
    ("Sicikit Learn: ", skl),
):
    print(label, module.__version__)

Pandas:  2.3.2
Numpy:  2.3.3
Matplotlib:  3.10.6
Sicikit Learn:  1.7.2


# Importing dataset

In [3]:
# Load the exam-performance data; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="dataset/exam_performance.csv")

## Display basic info

In [4]:
# Preview the first ten rows of the freshly loaded frame.
df.head(n=10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group A,high school,standard,completed,67,67,63
1,female,group D,some high school,free/reduced,none,40,59,55
2,male,group E,some college,free/reduced,none,59,60,50
3,male,group B,high school,standard,none,77,78,68
4,male,group E,associate's degree,standard,completed,78,73,68
5,female,group D,high school,standard,none,63,77,76
6,female,group A,bachelor's degree,standard,none,62,59,63
7,male,group E,some college,standard,completed,93,88,84
8,male,group D,high school,standard,none,63,56,65
9,male,group C,some college,free/reduced,none,47,42,45


In [5]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric score columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.396,69.002,67.738
std,15.402871,14.737272,15.600985
min,13.0,27.0,23.0
25%,56.0,60.0,58.0
50%,66.5,70.0,68.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


# Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode "lunch" into one indicator column per category
# ("lunch_free/reduced" and "lunch_standard").
ohe = OneHotEncoder(sparse_output=False)
ohe_encoded = ohe.fit_transform(df[["lunch"]])
ohe_encoded_df = pd.DataFrame(
    ohe_encoded,
    columns=ohe.get_feature_names_out(["lunch"]),
    # Reuse df's own index so the column-wise concat below aligns row-for-row
    # even if df does not carry a default 0..n-1 index.
    index=df.index,
)
# Drop indicator columns left behind by a previous execution of this cell;
# otherwise every re-run would append another duplicate pair of "lunch_*" columns.
df = pd.concat(
    [df.drop(columns=ohe_encoded_df.columns, errors="ignore"), ohe_encoded_df],
    axis=1,
)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Replace the "test preparation course" labels with integer codes
# (in the preview that follows, "completed" appears as 0 and "none" as 1).
le = LabelEncoder()
le.fit(df["test preparation course"])
df["test preparation course"] = le.transform(df["test preparation course"])

In [9]:
# Preview the first ten rows after encoding.
df.head(n=10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,lunch_free/reduced,lunch_standard
0,male,group A,high school,standard,0,67,67,63,0.0,1.0
1,female,group D,some high school,free/reduced,1,40,59,55,1.0,0.0
2,male,group E,some college,free/reduced,1,59,60,50,1.0,0.0
3,male,group B,high school,standard,1,77,78,68,0.0,1.0
4,male,group E,associate's degree,standard,0,78,73,68,0.0,1.0
5,female,group D,high school,standard,1,63,77,76,0.0,1.0
6,female,group A,bachelor's degree,standard,1,62,59,63,0.0,1.0
7,male,group E,some college,standard,0,93,88,84,0.0,1.0
8,male,group D,high school,standard,1,63,56,65,0.0,1.0
9,male,group C,some college,free/reduced,1,47,42,45,1.0,0.0


In [10]:
# Re-check column dtypes after encoding: the encoded label is int64, the lunch indicators float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   int64  
 5   math score                   1000 non-null   int64  
 6   reading score                1000 non-null   int64  
 7   writing score                1000 non-null   int64  
 8   lunch_free/reduced           1000 non-null   float64
 9   lunch_standard               1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.3+ KB


## Normalization

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the three score columns with a min-max scaler and write them back in place.
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later in the notebook.
score_cols = ["math score", "reading score", "writing score"]
mms = MinMaxScaler()
df[score_cols] = mms.fit_transform(df[score_cols])
df.head(n=10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,lunch_free/reduced,lunch_standard
0,male,group A,high school,standard,0,0.62069,0.547945,0.519481,0.0,1.0
1,female,group D,some high school,free/reduced,1,0.310345,0.438356,0.415584,1.0,0.0
2,male,group E,some college,free/reduced,1,0.528736,0.452055,0.350649,1.0,0.0
3,male,group B,high school,standard,1,0.735632,0.69863,0.584416,0.0,1.0
4,male,group E,associate's degree,standard,0,0.747126,0.630137,0.584416,0.0,1.0
5,female,group D,high school,standard,1,0.574713,0.684932,0.688312,0.0,1.0
6,female,group A,bachelor's degree,standard,1,0.563218,0.438356,0.519481,0.0,1.0
7,male,group E,some college,standard,0,0.91954,0.835616,0.792208,0.0,1.0
8,male,group D,high school,standard,1,0.574713,0.39726,0.545455,0.0,1.0
9,male,group C,some college,free/reduced,1,0.390805,0.205479,0.285714,1.0,0.0


## Splitting the dataset

In [12]:
from sklearn.model_selection import train_test_split

# Feature matrix: the three scaled scores plus a numeric lunch indicator.
# The raw "lunch" column holds strings ("standard" / "free/reduced"), which
# scikit-learn estimators cannot consume, so the one-hot column created in the
# encoding step is used instead. "lunch" has only two categories, so
# "lunch_standard" alone carries all the information ("lunch_free/reduced" is
# its exact complement) and the feature count stays at four.
X = df[["math score", "reading score", "writing score", "lunch_standard"]]
# Target: the integer-encoded "test preparation course" column, kept as a
# single-column DataFrame so y_train / y_test stay two-dimensional.
y = df[["test preparation course"]]
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [13]:
# Print the (rows, columns) shape of each piece of the train/test split.
for label, part in (
    ("X_train shape: ", X_train),
    ("X_test shape: ", X_test),
    ("y_train shape: ", y_train),
    ("y_test shape: ", y_test),
):
    print(label, part.shape)

X_train shape:  (750, 4)
X_test shape:  (250, 4)
y_train shape:  (750, 1)
y_test shape:  (250, 1)
