# Student Performance Predictor

#  Import Libraries

In [185]:
# importing important libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

#  Load the Dataset

In [186]:
# Load the student performance records from the CSV file
# (relative path: the file is looked up in the current working directory)

data = pd.read_csv("StudentsPerformance.csv")

# Show the loaded DataFrame; the notebook renders a truncated preview of it

data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [187]:
data.shape  # (number of rows, number of columns) of the dataset

(1000, 8)

In [188]:
data.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric score columns

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [189]:
data.columns  # names of the columns in the dataset

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

#  Data Exploration

In [190]:
# Plain-text preview of the dataset: first 5 rows, then last 5 rows,
# each on its own line block
print(data.head(), data.tail(), sep="\n")

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  
     gender race/ethnicity parental level of education         lunch  \
995  female        group E             master's degree      standard   
996    male    

In [191]:
# Column names, non-null counts, dtypes and memory usage.
# info() writes the report itself and returns None, so it must not be wrapped
# in print() (that only appends a stray "None" line to the output).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None


#  Data Preprocessing

In [192]:
# Remember the row count before cleaning so the effect of dropna() is visible
rows_before = len(data)

# Handle missing values: drop every row that has at least one missing entry
data = data.dropna()
print("Rows dropped due to missing values:", rows_before - len(data))

# Confirm that no missing values remain after cleaning

missing_values = data.isnull().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [193]:
# Average of the three subject scores for each student
data['average_score'] = (data['math score'] + data['reading score'] + data['writing score']) / 3

# Labelling rules (both boundaries are inclusive):
#   average_score <= 60 -> 'bad'
#   average_score >= 80 -> 'good'
#   otherwise           -> 'average'
# pd.cut with bins=[0, 60, 80, 100] did not implement these rules: its
# right-closed intervals put an average of exactly 80 into 'average', and an
# average of exactly 0 fell outside every bin and became NaN.
avg_score = data['average_score']
data['performance'] = pd.Categorical(
    np.select([avg_score <= 60, avg_score >= 80], ['bad', 'good'], default='average'),
    categories=['bad', 'average', 'good'],
    ordered=True,  # keep the bad < average < good ordering that pd.cut provided
)

# Features and target for the classifier.
# NOTE(review): 'performance' is computed directly from 'average_score', so
# using 'average_score' as a feature leaks the target into the inputs; the
# resulting accuracy says little about real predictive power.
X = data[['math score', 'average_score']]
y = data['performance']

In [194]:
data  # show the frame including the new 'average_score' and 'performance' columns

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average_score,performance
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,average
1,female,group C,some college,standard,completed,69,90,88,82.333333,good
2,female,group B,master's degree,standard,none,90,95,93,92.666667,good
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333,bad
4,male,group C,some college,standard,none,76,78,75,76.333333,average
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,94.000000,good
996,male,group C,high school,free/reduced,none,62,55,55,57.333333,bad
997,female,group C,high school,free/reduced,completed,59,71,65,65.000000,average
998,female,group D,some college,standard,completed,68,78,77,74.333333,average


In [195]:
# Splitting the dataset into training (80%) and testing (20%) sets

from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle deterministic, so the split and the
# accuracy reported below are reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Model Selection

In [196]:
from sklearn.neighbors import KNeighborsClassifier

#  Model Training

In [197]:
# Train scikit-learn's ready-made k-nearest-neighbours classifier

# k is the number of neighbours passed to the classifier as n_neighbors
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
# Fit on the training split; as the last expression, the fitted estimator is displayed
knn.fit(X_train, y_train)

In [184]:
# From-scratch KNN implementation that shows every step of the algorithm

class KNeighboursClassifier:
    """Minimal k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Accepts NumPy arrays as well as pandas DataFrames/Series, and works with
    any label type that can be sorted (integers, strings, ...).
    """

    def __init__(self, k=5):
        """Create a classifier that lets the ``k`` closest training points vote."""
        self.k = k

    # The constructor used to be misspelled ``_init_`` and was therefore never
    # run on instantiation; the old name is kept as an alias for any code that
    # called it explicitly.
    _init_ = __init__

    def fit(self, X_train, y_train):
        """Store the training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y_train : array-like of shape (n_samples,)
            Class label of each training sample.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` do not have the same number of samples.
        """
        # Convert to plain arrays so that indexing below is positional; pandas
        # objects would index by column name / index label instead.
        features = np.asarray(X_train, dtype=float)
        labels = np.asarray(y_train)
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X_train and y_train must contain the same number of samples")
        self.X_train = features
        self.y_train = labels

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features)
            Numeric feature matrix of the samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Predicted labels, with the same dtype as the stored training labels.
            On a tie in the vote the smallest label (in sorted order) wins.
        """
        queries = np.asarray(X_test, dtype=float)
        y_pred = np.empty(queries.shape[0], dtype=self.y_train.dtype)
        for i, query in enumerate(queries):
            # Euclidean distance from this query to every training sample
            distances = np.sqrt(np.sum((self.X_train - query) ** 2, axis=1))
            # Positions of the k closest training samples (stable sort keeps
            # the result deterministic when distances are equal)
            nearest_indices = np.argsort(distances, kind="stable")[:self.k]
            nearest_labels = self.y_train[nearest_indices]
            # Majority vote; np.unique works for non-integer labels, unlike np.bincount
            candidates, votes = np.unique(nearest_labels, return_counts=True)
            y_pred[i] = candidates[np.argmax(votes)]
        return y_pred

#  Model Evaluation

In [198]:
from sklearn.metrics import accuracy_score

# Score the fitted classifier on the held-out test split
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)  # fraction of test samples predicted correctly
print("Accuracy:", accuracy)

print("The KNN classifier achieved an accuracy of", accuracy, "on predicting student performance based on average scores.")
# Format the percentage explicitly so floating-point noise (e.g. 56.99999999999999) is not printed
print(f"MODEL ACCURACY = {accuracy * 100:.2f} %")

Accuracy: 0.975
The KNN classifier acieved an accuracy of 0.975 on predicting student performance based on average scores.
MODEL ACCURACY = 97.5 %
