## Loading the data and importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Read the student-performance CSV; the file is semicolon-delimited.
DATA_PATH = "student-mat.csv"
df = pd.read_csv(DATA_PATH, sep=';')
df


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


## Dataset Description
This project uses the Student Performance dataset from the UCI Machine Learning Repository.  
It contains academic, family, and social information of students.

## Target Variable
The target variable is `Result`, created from the final grade (`G3`).
- Pass: `G3 ≥ 10`
- Fail: `G3 < 10`

This makes the problem a binary classification task for Naive Bayes.


In [3]:
# Column dtypes and non-null counts; the object-typed columns are the ones encoded before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [11]:
# (rows, columns) of the frame.
df.shape

(395, 33)

In [12]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [13]:
# Missing-value count per column.
df.isna().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

- **Creating the target variable.**

In [5]:
def _grade_to_result(grade):
    """Map a final grade to the binary label: 'Pass' for 10 or more, else 'Fail'."""
    if grade >= 10:
        return 'Pass'
    return 'Fail'


# Derive the classification target from the final grade column G3.
df['Result'] = df['G3'].apply(_grade_to_result)

In [6]:
# Confirm the new Result column sits alongside G3 and matches the >= 10 rule.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Result
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,Fail
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,Fail
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,Pass
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,Pass
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,Pass


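- **Removing the grade columns (`G1`, `G2`, `G3`) from the data, since `Result` is derived directly from `G3` and keeping it would leak the target into the features**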

In [7]:
# Result was derived from G3, so G3 must not remain as a feature; G1 and G2 are
# dropped with it (presumably the earlier-period grades — confirm against the
# dataset description).
# Re-assigning instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['G1', 'G2', 'G3'], errors='ignore')

In [8]:
# Confirm G1, G2 and G3 are gone and Result is still present.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,Result
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,Fail
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,Fail
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,Pass
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,Pass
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,Pass


## Data preprocessing

In [9]:
# Separate the label column from the predictors.
target_col = 'Result'
y = df[target_col]
x = df.drop(columns=[target_col])

In [10]:
# Preview of the feature matrix (every column except Result).
x.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,4
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,2
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,4


In [11]:
# Preview of the target labels ('Pass' / 'Fail').
y.head()

0    Fail
1    Fail
2    Pass
3    Pass
4    Pass
Name: Result, dtype: object

## Train-test split

In [12]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
# Preview of the training features; the row labels are the original row indices.
X_train.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
16,GP,F,16,U,GT3,T,4,4,services,services,...,yes,yes,no,3,2,3,1,2,2,6
66,GP,M,15,U,GT3,A,4,4,other,services,...,yes,yes,yes,1,3,3,5,5,3,4
211,GP,M,17,U,LE3,T,4,4,services,other,...,yes,yes,yes,5,3,5,4,5,3,13
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,yes,no,no,4,1,4,1,1,1,6
19,GP,M,16,U,LE3,T,4,3,health,other,...,yes,yes,no,3,1,3,1,3,5,4


In [14]:
# Preview of the held-out test features.
X_test.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
78,GP,M,17,U,GT3,T,2,1,other,other,...,no,yes,no,4,5,1,1,1,3,2
371,MS,M,18,R,LE3,T,1,2,at_home,services,...,no,yes,yes,4,3,3,2,3,3,3
248,GP,M,18,R,LE3,T,3,3,other,services,...,yes,yes,yes,4,3,3,1,3,5,8
55,GP,F,16,U,GT3,A,2,1,other,other,...,yes,yes,yes,5,3,4,1,1,2,8
390,MS,M,20,U,LE3,A,2,2,services,services,...,yes,no,no,5,5,4,4,5,4,11


## Handling categorical data (label encoding)

In [16]:
# Work on explicit copies so the column assignments below never write into a
# slice of `x` (avoids pandas' SettingWithCopyWarning on the split frames).
X_train = X_train.copy()
X_test = X_test.copy()

# Treat every non-numeric column as categorical. Selecting by "not a number"
# rather than `dtype == 'object'` also covers pandas versions that store text
# in a dedicated string dtype, where the object check would silently skip them.
categorical_cols = X_train.select_dtypes(exclude='number').columns

# One encoder per column, fitted on the training split only so the mapping is
# learned without looking at the test split. The fitted encoders are kept so a
# column's integer codes can be mapped back to the original labels later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # transform() raises ValueError if the test split holds a category that
    # never appeared in the training split.
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

## Building a model

- GaussianNB()

In [17]:
# Fit a Gaussian Naive Bayes classifier on the encoded training split.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

- BernoulliNB()

In [22]:
# BernoulliNB is imported with the other dependencies in the notebook's first cell.
# NOTE(review): with its default binarize=0.0, BernoulliNB thresholds every feature
# at zero, so columns that are always positive (e.g. age, whose minimum is 15)
# collapse to a constant 1 and carry no signal — confirm this is intended.
model2 = BernoulliNB()
model2.fit(X_train, y_train)

## Evaluation metrics

- GaussianNB model

In [21]:
# Score the GaussianNB model on the held-out split.
y_pred = model.predict(X_test)

gnb_accuracy = accuracy_score(y_test, y_pred)
# Rows are the true classes and columns the predicted classes, ordered Fail, Pass.
gnb_confusion = confusion_matrix(y_test, y_pred)
gnb_report = classification_report(y_test, y_pred)

print("Accuracy:", gnb_accuracy)
print("Confusion Matrix:\n\n", gnb_confusion)
print("\nclassification report :\n\n", gnb_report)

Accuracy: 0.696969696969697
Confusion Matrix:

 [[11 25]
 [ 5 58]]

classification report :

               precision    recall  f1-score   support

        Fail       0.69      0.31      0.42        36
        Pass       0.70      0.92      0.79        63

    accuracy                           0.70        99
   macro avg       0.69      0.61      0.61        99
weighted avg       0.69      0.70      0.66        99



- BernoulliNB model

In [23]:
# Score the BernoulliNB model on the same held-out split.
# Note: y_pred is rebound here and no longer holds the GaussianNB predictions.
y_pred = model2.predict(X_test)

bnb_accuracy = accuracy_score(y_test, y_pred)
# Rows are the true classes and columns the predicted classes, ordered Fail, Pass.
bnb_confusion = confusion_matrix(y_test, y_pred)
bnb_report = classification_report(y_test, y_pred)

print("Accuracy:", bnb_accuracy)
print("Confusion Matrix:\n\n", bnb_confusion)
print("\nclassification report :\n\n", bnb_report)

Accuracy: 0.6868686868686869
Confusion Matrix:

 [[11 25]
 [ 6 57]]

classification report :

               precision    recall  f1-score   support

        Fail       0.65      0.31      0.42        36
        Pass       0.70      0.90      0.79        63

    accuracy                           0.69        99
   macro avg       0.67      0.61      0.60        99
weighted avg       0.68      0.69      0.65        99



#### Evaluation Metrics Report
- Both GaussianNB and BernoulliNB models were evaluated on the student performance dataset.  
- GaussianNB achieved slightly higher accuracy and better recall for the **Pass** class compared to BernoulliNB.

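- Overall, **GaussianNB performed marginally better** on this test split (69.7% vs 68.7% accuracy, a difference of one student out of 99) and is the more natural choice given the presence of numerical features. Both models, however, recall only 31% of the **Fail** class (11 of 36 students).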

##  Conclusion

- In this project, we successfully built and evaluated Naive Bayes classification models to predict student performance.  
- The model was able to classify students as **Pass** or **Fail** using academic and personal attributes.  

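- This project demonstrates how Naive Bayes can be used for **early identification of at-risk students**, helping in better academic planning and decision-making. With recall on the **Fail** class at only 0.31, the current models would still miss most at-risk students, so improving that recall is needed before practical use.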