In [3]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

In [4]:
# This dataset contains information about hundreds of students- their academic performance, study habits, and external factors that affect their final exam scores and if they pass or fail a course.
# Target variable: 'Pass_Fail' (Pass) or (Fail)

In [5]:
#loading the data
df = pd.read_csv('student_performance_dataset.csv')
print(df)

    Student_ID  Gender  Study_Hours_per_Week  Attendance_Rate  \
0         S147    Male                    31        68.267841   
1         S136    Male                    16        78.222927   
2         S209  Female                    21        87.525096   
3         S458  Female                    27        92.076483   
4         S078  Female                    37        98.655517   
..         ...     ...                   ...              ...   
703       S492    Male                    14        84.658761   
704       S301    Male                    35        60.278990   
705       S473    Male                    25        98.384969   
706       S307  Female                    21        96.148012   
707       S046  Female                    22        80.404392   

     Past_Exam_Scores Parental_Education_Level Internet_Access_at_Home  \
0                  86              High School                     Yes   
1                  73                      PhD                      No 

In [6]:
print (df.dtypes)

Student_ID                     object
Gender                         object
Study_Hours_per_Week            int64
Attendance_Rate               float64
Past_Exam_Scores                int64
Parental_Education_Level       object
Internet_Access_at_Home        object
Extracurricular_Activities     object
Final_Exam_Score                int64
Pass_Fail                      object
dtype: object


In [7]:
# Basic data cleaning

In [8]:
#drop unwanted features
df_update= df.drop(columns=['Student_ID', 'Final_Exam_Score']) #final exam score variable implicitly tells the target variable so remove to not cheat the model
print(df_update)

     Gender  Study_Hours_per_Week  Attendance_Rate  Past_Exam_Scores  \
0      Male                    31        68.267841                86   
1      Male                    16        78.222927                73   
2    Female                    21        87.525096                74   
3    Female                    27        92.076483                99   
4    Female                    37        98.655517                63   
..      ...                   ...              ...               ...   
703    Male                    14        84.658761                78   
704    Male                    35        60.278990                83   
705    Male                    25        98.384969                75   
706  Female                    21        96.148012                84   
707  Female                    22        80.404392                93   

    Parental_Education_Level Internet_Access_at_Home  \
0                High School                     Yes   
1                      

In [9]:
#any missing values?
print(df_update.isnull().sum())

Gender                        0
Study_Hours_per_Week          0
Attendance_Rate               0
Past_Exam_Scores              0
Parental_Education_Level      0
Internet_Access_at_Home       0
Extracurricular_Activities    0
Pass_Fail                     0
dtype: int64


In [10]:
#remove any duplicates
df_nodupes = df_update.drop_duplicates()
print(df_nodupes.shape)

(500, 8)


In [11]:
# Explore categorical variables 

In [12]:
categorical = [var for var in df_nodupes.columns if df[var].dtype=='O']
print(categorical)

['Gender', 'Parental_Education_Level', 'Internet_Access_at_Home', 'Extracurricular_Activities', 'Pass_Fail']


In [13]:
#check for inconsistent inputs for categorical variables: Gender, Parental Education Level, Internet Access at Home, Extracurricular Activities, Pass/ Fail
categorical_cols = df_nodupes.select_dtypes(include='object').columns
for col in categorical_cols:
    print(f"Unique values for {col}:")
    print(df_nodupes[col].unique())

Unique values for Gender:
['Male' 'Female']
Unique values for Parental_Education_Level:
['High School' 'PhD' 'Bachelors' 'Masters']
Unique values for Internet_Access_at_Home:
['Yes' 'No']
Unique values for Extracurricular_Activities:
['Yes' 'No']
Unique values for Pass_Fail:
['Pass' 'Fail']


In [14]:
# view frequency counts of values in categorical variables
for var in categorical: 
    print(df[var].value_counts())

Gender
Female    375
Male      333
Name: count, dtype: int64
Parental_Education_Level
Bachelors      189
High School    183
Masters        171
PhD            165
Name: count, dtype: int64
Internet_Access_at_Home
No     381
Yes    327
Name: count, dtype: int64
Extracurricular_Activities
No     361
Yes    347
Name: count, dtype: int64
Pass_Fail
Pass    354
Fail    354
Name: count, dtype: int64


In [15]:
# view frequency distribution of categorical variables
for var in categorical: 
    print(df[var].value_counts()/float(len(df)))

Gender
Female    0.529661
Male      0.470339
Name: count, dtype: float64
Parental_Education_Level
Bachelors      0.266949
High School    0.258475
Masters        0.241525
PhD            0.233051
Name: count, dtype: float64
Internet_Access_at_Home
No     0.538136
Yes    0.461864
Name: count, dtype: float64
Extracurricular_Activities
No     0.509887
Yes    0.490113
Name: count, dtype: float64
Pass_Fail
Pass    0.5
Fail    0.5
Name: count, dtype: float64


In [16]:
# view frequency distribution of categorical variables
for var in categorical: 
    print(df[var].value_counts()/float(len(df)))
    # Changed np.float to built-in float as np.float is deprecated
    # Alternatively, could use np.float64 if a numpy scalar type is specifically needed

Gender
Female    0.529661
Male      0.470339
Name: count, dtype: float64
Parental_Education_Level
Bachelors      0.266949
High School    0.258475
Masters        0.241525
PhD            0.233051
Name: count, dtype: float64
Internet_Access_at_Home
No     0.538136
Yes    0.461864
Name: count, dtype: float64
Extracurricular_Activities
No     0.509887
Yes    0.490113
Name: count, dtype: float64
Pass_Fail
Pass    0.5
Fail    0.5
Name: count, dtype: float64


In [17]:
# Explore Numerical Variables

In [18]:
numerical = [var for var in df_nodupes.columns if df[var].dtype!='O']
print(numerical)

['Study_Hours_per_Week', 'Attendance_Rate', 'Past_Exam_Scores']


In [19]:
#transform categorical variables to numerical
df_numeric = df_nodupes.copy()
df_numeric.replace({'Male': 0, 'Female': 1}, inplace=True)
df_numeric.replace({'High School': 0, 'PhD': 1, 'Bachelors': 2, 'Masters': 3}, inplace = True)
df_numeric['Internet_Access_at_Home'].replace(['Yes', 'No'],[1, 0], inplace = True)
df_numeric['Extracurricular_Activities'].replace(['Yes', 'No'],[1, 0], inplace = True)

  df_numeric.replace({'Male': 0, 'Female': 1}, inplace=True)
  df_numeric.replace({'High School': 0, 'PhD': 1, 'Bachelors': 2, 'Masters': 3}, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numeric['Internet_Access_at_Home'].replace(['Yes', 'No'],[1, 0], inplace = True)
  df_numeric['Internet_Access_at_Home'].replace(['Yes', 'No'],[1, 0], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col

In [20]:
# Working on target variable 

In [21]:
X = df_numeric.drop(['Pass_Fail'], axis=1)
y = df_numeric['Pass_Fail']

In [22]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [23]:
# check the shape of X_train and X_test
X_train.shape, X_test.shape

((350, 7), (150, 7))

In [24]:
# Feature Engineering

In [25]:
# check data types in X_train
X_train.dtypes

Gender                          int64
Study_Hours_per_Week            int64
Attendance_Rate               float64
Past_Exam_Scores                int64
Parental_Education_Level        int64
Internet_Access_at_Home         int64
Extracurricular_Activities      int64
dtype: object

In [26]:
# Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [28]:
# Fitting of Naive Bayes Algorithm to the Training Dataset
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [29]:
# Prediction of the test dataset outcomes
y_pred = classifier.predict(X_test)

In [30]:
import pickle
pickle.dump(classifier, open("model.pkl", "wb"))  

In [31]:
Naive_Baye = pickle.load(open('model.pkl', "rb"))
result = classifier.predict(X_test)
print(result)

['Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail'
 'Fail' 'Fail' 'Fail' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail'
 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail'
 'Fail' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail'
 'Fail' 'Fail' 'Pass' 'Fail' 'Pass' 'Fail' 'Fail' 'Pass' 'Pass' 'Pass'
 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Pass' 'Pass' 'Fail' 'Fail'
 'Pass' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail'
 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Pass'
 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Pass' 'Pass' 'Fail'
 'Fail' 'Fail' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Pass' 'Fail'
 'Fail' 'Fail' 'Fail' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Pass' 'Fail'
 'Pass' 'Fail' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail'
 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail' 'Pass'
 'Fail' 'Pass' 'Pass' 'Fail' 'Pass' 'Fail' 'Fail' 'Fail' 'Fail' 'Fail'
 'Fail

In [32]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

y_pred = classifier.predict(X_test)
accuray = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuray)
print("F1 Score:", f1)

Accuracy: 0.8866666666666667
F1 Score: 0.8924595176312031


In [36]:
#logistic regression model
from imblearn.over_sampling import SMOTE  
from sklearn.model_selection import train_test_split  

X = df_numeric.drop('Pass_Fail', axis=1)
y = df_numeric['Pass_Fail']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

#balancing the data
os = SMOTE(random_state=0)
oversampled_x, oversampled_y = os.fit_resample(x_train, y_train)

print(x_train.shape)
print(oversampled_x.shape)

print(oversampled_y.value_counts())

(400, 7)
(566, 7)
Pass_Fail
Fail    283
Pass    283
Name: count, dtype: int64


In [37]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=2000)
model.fit(oversampled_x, oversampled_y.values.ravel())

In [38]:
# Accuracy score
test_pred = model.predict(x_test)
print("Accuracy Score:", accuracy_score(y_test, test_pred))

Accuracy Score: 0.88
