In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Read the student performance CSV into a DataFrame and preview its first
# five rows to confirm the columns were parsed as expected.
df = pd.read_csv('dataset/Student_Performance.csv')
print(df.head(n=5))

   Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0              7               99                        Yes            9   
1              4               82                         No            4   
2              8               51                        Yes            7   
3              5               52                        Yes            5   
4              7               75                         No            8   

   Sample Question Papers Practiced  Performance Index  
0                                 1               91.0  
1                                 2               65.0  
2                                 2               45.0  
3                                 2               36.0  
4                                 5               66.0  


In [2]:
# Replace the string values of 'Extracurricular Activities' with integer codes
# so the column can be used as a numeric feature.
label = LabelEncoder()
df['Extracurricular Activities'] = label.fit_transform(df['Extracurricular Activities'])

# Build the binary 'Final Outcome' column from 'Previous Scores':
# 1 (Pass) when the score is 50 or above, 0 (Fail) when it is below 50.
df['Final Outcome'] = (df['Previous Scores'] >= 50).astype(int)

# Show the first 20 rows of the source score next to the derived label.
print(df[['Previous Scores', 'Final Outcome']].head(20))


    Previous Scores  Final Outcome
0                99              1
1                82              1
2                51              1
3                52              1
4                75              1
5                78              1
6                73              1
7                45              0
8                77              1
9                89              1
10               91              1
11               79              1
12               47              0
13               47              0
14               79              1
15               72              1
16               73              1
17               83              1
18               54              1
19               75              1


In [3]:
# Feature matrix: the five input columns the classifier is trained on.
X = df[
    [
        'Hours Studied',
        'Previous Scores',
        'Extracurricular Activities',
        'Sleep Hours',
        'Sample Question Papers Practiced',
    ]
]

# Target vector: the binary pass/fail label.
# NOTE(review): 'Final Outcome' is computed directly from 'Previous Scores',
# which is also one of the features selected above, so the label is fully
# recoverable from that single column — confirm this is intended.
y = df['Final Outcome']

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a logistic regression classifier on the standardized training data.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Keep the fitted coefficients together with the column names they belong to.
feature_names = X.columns
feature_importance = model.coef_[0]

# model evaluation
# Hard class predictions feed the threshold-based metrics below.
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the estimated probability of class 1.
y_pred_proba = model.predict_proba(X_test)[:,1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is a ranking metric: it must be computed from the continuous
# positive-class scores, not from the 0/1 predictions, which would collapse
# the ROC curve to a single operating point.
roc_auc = roc_auc_score(y_test, y_pred_proba)


In [5]:
# Risk Score and Classification
def categorize_risk(probability, low_risk_threshold=0.999909, moderate_risk_threshold=0.7):
    """Map a probability to a risk category label.

    Args:
        probability: Numeric score to categorize.
        low_risk_threshold: Scores greater than or equal to this value are
            labelled 'Low Risk'. Defaults to 0.999909.
        moderate_risk_threshold: Scores strictly greater than this value (and
            below ``low_risk_threshold``) are labelled 'Moderate Risk'.
            Defaults to 0.7.

    Returns:
        'Low Risk', 'Moderate Risk' or 'High Risk'. Any value that satisfies
        neither comparison (including NaN) falls through to 'High Risk'.
    """
    # The low-risk bound is inclusive while the moderate bound is exclusive.
    if probability >= low_risk_threshold:
        return 'Low Risk'
    if probability > moderate_risk_threshold:
        return 'Moderate Risk'
    return 'High Risk'

# Score every row of the full feature frame (scaled with the already-fitted
# scaler); the score is the model's estimated probability of class 1.
df['Risk Score'] = model.predict_proba(scaler.transform(X))[:, 1]

# Translate each score into its risk label.
df['Risk Category'] = df['Risk Score'].apply(categorize_risk)

# Preview the first 20 rows of scores next to the columns they relate to.
print(
    df[
        [
            'Previous Scores',
            'Final Outcome',
            'Risk Score',
            'Risk Category',
        ]
    ].head(20)
)

    Previous Scores  Final Outcome  Risk Score  Risk Category
0                99              1    1.000000       Low Risk
1                82              1    1.000000       Low Risk
2                51              1    0.760393  Moderate Risk
3                52              1    0.893323  Moderate Risk
4                75              1    1.000000       Low Risk
5                78              1    1.000000       Low Risk
6                73              1    1.000000       Low Risk
7                45              0    0.016035      High Risk
8                77              1    1.000000       Low Risk
9                89              1    1.000000       Low Risk
10               91              1    1.000000       Low Risk
11               79              1    1.000000       Low Risk
12               47              0    0.076054      High Risk
13               47              0    0.083358      High Risk
14               79              1    1.000000       Low Risk
15      

In [7]:
# Function to calculate the average end-of-semester score required to reach a target CWA
def calc_required_score(cum_weighted_marks, target_cwa, total_credit_hours_obtained, current_semester_credit_hours):
    """Return the average score needed this semester to reach ``target_cwa``.

    Args:
        cum_weighted_marks: Cumulative weighted marks accumulated so far.
        target_cwa: Target weighted average after the current semester.
        total_credit_hours_obtained: Credit hours already completed.
        current_semester_credit_hours: Credit hours taken this semester.

    Returns:
        The weighted marks still needed to hit the target, divided by the
        current semester's credit hours.

    Raises:
        ZeroDivisionError: If ``current_semester_credit_hours`` is 0.
    """
    # Credit hours that will count once the current semester is included.
    hours_after_semester = total_credit_hours_obtained + current_semester_credit_hours
    # Weighted marks the current semester has to contribute to hit the target.
    marks_still_needed = (target_cwa * hours_after_semester) - cum_weighted_marks
    return marks_still_needed / current_semester_credit_hours
   

# Work out the average score needed this semester for the given inputs and
# print the result.
required_score = calc_required_score(
    cum_weighted_marks=7676,
    target_cwa=69.5,
    total_credit_hours_obtained=112,
    current_semester_credit_hours=18,
)
print(required_score)


75.5
