In [1]:
import pandas as pd

# Location of the student-performance CSV inside the Kaggle input directory.
# NOTE: the space before the final underscore is part of the path as read here.
DATA = '/kaggle/input/students-performance-dataset/Student_performance_data _.csv'

# Load the table and use StudentID as the row index.
df = pd.read_csv(DATA, index_col=['StudentID'])

# Preview the first five rows.
df.head(5)

Unnamed: 0_level_0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


Let's do some EDA.

In [2]:
import warnings

from plotly import express

# plotly express triggers a pandas FutureWarning when it groups by the color
# column; silence that category before the first figure so the warning text
# does not end up in the cell output.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Distribution of GPA, split by grade class, to check how the two relate.
express.histogram(data_frame=df, x='GPA', color='GradeClass')

  sf: grouped.get_group(s if len(s) > 1 else s[0])


That's odd. From the data card we expect the target to be just a GPA bucket, but we see that GPA and the target are only highly correlated, with some 0/A students spread all over the GPA distribution.

All of our data is numerical, so let's use dimension reduction to see how it clusters, and if the clusters correspond in any way to the target variable.

In [3]:
import arrow
from umap import UMAP

# Input columns for the embedding: everything listed here, GPA included.
COLUMNS = [
    'Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly',
    'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
    'Music', 'Volunteering', 'GPA',
]
# Column holding the grade class we want to explain.
TARGET = 'GradeClass'

# Time the whole fit so the cost of this cell is visible in the output.
time_start = arrow.now()

# Seeded, single-job UMAP run with progress logging switched on.
umap = UMAP(n_epochs=500, low_memory=False, n_jobs=1, verbose=True, random_state=2024)

# Store the two embedding coordinates next to the original columns.
embedding = umap.fit_transform(X=df[COLUMNS])
df[['x', 'y']] = embedding

elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-07-20 19:25:34.844626: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 19:25:34.844769: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 19:25:35.014713: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sat Jul 20 19:25:48 2024 Construct fuzzy simplicial set
Sat Jul 20 19:25:53 2024 Finding Nearest Neighbors
Sat Jul 20 19:25:58 2024 Finished Nearest Neighbor Search
Sat Jul 20 19:26:02 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Jul 20 19:26:07 2024 Finished embedding
done with UMAP in 0:00:19.776903


In [4]:
import warnings
from plotly import express

# Keep FutureWarning messages out of the figure output.
warnings.filterwarnings('ignore', category=FutureWarning)

# Scatter of the two embedding coordinates, colored by the target column.
express.scatter(df, x='x', y='y', color=TARGET)

The data is noisy, but we do see that the lower grades fall mostly on the left and the higher grades on the right, with a sort of gradient as we move right. Let's see how a logistic regression classifier does.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split features and target 80/20, stratifying on the target column.
features, labels = df[COLUMNS], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=2024,
    stratify=labels,
)

# Fit the classifier; the tight tolerance is paired with a large iteration cap.
model = LogisticRegression(tol=1e-12, max_iter=10000)
model.fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

# Score the predictions on the held-out rows.
logreg_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print('accuracy: {:5.4f}'.format(logreg_accuracy))

model fit in 1368 iterations
accuracy: 0.7620


Our accuracy isn't great. Let's see what our regression coefficients look like for the first class (0/A).

In [6]:
from plotly import express

# One bar per feature. model.coef_ holds one row of coefficients per class, so
# index 0 shows only the first class in model.classes_; the title says which.
# A bar chart is used instead of a histogram so the y axis is the coefficient
# itself rather than an aggregated "sum of y".
express.bar(
    x=COLUMNS,
    y=model.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients for class {}'.format(model.classes_[0]),
)

Is it a surprise that females get better grades than males?

In [7]:
# Share of rows per Gender code, shown as a plain dict.
gender_share = df['Gender'].value_counts(normalize=True)
gender_share.to_dict()

{1: 0.5108695652173914, 0: 0.4891304347826087}

We have slightly more females than males.

In [8]:
from plotly import express

# Count rows per (Gender, target) pair. The count column is named explicitly
# so that y='count' below does not depend on what the installed pandas version
# calls the result of value_counts().
gender_grade_counts = df[['Gender', TARGET]].value_counts().reset_index(name='count')

# One facet per Gender code, one bar per grade class.
express.bar(
    data_frame=gender_grade_counts,
    x=TARGET,
    y='count',
    facet_col='Gender',
    barmode='group',
)

The target variable is not normally distributed; the grade distribution is heavily skewed, with the highest-numbered class (4) by far the most common. We know from the data card that the target variable is a GPA bucket, and furthermore that low is better (0 is an A).

In [9]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the logistic regression on the held-out rows.
logreg_test_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=logreg_test_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        21
         1.0       0.52      0.59      0.56        54
         2.0       0.58      0.60      0.59        78
         3.0       0.67      0.61      0.64        83
         4.0       0.90      0.97      0.93       243

    accuracy                           0.76       479
   macro avg       0.54      0.56      0.54       479
weighted avg       0.73      0.76      0.74       479



Because 0/As are rare, it is probably not surprising that our logistic regression model doesn't find them.

In [10]:
from sklearn.neural_network import MLPClassifier

# Train a seeded multi-layer perceptron on the same training split.
neural_net = MLPClassifier(alpha=1, max_iter=10000, random_state=2024)
neural_net.fit(X=X_train, y=y_train)

# Same per-class report as for the logistic regression, for direct comparison.
mlp_test_pred = neural_net.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=mlp_test_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.80      0.38      0.52        21
         1.0       0.73      0.69      0.70        54
         2.0       0.69      0.69      0.69        78
         3.0       0.62      0.70      0.66        83
         4.0       0.92      0.93      0.93       243

    accuracy                           0.80       479
   macro avg       0.75      0.68      0.70       479
weighted avg       0.80      0.80      0.80       479



We do somewhat better with a neural net model, but this is still a tough problem.