In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns 

import statsmodels

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


In [2]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 697 in the UCI ML repository.
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# The feature and target tables (pandas dataframes) both live under `.data`.
_uci_data = predict_students_dropout_and_academic_success.data
X = _uci_data.features
y = _uci_data.targets

# Attach the target column to the features to form the untouched reference frame.
target_column = y.get('Target')
df1_raw = X.assign(Target=target_column)

# Work on an independent deep copy so df1_raw keeps the original labels.
df1 = df1_raw.copy(deep=True)

df1_raw.head()

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


# Training Model based on Previous Observations and Tests

## Data Cleaning and Preprocessing

Since we are predicting whether a student drops out, we only need to divide the students into two groups: drop-out and enrolled/graduate. Thus, 'Graduate' and 'Enrolled' in the original dataset are combined into a single group, 'Graduate/Enrolled'. In the resulting data frame, Target is a binary variable in which 0 indicates that the student is a drop-out and 1 indicates that the student is not a drop-out.

In [3]:
# Collapse 'Graduate' and 'Enrolled' into a single non-dropout label
def get_target(y):
    """Map an original Target label onto the two-group label.

    Returns 'Graduate/Enrolled' when ``y`` equals 'Graduate' or 'Enrolled';
    every other value is returned as 'Dropout'.
    """
    non_dropout_labels = ('Graduate', 'Enrolled')
    return 'Graduate/Enrolled' if y in non_dropout_labels else 'Dropout'
        
# Build the binary target from the untouched raw labels in df1_raw instead of
# rewriting df1['Target'] from itself. Mapping the column in place is not
# idempotent: on a second run the already-encoded 0/1 values would pass through
# get_target, all become 'Dropout', and every row would end up as 0.
# Encoding: 0 = dropout, 1 = graduate/enrolled.
df1['Target'] = (
    df1_raw['Target']
    .apply(get_target)
    .map({'Dropout': 0, 'Graduate/Enrolled': 1})
)

In [4]:
# Preview the first five rows of the working frame.
df1.head(n=5)

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,1
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,1


### Determine Features, and process the Data Frame
Based on our Exploratory Data Analysis and Hypothesis Testing, we found that, compared to students who do not drop out, drop-out students have a noticeably larger proportion in the following categories: 1) married or divorced, 2) not completing high school, 3) not displaced, and 4) being debtors. Moreover, these students have an obviously lower proportion of scholarship holders. Thus, among all the categorical columns in the dataset, the following 5 columns will be taken into account as the features for the **Baseline model**: Marital Status, Displaced, Previous qualification, Debtor, and Scholarship holder.

In [5]:
# Baseline model inputs: the five categorical features plus the target.
baseline_columns = [
    'Marital Status',
    'Displaced',
    'Previous qualification',
    'Debtor',
    'Scholarship holder',
    'Target',
]
df_base = df1[baseline_columns]
df_base.head()

Unnamed: 0,Marital Status,Displaced,Previous qualification,Debtor,Scholarship holder,Target
0,1,1,1,0,0,0
1,1,1,1,0,0,1
2,1,1,1,0,0,0
3,1,1,1,0,0,1
4,2,0,1,0,0,1


### Split the dataset into Features and Target Variable

The dataset above is now ready for training and is split into features and a target variable based on the study from earlier sections. To measure how well the model generalizes to unseen data (and to detect overfitting), 20% of the data is held out as the test set, and the remaining 80% is used as the training set.


In [7]:
# Separate the label from the predictors.
y_base = df_base['Target']
X_base = df_base.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(
    X_base,
    y_base,
    test_size=0.2,
    random_state=42,
)

# Standardize with statistics learned from the training rows only,
# then apply that same transform to both partitions.
scaler_base = StandardScaler()
scaler_base.fit(X_train_base)
X_train_base = scaler_base.transform(X_train_base)
X_test_base = scaler_base.transform(X_test_base)


## Building Random Forest Classifier for Baseline Model
The final target classification is binary: a student is either a drop-out or not. The classifier used to build this model is the **Random Forest Classifier**. In the following section, the classifier is initialized and trained on the training data.

In [8]:
# Initialize the model (fixed seed so the fitted forest is reproducible)
model_base = RandomForestClassifier(random_state=42)

# Train the model
model_base.fit(X_train_base, y_train_base)

# Predict on the test data
y_pred_base = model_base.predict(X_test_base)

# Evaluate the model
baseline_accuracy = accuracy_score(y_test_base, y_pred_base)
print(f'Accuracy: {baseline_accuracy}')

# Confusion matrix: rows are the true classes, columns are the predicted classes.
# labels=[0, 1] pins the order to (0 = dropout, 1 = non-dropout) and guarantees
# a 2x2 result even if one class is missing from the test split or predictions.
confusion_matrix_base = confusion_matrix(y_test_base, y_pred_base, labels=[0, 1])

confusion_matrix_base


Accuracy: 0.6926553672316385


array([[ 94, 222],
       [ 50, 519]])

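The accuracy score for the baseline model is 0.69, meaning the model correctly classifies 69% of the test data. In the confusion matrix, rows are the true classes and columns are the predicted classes (0 = drop-out, 1 = non drop-out). Treating drop-out as the positive class, 94 observations are true positives and 519 are true negatives, while 222 actual drop-outs are misclassified as non drop-outs and 50 non drop-outs are misclassified as drop-outs. The model therefore misses most of the actual drop-outs: a relatively high Type II (false negative) error rate (equivalently, a high Type I error rate if non drop-out is taken as the positive class). We will continue refining our model in the next section.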

## Adding Second Semester Grades into the Model

Calculating the mean second semester grade for both drop-out and non drop-out students, we found that the means are quite different. The visualization below shows that the mean second semester grade of drop-out students is much lower than that of non drop-out students. This suggests that the second semester grade might have a great effect on the accuracy of the prediction model. In the following updated model, the second semester grade is added as a feature, so the model now has 6 features.

In [None]:
# Mean second-semester grade per outcome group (Target: 0 = dropout, 1 = non-dropout).
grade_column = 'Curricular units 2nd sem (grade)'
is_dropout = df1['Target'] == 0
is_non_dropout = df1['Target'] == 1
gr_dropout = df1.loc[is_dropout, grade_column].mean()
gr_non = df1.loc[is_non_dropout, grade_column].mean()
print(gr_dropout, gr_non)
# Recorded result: about 5.9 out of 20 for dropouts vs. 12.3 for non-dropouts, so dropout students have much lower grades.

In [None]:
grades = [gr_dropout, gr_non]
labels = ['Dropout', 'Non-dropout']
title = 'Second Semester Grade - Dropout vs. Non-Dropout'
y_max = 20  # upper limit of the y-axis

# Draw the two group means as bars on a single explicit Axes.
fig, ax = plt.subplots()
ax.bar(labels, grades, color=['red', 'green'])
ax.set_ylim(0, y_max)

# Title and axis labels so the figure stands on its own.
ax.set_title(title)
ax.set_xlabel('Status')
ax.set_ylabel('Grade')

plt.show()

In [None]:
#adding new features into the model.
df_updated = df1[[
    'Curricular units 2nd sem (grade)',
    'Marital Status','Displaced',
    'Previous qualification',
    'Debtor',
    'Scholarship holder',
    'Target']]

In [None]:
# Split the updated frame into predictors and label
X_updated = df_updated.drop('Target', axis=1)
y_updated = df_updated['Target']

# Split data into training and test sets (80/20, fixed seed for reproducibility)
X_train_updated, X_test_updated, y_train_updated, y_test_updated = train_test_split(X_updated, y_updated, test_size=0.2, random_state=42)

# Feature scaling: fit on the training rows only, then apply to both partitions
scaler = StandardScaler()
X_train_updated = scaler.fit_transform(X_train_updated)
X_test_updated = scaler.transform(X_test_updated)

model_updated = RandomForestClassifier(random_state=42)

# Train the model
model_updated.fit(X_train_updated, y_train_updated)

# Predict on the test data
y_pred_updated = model_updated.predict(X_test_updated)

# Evaluate the model. The score is stored under its own name so it stays
# available for comparison after the final-model cell reassigns `final_accuracy`.
updated_accuracy = accuracy_score(y_test_updated, y_pred_updated)
final_accuracy = updated_accuracy  # legacy name, kept for backward compatibility
print(f'Accuracy: {updated_accuracy}')

# Rows are true classes, columns are predicted classes; labels=[0, 1] pins the
# order to (0 = dropout, 1 = non-dropout) and guarantees a 2x2 matrix.
confusion_matrix_updated = confusion_matrix(y_test_updated, y_pred_updated, labels=[0, 1])

confusion_matrix_updated

After adding the second semester grade as a feature, the number of true positive observations increases significantly, and the accuracy score increases by around 9 percentage points compared to the baseline model, to 78%. This improvement shows that the second semester grade has predictive power for determining whether a student is a drop-out.

# Final Model

In the final model, in addition to the 6 features in the updated model, two more features, "Course" and "Tuition fees up to date", are added.

In the left visualization below, it is clear that most of the enrolled/graduate students have their tuition fees up to date. However, the second graph shows that 32% of drop-out students do not have their tuition fees up to date.

In [None]:
categories = ['Yes', 'No']


def _tuition_pct(target_value):
    """Return [% with fees up to date, % without] for one Target group.

    value_counts() orders its result by frequency, so the order is pinned with
    reindex([1, 0]) to match `categories`; otherwise the bars would be
    mislabeled whenever "No" is the more common value in a group.
    Assumes the column is coded 1 = yes, 0 = no (TODO: confirm against the
    dataset's variable table).
    """
    shares = (
        df1.loc[df1['Target'] == target_value, 'Tuition fees up to date']
        .value_counts(normalize=True)
        .reindex([1, 0], fill_value=0)
    )
    return shares.to_numpy() * 100


# Data for plotting - first graph (non-dropout, Target == 1)
pct_non_dropout = _tuition_pct(1)

# Data for plotting - second graph (dropout, Target == 0)
pct_dropout = _tuition_pct(0)

# Legacy names kept with their original contents for backward compatibility;
# note that the numeric suffix does NOT match the Target value each one holds.
values_target_0 = pct_non_dropout
values_target_1 = pct_dropout

# Creating the subplots
fig, axes = plt.subplots(1, 2, figsize=(10, 5))  # 1 row, 2 columns

# Each panel's title now matches the group it actually plots (the original
# titles were swapped relative to the data).
panels = [
    (axes[0], pct_non_dropout, 'Tuition Fee Up to Date For Non-Dropout (Target 1)'),
    (axes[1], pct_dropout, 'Tuition Fee Up to Date For Dropout (Target 0)'),
]
for ax, percentages, panel_title in panels:
    ax.bar(categories, percentages, color=['lightgreen', 'lightcoral'])
    ax.set_title(panel_title)
    ax.set_xlabel('Category')
    ax.set_ylabel('Percentage')
    ax.set_ylim(0, 100)

# Display the bar plots
plt.tight_layout()
plt.show()

Moreover, connecting back to the hypothesis testing from earlier, there is sufficient evidence that a larger proportion of drop-out students were enrolled in Equinculture, Informatics Engineering, and Management (evening attendance). To give a better sense of the proportions, the visualization below shows that 50.7% of the students in the Management (evening attendance) course eventually dropped out.

In [None]:
# Outcome counts for course code 9991, in a fixed order (0 = dropout, 1 = non-dropout).
# value_counts() orders by frequency, so without reindex the positional labels
# below would attach to the wrong slices whenever non-dropouts outnumber dropouts.
course_outcomes = (
    df1.loc[df1['Course'] == 9991, 'Target']
    .value_counts()
    .reindex([0, 1], fill_value=0)
)

# Data to plot
labels = 'Dropout', 'Non-Dropout'
sizes = course_outcomes.values  # raw counts; autopct renders them as percentages
colors = ['lightcoral', 'lightgreen']  # red = dropout, green = non-dropout, as in the grade bar chart
explode = (0.1, 0)  # pull out the first slice (Dropout)

# Plotting the pie chart
plt.figure(figsize=(8, 6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Distribution of Drop-Out for Management Major')
plt.show()

Based on these analyses, we believe "Course" and "Tuition fees up to date" may improve the prediction accuracy of the model.

In [None]:
# Final feature set: course, tuition status and second-semester grade,
# plus the five baseline categorical features and the target.
final_columns = [
    'Course',
    'Tuition fees up to date',
    'Curricular units 2nd sem (grade)',
    'Marital Status',
    'Displaced',
    'Previous qualification',
    'Debtor',
    'Scholarship holder',
    'Target',
]
df_final = df1[final_columns]

In [None]:
# Separate the label from the predictors.
y_final = df_final['Target']
X_final = df_final.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_gr, X_test_gr, y_train_gr, y_test_gr = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

# Standardize with statistics learned from the training rows only,
# then apply that same transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train_gr)
X_train_gr = scaler.transform(X_train_gr)
X_test_gr = scaler.transform(X_test_gr)

In [None]:
# Build the forest with a fixed seed and train it on the scaled training rows
# (fit returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train_gr, y_train_gr)

# Class predictions for the held-out rows.
y_pred_gr = model.predict(X_test_gr)

# Share of held-out rows classified correctly.
final_accuracy = accuracy_score(y_true=y_test_gr, y_pred=y_pred_gr)
print(f'Accuracy: {final_accuracy}')

This brings our model accuracy to 80%, which is relatively high. In this section, through enhancing the prediction model, we found that "Course", "Tuition fees up to date", "Curricular units 2nd sem (grade)", "Marital Status", "Displaced", "Previous qualification", "Debtor", and "Scholarship holder" have predictive power in determining whether a student is a drop-out.