In [99]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns 

import statsmodels

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


In [100]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
  
# data (as pandas dataframes) 
X = predict_students_dropout_and_academic_success.data.features 
y = predict_students_dropout_and_academic_success.data.targets 

df1_raw = X.assign(Target=y.get('Target'))

# making a copy to use 
df1 = df1_raw.copy(deep=True)

df1_raw.head()

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


# Training Model based on Previous Observations and Tests

## Data Cleaning and Preprocessing

Since we are predicting if the student is a drop-out or not, we will only need to divide them into two groups: dropout, and enrolled/graduate. Thus, 'Graduate' and 'Enrolled' in the original dataset are combined into one group 'Graduate/Enrolled'. In the resulting data frame, Target is a binary variable, which 0 indicates that the student is not a drop-out, and 1 indicates that the student is a drop-out.

In [101]:
# Combining 'Graduate' and 'Enrolled' into one 
def get_target(y):
    if (y == 'Graduate') or (y == 'Enrolled'):
        return 'Graduate/Enrolled'
    else:
        return 'Dropout'
        
df1['Target'] = df1.get('Target').apply(get_target)

df1['Target'] = df1['Target'].apply(lambda x: 1 if x == 'Dropout' else 0)

In [102]:
df1.head()

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,1
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,0
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,1
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,0
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,0


### Determine Features, and process the Data Frame
Based on our Exploratory Data Analysis and Hypothesis Testing, we found out that comparing to students who are not dropping out, drop-out students have noticeable larger proportion in the following categories of 1) married or divorced, 2) not compeleting high school, 3) not displaced, 4)being debtors. Moreover, these students have obvious lower proportion in having a scholarship. Thus, among all the categorical columns in the dataset, the following 5 columns will be taking into account as our features for the **Baseline model**: Marital Status, Displaced, Previous Qualification, Debtor, and Scholarship. 

In [123]:
#for our basic model, we wll only have the 5 categorical features, and the target.
df_base = df1[['Marital Status','Displaced','Previous qualification',
               'Debtor','Scholarship holder','Target','Nacionality']]
df_base.head()

Unnamed: 0,Marital Status,Displaced,Previous qualification,Debtor,Scholarship holder,Target,Nacionality
0,1,1,1,0,0,1,1
1,1,1,1,0,0,0,1
2,1,1,1,0,0,1,1
3,1,1,1,0,0,0,1
4,2,0,1,0,0,0,1


### Split the dataset into Features and Target Variable

The dataset above is now prepared to conduct further training and is splitted into features and target variables based on the study from earlier sections. 20% of the data in the dataset is selected to be the test data, an the rest 80% will be the training data. 


In [104]:
# Split dataset into features and target variable
X = df_base.drop('Target', axis=1)
y = df_base['Target']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Building Random Forest Classifier for Baseline Model
The final target classification is binary, with the choices of being drop-out or not. The classifer being used to predict the given model is **Random Forest Classifier**. In the following section, the classfier will be initialized and perform training process on the training data. 

In [105]:
# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {baseline_accuracy}')

Accuracy: 0.6903954802259887


The accuracy score for the baseline model is 0.69, meaning the model is able to predict 69% of test data. To improve the accuracy, the final model will have some add-on features, which are the 12 variables relating to students' unit points and grade points from 1st and 2nd sesmester. 

## Building Random Forest Classifer for Final Model 

The final model now has 18 features in total, with 12 being newly added. The 12 new features include the number of circular units credited, enrolled, evaluated, not evaluated and approved from first and second semester ; and also the grade average of first and second semester.

In [122]:
#new dataset needed for building the final model including new features
df_final = df1[[
          'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)','Target','Marital Status','Displaced','Previous qualification',
               'Debtor','Scholarship holder','Target']]
df_final.head()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Target,Marital Status,Displaced,Previous qualification,Debtor,Scholarship holder,Target.1
0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,1,1,1,1,0,0,1
1,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,0,1,1,1,0,0,0
2,0,6,0,0,0.0,0,0,6,0,0,0.0,0,1,1,1,1,0,0,1
3,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,0,1,1,1,0,0,0
4,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,0,2,0,1,0,0,0


In [120]:
X = df_final.drop('Target', axis=1)
y = df_final['Target']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [121]:
# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
final_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {final_accuracy}')

# Now, you can use model.predict() on new data to predict dropout status

Accuracy: 0.8406779661016949


The accuracy score for the model is now improved to 0.84, meaning the final model is now able predict 84% of test data correctly. There is a around 15% increase in accuracy after adding the 12 features relating to the units and average grade. 