In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [5]:
# 1 - Pick a problem that interests you and find a dataset

# Load the dataset; the relative path is resolved against the current working directory
csv_path = 'higher-education-predictors-of-student-retention.csv'
df = pd.read_csv(csv_path)

# (rows, columns) of the raw table
df.shape

(4424, 35)

In [6]:
# Lift the column-display cap so every column is rendered, then preview the first rows
pd.options.display.max_columns = None
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,10,1,0,0,1,1,0,20,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,4,1,0,0,0,1,0,19,0,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,10,1,0,0,0,1,0,19,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,4,1,0,0,1,0,0,20,0,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,10,0,0,0,1,0,0,45,0,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [7]:
#2 - Describe the problem and how ML can help
#Education is one of the pillars of society and is therefore one of the most discussed topics in many nations worldwide.
#Students are at the center of it, as they are the product of such a system, and the main goal is to help as many of them as possible learn and be prepared for the future.
#However, this is a very difficult task.
#As students come from different backgrounds, it is more important than ever to consider which factors impact the likelihood that a student graduates or drops out.
#A model like the one presented in this project will help predict this outcome based on socio-economic, demographic and academic factors that can be applied across different countries.

In [8]:
#3 - Prepare the data and run EDA

In [9]:
# 3.1 - Normalize the column labels to lower case
df.columns = df.columns.map(str.lower)

# Preview the frame with the renamed columns
df.head()

Unnamed: 0,marital status,application mode,application order,course,daytime/evening attendance,previous qualification,nacionality,mother's qualification,father's qualification,mother's occupation,father's occupation,displaced,educational special needs,debtor,tuition fees up to date,gender,scholarship holder,age at enrollment,international,curricular units 1st sem (credited),curricular units 1st sem (enrolled),curricular units 1st sem (evaluations),curricular units 1st sem (approved),curricular units 1st sem (grade),curricular units 1st sem (without evaluations),curricular units 2nd sem (credited),curricular units 2nd sem (enrolled),curricular units 2nd sem (evaluations),curricular units 2nd sem (approved),curricular units 2nd sem (grade),curricular units 2nd sem (without evaluations),unemployment rate,inflation rate,gdp,target
0,1,8,5,2,1,1,1,13,10,6,10,1,0,0,1,1,0,20,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,4,1,0,0,0,1,0,19,0,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,10,1,0,0,0,1,0,19,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,4,1,0,0,1,0,0,20,0,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,10,0,0,0,1,0,0,45,0,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [10]:
# 3.2 - Remove unwanted records from our dataset
# The model's goal is to determine whether a student ends up as "Graduate" or "Dropout".
# However, the "target" column also contains the value "Enrolled", as shown below.
pd.unique(df['target'])

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [12]:
# Number of records in each target category:
target_counts = df['target'].value_counts()
target_counts

target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

In [14]:
# To focus on Graduate vs. Dropout, remove the "Enrolled" records.
# .copy() makes the filtered result an independent frame, so any later column
# assignment on df does not trigger SettingWithCopyWarning for writing to a
# slice of the original frame.
df = df[df['target'] != 'Enrolled'].copy()

In [15]:
# Validate that the "Enrolled" records were removed: fail loudly if any remain
# (instead of relying on visual inspection), then display the remaining counts.
assert not df['target'].eq('Enrolled').any(), 'Enrolled records are still present'
df['target'].value_counts()

target
Graduate    2209
Dropout     1421
Name: count, dtype: int64

In [17]:
# 3.3 - Identify columns with null values
nan_counts = df.isna().sum()
print("Number of NaNs per column:", nan_counts, sep="\n")

Number of NaNs per column:
marital status                                    0
application mode                                  0
application order                                 0
course                                            0
daytime/evening attendance                        0
previous qualification                            0
nacionality                                       0
mother's qualification                            0
father's qualification                            0
mother's occupation                               0
father's occupation                               0
displaced                                         0
educational special needs                         0
debtor                                            0
tuition fees up to date                           0
gender                                            0
scholarship holder                                0
age at enrollment                                 0
international                        