In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [28]:
import warnings
warnings.simplefilter('ignore')

In [29]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

### Read the dataset for churn 

In [62]:
ds = pd.read_csv("C:/Users/Olena/Desktop/ITSS/files/variant_4.csv")

In [63]:
for col in ds.columns:
    if ds[col].isnull().values.any():
        print("Missing data in ", col)

missing = list()
for x in ds.columns:
    if ds[x].isnull().sum() != 0:
        print(x, ds[x].isnull().sum())
        missing.append(x)



Missing data in  Course
Missing data in  Daytime/evening attendance	
Missing data in  Previous qualification
Missing data in  Previous qualification (grade)
Missing data in  Mother's occupation
Missing data in  Father's occupation
Missing data in  International
Missing data in  Curricular units 1st sem (enrolled)
Missing data in  Curricular units 1st sem (approved)
Missing data in  Curricular units 1st sem (grade)
Missing data in  Curricular units 1st sem (without evaluations)
Missing data in  Curricular units 2nd sem (credited)
Missing data in  Curricular units 2nd sem (enrolled)
Missing data in  Curricular units 2nd sem (evaluations)
Missing data in  Curricular units 2nd sem (grade)
Missing data in  Inflation rate
Missing data in  Citizenship
Missing data in  Family Position
Missing data in  Special Needs
Course 221
Daytime/evening attendance	 110
Previous qualification 221
Previous qualification (grade) 2323
Mother's occupation 221
Father's occupation 110
International 325
Curricula

## Data engineering 

#### Missing data imputation

In [64]:
mean_impute_columns = [
       'Admission grade',
       'Previous qualification',
       'Previous qualification (grade)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)',
       'Inflation rate', 'GDP']
mode_impute_columns = ['Unemployment rate']

In [65]:
def impute_na(df, variable, value):

    return df[variable].fillna(value)

In [66]:
# Let's create a dict of mean values

mean_impute_values = dict()
for column in mean_impute_columns:
    mean_impute_values[column] = ds[column].mean()
print(mean_impute_values)

{'Admission grade': 126.97811934900544, 'Previous qualification': 4.6302640970735185, 'Previous qualification (grade)': 132.57405997144215, 'Curricular units 1st sem (enrolled)': 6.285986200333095, 'Curricular units 1st sem (evaluations)': 8.299050632911392, 'Curricular units 1st sem (approved)': 4.696883178681894, 'Curricular units 1st sem (grade)': 10.635909431107466, 'Curricular units 1st sem (without evaluations)': 0.13517179023508138, 'Curricular units 2nd sem (credited)': 0.549845348560552, 'Curricular units 2nd sem (enrolled)': 6.2373667130273525, 'Curricular units 2nd sem (evaluations)': 8.061196105702365, 'Curricular units 2nd sem (approved)': 4.435804701627487, 'Curricular units 2nd sem (grade)': 10.232829111546907, 'Curricular units 2nd sem (without evaluations)': 0.15031645569620253, 'Inflation rate': 1.2293694946685212, 'GDP': 0.001968806509945778}


In [67]:
for column in mean_impute_columns:
    ds[column] = impute_na(ds, column, mean_impute_values[column])

In [68]:
# Let's create a dict of mode values

mode_impute_values = dict()
for column in mode_impute_columns:
    mode_impute_values[column] = ds[column].mode()[0]
print(mode_impute_values)

{'Unemployment rate': 7.6}


In [69]:
for column in mode_impute_columns:
    ds[column] = impute_na(ds, column, mode_impute_values[column])

##### Outlier Engineering

In [70]:
outlier_columns = ['Admission grade',
       'Previous qualification',
       'Previous qualification (grade)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)']

In [71]:
ds = ds[(np.abs(stats.zscore(ds[outlier_columns])) < 4).all(axis=1)]

##### Categorical encoding

In [72]:
cat_columns = ['Marital status', 'Application mode', 'Application order',
       'Course', 'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International', 'Unemployment rate',
       'Inflation rate', 'GDP']

In [73]:
map_dicts = dict()
for column in cat_columns:
    ds[column] = ds[column].astype('category')
    map_dicts[column] = dict(zip(ds[column], ds[column].cat.codes))
    ds[column] = ds[column].cat.codes
    

In [74]:
display(ds.sample(15))

Unnamed: 0.1,Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Citizenship,Family Position,Attendance,Field of Study,Special Needs
1585,1585,0,0,0,1,1,0,33,0,29,38,9.0,8.0,215,1,0,0,1,1,0,3,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,7,4,Graduate,Athens,First Child,Evening Classes,Engineering,No Special Needs
745,745,1,11,0,3,1,0,36,0,1,1,4.0,5.0,535,1,0,0,0,1,0,12,0,0,6.0,6,0.0,0.0,0.135172,0.0,6.0,8.0,0,0.0,0,8,8,0,Dropout,Athens,,Full-Time,Arts,
1335,1335,0,7,1,15,1,0,36,0,1,19,2.0,10.0,255,1,0,0,1,0,0,2,0,0,6.0,9,5.0,13.0,0.135172,0.0,6.0,8.0,5,10.4,0,0,7,4,Enrolled,Istanbul,,Online,Commerce,
4079,4079,0,0,1,13,1,0,52,0,37,37,9.0,7.0,313,1,0,0,1,0,1,1,0,0,6.0,8,6.0,13.571429,0.0,0.0,6.0,6.0,5,12.8,0,5,3,7,Enrolled,Saint Petersburg,Youngest Child,Full-Time,Arts,No Special Needs
2831,2831,0,7,2,8,1,0,36,0,1,1,4.0,7.0,239,1,0,0,1,0,0,1,0,0,5.0,5,5.0,12.6,0.135172,0.0,5.0,5.0,4,11.0,0,4,4,8,Enrolled,Budapest,,Evening Classes,Science,
416,416,0,0,0,5,1,0,36,0,1,1,9.0,9.0,269,1,0,0,1,0,0,2,0,0,6.0,13,6.0,15.230769,0.135172,0.0,6.0,10.0,6,12.375,0,8,8,0,Graduate,Brussels,,Online,Arts,
1751,1751,0,0,0,-1,1,0,36,0,38,19,9.0,9.0,373,1,0,0,1,1,1,2,0,0,6.285986,0,0.0,10.635909,0.135172,0.549845,0.0,0.0,0,0.0,0,7,1,5,Graduate,Brussels,,Part-Time,Science,
1516,1516,0,3,0,11,1,5,36,0,19,1,4.0,1.0,395,1,0,0,0,0,0,22,0,0,7.0,0,0.0,0.0,0.135172,0.0,8.0,0.0,0,0.0,0,5,3,7,Dropout,Rome,,Evening Classes,Commerce,
3907,3907,0,0,0,9,1,0,36,0,3,38,1.0,5.0,362,1,0,0,1,0,0,5,0,0,6.0,11,4.0,12.5,0.135172,0.0,6.0,13.0,3,13.666667,0,2,0,1,Enrolled,Edinburgh,,Online,Medicine,
1111,1111,0,0,4,11,1,0,36,0,1,19,6.0,7.0,150,1,0,0,1,0,1,2,0,0,7.0,8,7.0,12.135714,0.135172,0.0,8.0,10.0,7,13.328571,0,5,3,7,Graduate,Istanbul,,Part-Time,Engineering,


# Model tuning


https://en.wikipedia.org/wiki/Hyperparameter_optimization 

#### Parameters vs Hyperparameters
Let’s now define what are hyperparameters, but before doing that let’s consider the difference between a parameter and a hyperparameter.

A parameter can be considered to be intrinsic or internal to the model and can be obtained after the model has learned from the data. 
Examples of parameters are regression coefficients in linear regression, support vectors in support vector machines and weights in neural networks.

A hyperparameter can be considered to be extrinsic or external to the model and can be set arbitrarily by the practitioner. 
Examples of hyperparameters include the k in k-nearest neighbors, number of trees and maximum number of features in random forest, learning rate and momentum in neural networks, the C and gamma parameters in support vector machines.

#### Hyperparameter tuning
As there are no universal best hyperparameters to use for any given problem, hyperparameters are typically set to default values. However, the optimal set of hyperparameters can be obtained from manual empirical (trial-and-error) hyperparameter search or in an automated fashion via the use of optimization algorithm to maximize the fitness function.

Two common hyperparameter tuning methods include grid search and random search. As the name implies, a grid search entails the creation of a grid of possible hyperparameter values whereby models are iteratively built for all of these hyperparameter combinations in a brute force manner. In a random search, not all hyperparameter combinations are used, but instead each iteration makes use of a random hyperparameter combination.

#### Define target and features columns

In [75]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4199 entries, 0 to 4423
Data columns (total 43 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      4199 non-null   int64  
 1   Marital status                                  4199 non-null   int8   
 2   Application mode                                4199 non-null   int8   
 3   Application order                               4199 non-null   int8   
 4   Course                                          4199 non-null   int8   
 5   Daytime/evening attendance	                     4199 non-null   int8   
 6   Previous qualification                          4199 non-null   int8   
 7   Previous qualification (grade)                  4199 non-null   int8   
 8   Nacionality                                     4199 non-null   int8   
 9   Mother's qualification                        

In [76]:
y_column = ['Scholarship holder'] # target variable
X_columns = ['Marital status', 'Application mode', 'Application order',
       'Course', 'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
    'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)', 'Unemployment rate',
       'Inflation rate', 'GDP']
X = ds[X_columns]
y = ds[y_column]

In [77]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
# In the first step we will split the data in training and remaining dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(3359, 31)
(3359, 1)
(840, 31)
(840, 1)


In [78]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
metrics.precision_score(y_test, y_pred)

0.3349282296650718

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
metrics.recall_score(y_test, y_pred)

0.3150684931506849

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
metrics.f1_score(y_test, y_pred)

0.3023255813953488

In [81]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.6523809523809524

In [82]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import classification_report
# Створюємо модель класифікатора
gnb = GaussianNB()
# Оголошуємо простір гіперпараметрів для перебору
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
    'priors': [None, [0.3, 0.7], [0.5, 0.5], [0.7, 0.3]]
}

# Оголошуємо метрики, які будемо використовувати для оцінки
scoring = {'accuracy':'accuracy', 'recall':'recall', 'f1': 'f1'}
# Створюємо обʼєкт GridSearchCV
grid_search = GridSearchCV(estimator=gnb, param_grid=param_grid, scoring=scoring, refit='f1', cv=5)
# Виконуємо пошук найкращих параметрів
grid_search.fit(X_train, y_train)
# Отримуємо найкращі параметри та модель
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
# Прогнозуємо на тестових даних
y_pred = best_model.predict(X_test)
# Виводимо звіт метрик класифікації
print('Test set metrics:')
print(classification_report(y_test, y_pred))

Test set metrics:
              precision    recall  f1-score   support

           0       0.82      0.50      0.62       621
           1       0.33      0.70      0.45       219

    accuracy                           0.55       840
   macro avg       0.58      0.60      0.53       840
weighted avg       0.69      0.55      0.57       840



In [91]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
# Створюємо модель класифікатора
gnb = GaussianNB()
# Оголошуємо простір гіперпараметрів для перебору
param_grid = {
    'var_smoothing': uniform(loc=1e-9, scale=1e-5),
    'priors': [None, [0.3, 0.7], [0.5, 0.5], [0.7, 0.3]]
}

# Оголошуємо метрики, які будемо використовувати для оцінки
scoring = {'accuracy':'accuracy', 'recall':'recall', 'f1': 'f1'}
# Створюємо обʼєкт GridSearchCV
grid_search = RandomizedSearchCV(estimator=gnb, param_distributions=param_grid, scoring=scoring, refit='f1', cv=5)
# Виконуємо пошук найкращих параметрів
grid_search.fit(X_train, y_train)
# Отримуємо найкращі параметри та модель
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
# Прогнозуємо на тестових даних
y_pred = best_model.predict(X_test)
# Виводимо звіт метрик класифікації
print('Test set metrics:')
print(classification_report(y_test, y_pred))

Test set metrics:
              precision    recall  f1-score   support

           0       0.83      0.49      0.62       621
           1       0.33      0.72      0.45       219

    accuracy                           0.55       840
   macro avg       0.58      0.60      0.54       840
weighted avg       0.70      0.55      0.58       840



#### Building a Baseline Random Forest Model
Here, we will first start by building a baseline random forest model that will serve as a baseline for comparative purpose with the model using the optimal set of hyperparameters.
For the baseline model, we will set an arbitrary number for the 2 hyperparameters (e.g. n_estimators and max_features) that we will also use in the next section for hyperparameter tuning.

In [83]:
rf = RandomForestClassifier(max_features=5, n_estimators=100)

In [84]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [85]:
print('test set metrics: ', metrics.classification_report(y_test, y_pred))

test set metrics:                precision    recall  f1-score   support

           0       0.76      0.93      0.84       621
           1       0.49      0.18      0.27       219

    accuracy                           0.74       840
   macro avg       0.63      0.56      0.55       840
weighted avg       0.69      0.74      0.69       840



In [86]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 5,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

#### Hyperparameter Tuning
Now we will be performing the tuning of hyperparameters of the random forest model. 

n_estimators = number of trees in the foreset

max_features = max number of features considered for splitting a node

max_depth = max number of levels in each decision tree

min_samples_split = min number of data points placed in a node before the node is split

min_samples_leaf = min number of data points allowed in a leaf node

bootstrap = method for sampling data points (with or without replacement)


https://towardsdatascience.com/hyperparameter-tuning-always-tune-your-models-7db7aeaf47e9

In [87]:
%%time
# Create the random grid
param_grid = {'n_estimators': np.arange(25,55,10),
               'max_features': [0.5, 0.6, 0.8],
               'min_samples_split': [10,15],
               'min_samples_leaf': [3,4],
               'bootstrap': [False]}

# print(random_grid)


rf = RandomForestClassifier()

grid = GridSearchCV(estimator=rf, 
                    param_grid=param_grid, 
                    scoring='precision', 
                    cv=5,
                    verbose = 3,
                    return_train_score=True)

grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.998, test=0.380) total time=   0.3s
[CV 2/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.998, test=0.460) total time=   0.2s
[CV 3/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.995, test=0.465) total time=   0.2s
[CV 4/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.995, test=0.460) total time=   0.2s
[CV 5/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=0.997, test=0.509) total time=   0.2s
[CV 1/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=35;, score=(train=0.998, test=0.389) tot

In [88]:
grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),
                          pd.DataFrame(grid.cv_results_["mean_test_score"], 
                          columns=["precision"])],
                          axis=1)

grid_results

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,precision
0,False,0.5,3,10,25,0.454778
1,False,0.5,3,10,35,0.474705
2,False,0.5,3,10,45,0.482185
3,False,0.5,3,15,25,0.481301
4,False,0.5,3,15,35,0.494113
5,False,0.5,3,15,45,0.480504
6,False,0.5,4,10,25,0.475348
7,False,0.5,4,10,35,0.485283
8,False,0.5,4,10,45,0.48633
9,False,0.5,4,15,25,0.482868


In [58]:
grid_results.columns

Index(['bootstrap', 'max_features', 'min_samples_leaf', 'min_samples_split',
       'n_estimators', 'precision'],
      dtype='object')

In [27]:
grid_contour = grid_results.groupby([  'max_features', 'min_samples_leaf',
       'min_samples_split', 'n_estimators']).mean()
grid_contour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,bootstrap,precision
max_features,min_samples_leaf,min_samples_split,n_estimators,Unnamed: 4_level_1,Unnamed: 5_level_1
0.5,3,10,25,0.0,0.500409
0.5,3,10,35,0.0,0.509282
0.5,3,10,45,0.0,0.523058
0.5,3,15,25,0.0,0.49974
0.5,3,15,35,0.0,0.499457
0.5,3,15,45,0.0,0.516006
0.5,4,10,25,0.0,0.49844
0.5,4,10,35,0.0,0.516017
0.5,4,10,45,0.0,0.5179
0.5,4,15,25,0.0,0.511928
