In [1]:
%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import getcwd
from os.path import join, abspath, pardir
import numpy as np
import pandas as pd

import pickle

# sklearn libraries

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer

from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model

# IPython
from IPython.core.interactiveshell import InteractiveShell

#### Config settings

In [2]:
# Paths: the project root sits two directory levels above the working directory
parent_dir = abspath(join(getcwd(), pardir, pardir))
data_dir, model_dir = join(parent_dir, "data"), join(parent_dir, "models")
data_file = join(data_dir, "preprocessed.csv")

# IPython: display the value of every expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# pandas: raise the display limits to 200 columns and 200 rows
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

# Seed shared by the estimators and the cross-validation splitter
__random_state = 0

#### Helper functions

In [3]:
def save_model(model, file_path: str) -> None:
    """
    Save model as a pickle file

    Parameters:
        model: any picklable object (e.g. a fitted estimator)
        file_path: destination of the pickle file; parent directories that
            do not exist yet are created before writing

    Raises:
        OSError: if the directory cannot be created or the file cannot be written
    """
    from os import makedirs
    from os.path import dirname

    # A fresh checkout may not have the target directory yet; without this
    # open() fails with FileNotFoundError.
    target_dir = dirname(file_path)
    if target_dir:  # empty when file_path is a bare file name
        makedirs(target_dir, exist_ok=True)

    with open(file_path, "wb") as file:
        pickle.dump(model, file)

def load_model(file_path: str):
    """
    Read a pickle file and return the object stored in it
    """
    with open(file_path, mode="rb") as handle:
        model = pickle.load(handle)
    return model

def dataframe_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Write the dataframe to `file_path` as CSV, leaving out the index column
    """
    df.to_csv(path_or_buf=file_path, index=False)

def get_best_clf(clf, param_grid, X, y, *, n_splits=10, average='micro', **kwargs):
    """
    Grid search with stratified, shuffled K-fold cross-validation

    Candidates are scored with `f1_score(average=average)`.

    Parameters:
        clf: estimator to tune
        param_grid: parameter grid passed to GridSearchCV
        X: feature matrix
        y: target vector
        n_splits: number of stratified folds (default 10)
        average: averaging mode forwarded to `f1_score` (default 'micro')
        **kwargs: extra GridSearchCV arguments (e.g. `verbose`); `scoring`,
            `cv` and `n_jobs` given here override the defaults set below

    Returns the fitted GridSearchCV object; the tuned model, its score and
    its parameters are available as `best_estimator_`, `best_score_` and
    `best_params_`.
    """

    # NOTE: for a single-label target, micro-averaged F1 is numerically equal
    # to accuracy; pass average='binary' or 'macro' for a class-sensitive F1.
    f1 = make_scorer(f1_score, average=average)

    # stratified split, shuffled with the notebook-wide seed
    kf = StratifiedKFold(n_splits=n_splits, random_state=__random_state, shuffle=True)

    # defaults first, so caller-supplied options win instead of raising a
    # duplicate-keyword TypeError
    search_options = {"scoring": f1, "cv": kf, "n_jobs": -1}
    search_options.update(kwargs)

    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        **search_options
    )

    grid_search.fit(X, y)

    return grid_search

#### Load preprocessed data

In [4]:
# Read the preprocessed dataset (ISO-8859-1 encoded CSV) and preview its first rows
df = pd.read_csv(data_file, encoding= 'ISO-8859-1')
df.head()

Unnamed: 0,gender,pf_o_att,pf_o_sin,pf_o_int,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,imprelig,date,go_out,tvsports,exercise,dining,museums,art,hiking,gaming,reading,tv,theater,concerts,music,attr1_1,sinc1_1,intel1_1,shar1_1,intel3_1,attr,sinc,intel,fun,match_es,length,numdat_2,race_o_1.0,race_o_4.0,field_cd_2.0,field_cd_3.0,field_cd_4.0,field_cd_6.0,field_cd_7.0,field_cd_9.0,field_cd_11.0,field_cd_15.0,race_2.0,race_3.0,race_6.0,goal_5.0,career_c_1.0,career_c_3.0,career_c_4.0,career_c_5.0,career_c_7.0,career_c_10.0,career_c_11.0,subject_attractiveness_mean,subject_sincerity_mean,subject_intelligence_mean,subject_fun_mean,subject_ambition_mean,subject_shared_interest_mean,age_difference,attractiveness_difference,fun_difference,ambition_difference,shared_interest_difference
0,False,25,25,25,False,-0.093734,0.47719,0.405713,0.306851,0.129562,-0.215712,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,7,6,7,7,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.213498,-0.687122,-1.210467,-1.270431,-0.087847
1,False,25,20,15,False,-0.093734,0.47719,0.405713,-0.720215,-0.984028,-1.151207,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,6,6,6,5,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,0.113945,-1.261822,-1.210467,-0.630769,1.092457
2,False,30,15,20,True,1.958469,1.053427,1.055676,-0.206682,0.129562,0.252036,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,6,8,8,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.54094,1.036978,0.02066,-1.270431,-0.678
3,False,40,20,20,False,1.445418,1.629664,1.705638,1.333917,0.686357,-0.215712,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,7,7,7,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.868383,-0.112422,-0.594903,-0.630769,-0.087847
4,False,30,10,25,False,-0.606785,-1.251521,-1.544174,-0.720215,-0.984028,0.252036,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,9,7,8,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.54094,1.036978,0.636223,0.648555,-1.268152


## Modelling

In [5]:
# define feature and target variables
# `drop` returns a new frame, so `df` keeps its `dec_o` column and this cell
# can be re-run without a KeyError (the in-place drop mutated `df` itself).
target = df['dec_o']
features = df.drop(columns=['dec_o'])
# NOTE(review): `features` still contains the `Unnamed: 0` column from the CSV,
# which looks like a saved row index - confirm whether it should be a predictor.

### 1. Baseline Models

#### 1.1. [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [6]:
# Grid: L2-penalised logistic regression (lbfgs) over 20 log-spaced values of C
parameters = dict(
    penalty=['l2'],
    solver=['lbfgs'],
    C=np.logspace(-4, 4, num=20),
    max_iter=[10000],
)

classifier_lr = get_best_clf(
    LogisticRegression(random_state=__random_state),
    parameters,
    features,
    target,
    verbose=2,
)

# winning parameter combination
classifier_lr.best_params_

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.2s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.2s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total

In [None]:
# Take the best logistic regression found by the grid search and display it
clf_logistic_regression = classifier_lr.best_estimator_
clf_logistic_regression

In [None]:
# Pickle the tuned logistic regression into the models directory
save_model(clf_logistic_regression, join(model_dir, "clf_logistic_regression.pkl"))

#### 1.2. [SVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

In [None]:
# Grid: RBF-kernel SVC over three gamma values and four values of C
parameters = dict(
    kernel=['rbf'],
    gamma=[1e-4, 1e-3, 1e-2],
    C=[1, 10, 100, 1000],
)

classifier_sv = get_best_clf(
    SVC(random_state=__random_state),
    parameters,
    features,
    target,
    verbose=2,
)

# winning parameter combination
classifier_sv.best_params_

In [None]:
# Take the best SVC found by the grid search and display it
clf_svc = classifier_sv.best_estimator_
clf_svc

In [None]:
# Pickle the tuned SVC into the models directory
save_model(clf_svc, join(model_dir, "clf_svc.pkl"))

#### 1.3. [KNN](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [None]:
# Grid: neighbourhood size, vote weighting and distance metric
# NOTE(review): 'minkowski' with the estimator's default p presumably equals
# 'euclidean', which would make one of the two entries redundant - confirm.
parameters = dict(
    n_neighbors=[5, 11, 19, 29],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

classifier_kn = get_best_clf(
    KNeighborsClassifier(),
    parameters,
    features,
    target,
    verbose=2,
)

# winning parameter combination
classifier_kn.best_params_

In [None]:
# Take the best k-nearest-neighbours model found by the grid search and display it
clf_knn = classifier_kn.best_estimator_
clf_knn

In [None]:
# Pickle the tuned k-nearest-neighbours model into the models directory
save_model(clf_knn, join(model_dir, "clf_knn.pkl"))

### 2. [Ensemble models](https://scikit-learn.org/stable/modules/ensemble.html#ensemble)

#### 2.1. [Gradient Boost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

In [None]:
# Grid for gradient boosting: loss, number of trees, tree depth and
# per-split feature sampling at a fixed learning rate.
# 'log_loss' is the current name of the former 'deviance' loss (renamed in
# scikit-learn 1.1; the old name is rejected from 1.3 on). Because warnings
# are silenced in this notebook, the stale name would only show up as failed
# fits with NaN scores for half of the grid.
# Requires scikit-learn >= 1.1.
parameters = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.05],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'max_features': ['sqrt', 'log2']
}

classifier_gb = get_best_clf(
    clf=GradientBoostingClassifier(random_state=__random_state),
    X=features,
    y=target,
    param_grid=parameters,
    verbose=2
)

# winning parameter combination
classifier_gb.best_params_

In [None]:
# Take the best gradient boosting model found by the grid search and display it
clf_gb = classifier_gb.best_estimator_
clf_gb

In [None]:
# Pickle the tuned gradient boosting model into the models directory
save_model(clf_gb, join(model_dir, "clf_gb.pkl"))

#### 2.2. [Voting Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html)

Let's combine all classifiers and train a voting model

In [None]:
# (name, tuned model) pairs: logistic regression, SVC, k-NN, gradient boosting
estimators = list(zip(
    ('lr', 'sv', 'kn', 'gb'),
    (clf_logistic_regression, clf_svc, clf_knn, clf_gb),
))

# hard voting: the ensemble is configured with voting='hard'
clf_voting = VotingClassifier(estimators, voting='hard')

In [None]:
# Fit the voting ensemble on `features` / `target`, then display it
clf_voting.fit(features, target)
clf_voting

In [None]:
# A VotingClassifier is a plain estimator, not a search object: it has no
# `best_estimator_`, so the former lookup raised AttributeError. The fitted
# ensemble held in `clf_voting` is already the final model; show its fitted
# members instead.
clf_voting.estimators_

In [None]:
# Pickle the voting ensemble into the models directory
save_model(clf_voting, join(model_dir, "clf_voting.pkl"))

#### 2.3. [Stacking Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn.ensemble.StackingClassifier)

In [None]:
# Stack the base estimators under a logistic regression final estimator
clf_stacking = StackingClassifier(estimators, final_estimator=LogisticRegression())
clf_stacking

In [None]:
# Fit the stacking ensemble on `features` / `target`, then display it
clf_stacking.fit(features, target)
clf_stacking

In [None]:
# A StackingClassifier is a plain estimator, not a search object: it has no
# `best_estimator_`, so the former lookup raised AttributeError. The fitted
# ensemble held in `clf_stacking` is already the final model; show its fitted
# final estimator instead.
clf_stacking.final_estimator_

In [None]:
# Pickle the stacking ensemble into the models directory
save_model(clf_stacking, join(model_dir, "clf_stacking.pkl"))