In [1]:
%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import getcwd
from os.path import join, abspath, pardir, exists
import numpy as np
import pandas as pd

import pickle, json

import matplotlib.pyplot as plt
import seaborn as sns

# plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# scipy
from scipy.stats import ttest_ind, chi2_contingency, boxcox, skew
from scipy.stats.stats import pearsonr

# sklearn libraries
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.impute import KNNImputer, SimpleImputer, MissingIndicator
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector, make_column_transformer, make_column_transformer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier


from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model


# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning

# IPython
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell

##### Config settings

In [11]:
parent_dir = abspath(join(join(getcwd(), pardir), pardir))
data_dir = join(parent_dir, "data")
model_dir = join(parent_dir, "models")
data_file = join(data_dir, "preprocessed.csv")

# For IPython

InteractiveShell.ast_node_interactivity = "all" # To show all output after each cell execution (instead of the last output)

# For pandas

pd.options.display.max_columns = 200 # display upto 200 columns (instead of default 20)
pd.options.display.max_rows = 200 # display upto 200 rows (instead of default 60)

# random state
__random_state = 0

#### Helper functions

In [3]:
def save_model(model, file_path: str) -> None:
    """
    Save model as a pickle file
    """
    with open(file_path, "wb") as file:
        pickle.dump(model, file)

def load_model(file_path: str):
    """
    Load model from a pickle file
    """
    with open(file_path, "rb") as file:
        return pickle.load(file)

def dataframe_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Save dataframe as .csv file
    """
    df.to_csv(file_path, index=False)

def plot_distribution(data, bins, title, xlabel, ylabel):
    """
    Plot distribution functions
    """
    ax = sns.distplot(
        data,
        bins=bins,
        hist_kws={
            "linewidth": 1,
            'edgecolor': 'black',
            'alpha': 1.0
            },
        kde=False
    )
    _ = ax.set_title(title)
    _ = ax.set_xlabel(xlabel)
    _ = ax.set_ylabel(ylabel)

def plot_relationship(x, y, title, xlabel, ylabel):
    """
    Plot relationship between two features
    """
    ax = sns.barplot(
        x=x,
        y=y,
        orient='h'
    )
    _ = ax.set_title(title)
    _ = ax.set_xlabel(xlabel)
    _ = ax.set_ylabel(ylabel)

def print_moments(title, feature):
    """
    Print a feature's mean, standard deviation, skewness and kurtosis
    """
    print(title)
    print('Mean: '+'{:>18.2f}'.format(feature.mean()))
    print('Standard deviation: '+'{:.2f}'.format(feature.std()))
    print('Skewness: '+'{:>14.2f}'.format(feature.skew()))
    print('Kurtosis: '+'{:>14.2f}'.format(feature.kurtosis()))

#### Load preprocessed data

In [7]:
df = pd.read_csv(data_file, encoding= 'ISO-8859-1')
df.head()

Unnamed: 0,gender,pf_o_att,pf_o_sin,pf_o_int,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,imprelig,date,go_out,tvsports,exercise,dining,museums,art,hiking,gaming,reading,tv,theater,concerts,music,attr1_1,sinc1_1,intel1_1,shar1_1,intel3_1,attr,sinc,intel,fun,match_es,length,numdat_2,race_o_1.0,race_o_4.0,field_cd_2.0,field_cd_3.0,field_cd_4.0,field_cd_6.0,field_cd_7.0,field_cd_9.0,field_cd_11.0,field_cd_15.0,race_2.0,race_3.0,race_6.0,goal_5.0,career_c_1.0,career_c_3.0,career_c_4.0,career_c_5.0,career_c_7.0,career_c_10.0,career_c_11.0,subject_attractiveness_mean,subject_sincerity_mean,subject_intelligence_mean,subject_fun_mean,subject_ambition_mean,subject_shared_interest_mean,age_difference,attractiveness_difference,fun_difference,ambition_difference,shared_interest_difference
0,False,25,25,25,False,-0.093734,0.47719,0.405713,0.306851,0.129562,-0.215712,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,7,6,7,7,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.213498,-0.687122,-1.210467,-1.270431,-0.087847
1,False,25,20,15,False,-0.093734,0.47719,0.405713,-0.720215,-0.984028,-1.151207,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,6,6,6,5,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,0.113945,-1.261822,-1.210467,-0.630769,1.092457
2,False,30,15,20,True,1.958469,1.053427,1.055676,-0.206682,0.129562,0.252036,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,6,8,8,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.54094,1.036978,0.02066,-1.270431,-0.678
3,False,40,20,20,False,1.445418,1.629664,1.705638,1.333917,0.686357,-0.215712,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,7,7,7,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.868383,-0.112422,-0.594903,-0.630769,-0.087847
4,False,30,10,25,False,-0.606785,-1.251521,-1.544174,-0.720215,-0.984028,0.252036,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,9,7,8,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.54094,1.036978,0.636223,0.648555,-1.268152


### Modelling

#### 1. Baseline Models

In [9]:
# define feature and target variables
features, target = df, df['dec_o']
features.drop(['dec_o'], axis=1, inplace=True)

1. Logistic Regression

In [10]:
parameters = {
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'C': np.logspace(-4, 4, 20),
    'max_iter': [10000]
}
classifier_lr = LogisticRegression(random_state=__random_state)
classifier_lr = GridSearchCV(
    estimator=classifier_lr,
    param_grid=parameters,
    cv=5,
    verbose=2,
    n_jobs=-1
)
classifier_lr.fit(features, target)
classifier_lr.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.0006951927961775605, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END .C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.00026366508987303583, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.0006951927961775605, max_iter=10000, penalty=l2, solver=lbfgs; total time=   0.1s
[CV] END C=0.0006951927961775605, max_iter=10000, penal

{'C': 4.281332398719396, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'lbfgs'}

In [13]:
clf_logistic_regression = LogisticRegression(
    random_state=__random_state,
    penalty=classifier_lr.best_params_['penalty'],
    solver=classifier_lr.best_params_['solver'],
    C=classifier_lr.best_params_['C'],
    max_iter=classifier_lr.best_params_['max_iter']
)
clf_logistic_regression

2. SVC

In [14]:
parameters = {
    'kernel': ['rbf'],
    'gamma': [1e-4, 1e-3, 1e-2],
    'C': [1, 10, 100, 1000]
}
classifier_sv = SVC(random_state=__random_state)
classifier_sv = GridSearchCV(
    estimator=classifier_sv,
    param_grid=parameters,
    cv=5,
    verbose=2,
    n_jobs=-1
)
classifier_sv.fit(features, target)
classifier_sv.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   3.7s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   3.7s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   3.8s
[CV] END ......................C=1, gamma=0.0001, kernel=rbf; total time=   4.0s
[CV] END ......................C=1, gamma=0.0001, kernel=rbf; total time=   4.2s
[CV] END ......................C=1, gamma=0.0001, kernel=rbf; total time=   4.3s
[CV] END ......................C=1, gamma=0.0001, kernel=rbf; total time=   4.3s
[CV] END ......................C=1, gamma=0.0001, kernel=rbf; total time=   4.4s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   3.5s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   3.9s
[CV] END .....................C=10, gamma=0.0001, kernel=rbf; total time=   3.6s
[CV] END ........................C=1, gamma=0.01

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

In [15]:
clf_svc = SVC(
    random_state=__random_state,
    kernel=classifier_sv.best_params_['kernel'],
    gamma=classifier_sv.best_params_['gamma'],
    C=classifier_sv.best_params_['C']
)
clf_svc

3. KNN

In [16]:
parameters = {
    'n_neighbors': [5, 11, 19, 29],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan']
}
classifier_kn = KNeighborsClassifier()
classifier_kn = GridSearchCV(
    estimator=classifier_kn,
    param_grid=parameters,
    cv=5,
    verbose=2,
    n_jobs=-1
)
classifier_kn.fit(features, target)
classifier_kn.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   0.2s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   0.2s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   0.2s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   0.1s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   0.1s
[CV] END ..metric=minkowski, n_neighbors=11, weights=uniform; total time=   0.2s
[CV] END .metric=minkowski, n_neighbors=11, wei

{'metric': 'manhattan', 'n_neighbors': 29, 'weights': 'uniform'}

In [17]:
clf_knn = KNeighborsClassifier(
    n_neighbors=classifier_kn.best_params_['n_neighbors'],
    weights=classifier_kn.best_params_['weights'],
    metric=classifier_kn.best_params_['metric']
)
clf_knn

#### 2. Ensemble models

1. Gradient Boost

In [18]:
parameters = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.05],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'max_features': ['sqrt', 'log2']
}
classifier_gb = GradientBoostingClassifier(random_state=__random_state)
classifier_gb = GridSearchCV(
    estimator=classifier_gb,
    param_grid=parameters,
    cv=5,
    verbose=2,
    n_jobs=-1
)
classifier_gb.fit(features, target)
classifier_gb.best_params_

Fitting 5 folds for each of 36 candidates, totalling 180 fits




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.5s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=200; total time=   0.8s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=200; total time=   0.9s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=200; total time=   0.9s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=200; total time=   0.9s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=200; total time=   0.9s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=100; total time=   0.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=300; total time=   1.2s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=100; total time=   0.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=100; total time=   0.5s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=300; total time=   1.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=300; total time=   1.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=100; total time=   0.4s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=300; total time=   1.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=sqrt, n_estimators=300; total time=   1.3s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=200; total time=   0.7s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=200; total time=   0.8s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=200; total time=   0.7s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=200; total time=   0.8s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=200; total time=   0.8s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=300; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=300; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=300; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=100; total time=   0.6s




[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=300; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=100; total time=   0.8s
[CV] END learning_rate=0.05, loss=deviance, max_depth=3, max_features=log2, n_estimators=300; total time=   1.2s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=100; total time=   0.8s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=100; total time=   0.9s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=200; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=200; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=200; total time=   1.2s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=200; total time=   1.1s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=200; total time=   1.2s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=100; total time=   0.5s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=100; total time=   0.6s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=300; total time=   1.9s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=100; total time=   0.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=300; total time=   1.7s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=300; total time=   1.7s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=300; total time=   1.7s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=sqrt, n_estimators=300; total time=   1.8s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=200; total time=   1.0s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=200; total time=   0.9s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=200; total time=   0.9s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=200; total time=   0.9s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=200; total time=   0.9s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=100; total time=   0.7s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=100; total time=   0.7s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=300; total time=   1.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=100; total time=   0.7s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=300; total time=   1.4s




[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=300; total time=   1.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=300; total time=   1.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=4, max_features=log2, n_estimators=300; total time=   1.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=100; total time=   0.7s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=100; total time=   0.8s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=200; total time=   1.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=200; total time=   1.5s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=200; total time=   1.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=200; total time=   1.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=200; total time=   1.8s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=100; total time=   0.6s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=300; total time=   2.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=300; total time=   2.2s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=300; total time=   2.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=100; total time=   0.6s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=300; total time=   2.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=sqrt, n_estimators=300; total time=   2.3s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=200; total time=   1.5s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=200; total time=   1.3s




[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=200; total time=   1.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=200; total time=   1.3s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=200; total time=   1.2s
[CV] END learning_rate=0.05, loss=exponential, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=exponential, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=exponential, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=deviance, max_depth=5, max_features=log2, n_estimators=300; total time=   1.9s
[CV] END learning_rate=0.05, loss=exponential, max_depth=3, max_features=sqrt, n_estimators=100; total time=   0.4s
[CV] END learning_rate=0.05, loss=exponential, max_depth=3, max_features=sqrt, n_est

{'learning_rate': 0.05,
 'loss': 'exponential',
 'max_depth': 4,
 'max_features': 'sqrt',
 'n_estimators': 300}

In [19]:
clf_gb = GradientBoostingClassifier(
    random_state=__random_state,
    loss=classifier_gb.best_params_['loss'],
    learning_rate=classifier_gb.best_params_['learning_rate'],
    n_estimators=classifier_gb.best_params_['n_estimators'],
    max_depth=classifier_gb.best_params_['max_depth'],
    max_features=classifier_gb.best_params_['max_features']
)
clf_gb

2. Voting Classifier

Let's combine all classifiers and train a voting and stacking model

In [None]:
estimators = [
    ('lr', clf_logistic_regression), # logistic regression
    ('sv', clf_svc), # svc
    ('kn', clf_knn), # knn
    ('gb', clf_gb) # gradient boosting
]

# voting classifier
clf_voting = VotingClassifier(
    estimators=estimators,
    voting='hard'
)

# stacking classifier


#### 1. Basic checking of data

Generally, don't tend to care about data types until we get an error or some unexpected results. However, it's better to check that data types are correctly loaded before you start your analysis. You can read more about data types in pandas [here](https://pbpython.com/pandas_dtypes.html)

In [None]:
df.shape
df.memory_usage().sum() # memory usage in bytes
df.dtypes.to_frame(name='data types').T # T will represent the transpose of the resulting dataframe, better for visualization

##### Comment
- Seems like all the features were loaded correctly

In [None]:
display("Categorical Features")
df.describe(include='O').T 

#display("Numeric Features")
#df.describe(include='number').T 

Some features have wrong data types

In [None]:
cols_str_to_int = ['mn_sat', 'tuition', 'zipcode', 'income'] # features which needs to be converted to int
for col in cols_str_to_int:
    df[col] = df[col].fillna('0').str.replace(",","").astype(np.float).astype(int)
    # df[col] = pd.to_numeric(df[col].str.replace(",",""), errors='coerce')

Verify that data type is changed correctly

In [None]:
display("Categorical Features")
df.describe(include='O').T

#### 2. Columns renaming & dropping irrelevant columns

It's better to rename columns (without white characters and preferably in _lower case_), so it's easier to deal with them

In [None]:
df.columns.to_frame().reset_index().T.head(1) # showing columns in a more visual way

##### Comment

- No need to change the column names since all are lower case and without any white characters.

Based on our domain knowledge, we have identified relevant features and their associated datatypes. Let's drop the irrelevant features and update the data types again (just for clarity)

In [None]:
relevant_features = [
    ['iid', 'int16'], ['gender', 'bool'],
    ['wave', 'int16'], ['position', 'int16'],
    ['order', 'int16'], ['pid', 'int16'],
    ['age_o', 'int16'], ['race_o', 'category'],
    ['pf_o_att', 'int16'], ['pf_o_sin', 'int16'],
    ['pf_o_int', 'int16'], ['pf_o_fun', 'int16'],
    ['pf_o_amb', 'int16'], ['pf_o_sha', 'int16'],
    ['dec_o', 'bool'], ['attr_o', 'int16'], ['sinc_o', 'int16'], 
    ['intel_o', 'int16'], ['fun_o', 'int16'], ['amb_o', 'int16'], 
    ['shar_o', 'int16'], ['like_o', 'int16'],
    ['prob_o', 'int16'], ['met_o', 'bool'], ['age', 'int16'], ['field_cd', 'category'], ['race', 'category'],
    ['imprace', 'int16'], ['imprelig', 'int16'], ['goal', 'category'], ['date', 'int16'],
    ['go_out', 'int16'], ['career_c', 'category'], ['sports', 'int16'], ['tvsports', 'int16'], ['exercise', 'int16'],
    ['dining', 'int16'], ['museums', 'int16'], ['art', 'int16'], ['hiking', 'int16'],
    ['gaming', 'int16'], ['clubbing', 'int16'], ['reading', 'int16'], ['tv', 'int16'],
    ['theater', 'int16'], ['movies', 'int16'], ['concerts', 'int16'], ['music', 'int16'],
    ['shopping', 'int16'], ['yoga', 'int16'], ['exphappy', 'int16'], ['expnum', 'int16'],
    ['attr1_1', 'int16'], ['sinc1_1', 'int16'], ['intel1_1', 'int16'], ['fun1_1', 'int16'],
    ['amb1_1', 'int16'], ['shar1_1', 'int16'], ['attr3_1', 'int16'], ['sinc3_1', 'int16'],
    ['fun3_1', 'int16'], ['intel3_1', 'int16'], ['amb3_1', 'int16'], ['dec', 'bool'],
    ['attr', 'int16'], ['sinc', 'int16'], ['intel', 'int16'], ['fun', 'int16'],
    ['amb', 'int16'], ['shar', 'int16'], ['like', 'int16'], ['prob', 'int16'],
    ['met', 'int16'], ['match_es', 'int16'], ['satis_2', 'int16'], ['length', 'int16'],
    ['numdat_2', 'int16']
]

In [None]:
df = df[[feature[0] for feature in relevant_features]]
df.shape
df.memory_usage().sum() # memory usage in bytes

Let's update the data types for the relevant features

In [None]:
df = df.astype({feature: datatype if all(df[feature].notna().values) else 'float32' if datatype == 'int16' else datatype for (feature, datatype) in relevant_features})
df.dtypes.to_frame(name='data types').T # T will represent the transpose of the resulting dataframe, better for visualization
df.shape
df.memory_usage().sum() # memory usage in bytes

##### Comment

- After dropping irrelevant data and updating the datatypes, the dataframe size almost reduced by 82% the original size.

#### 3. Split your dataset into train and test datasets

We will split our dataset into two parts: `train` & `test` datasets. We will do all the processing on the `train` dataset. `test` dataset will remain unknown to us. And we will use it only for testing analysis. This is to simulate the real world scenario in which we don't know the data which would be run on our model (after deploying)

However, there may be some issues with the `test` dataset (especially if our original dataset is bit imbalance):

1. What if few categories are missed in `test` dataset ? we won't have any hot encoding for those categories.

> One possible solution is to hot encode the new unseen categories as zero (or some default category)

> Another possible solution is to do stratified sampling (in case of imbalance data), so you have data for all the categories in both `train` and `test` datasets

In [None]:
# df.dec.replace(0, 'No').replace(1, 'Yes').value_counts().plot(kind = 'bar')
dec_options = {False: "No", True: "Yes"}
count_ss = df.dec.replace(dec_options).value_counts() # final decision. 1 = Yes, 0 = No
ax = sns.barplot(x = count_ss.index, y = count_ss.values)
_ = ax.set(xlabel='decision', ylabel='count')
_ = ax.bar_label(ax.containers[0])

Let's plot the distributions of subject attribute ratings from their partners

In [None]:
_ = plt.figure(figsize=(16,10))
_ = plt.tight_layout(pad=5.0)

bins = np.arange(0, 10, 0.5).tolist()
ylabel = "No. of subjects"

cols = ['attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o']
xlabels = ['Attractiveness rating', 'Sincerity rating', 'Intelligence rating', 'Fun rating', 'Ambition rating', 'Shared interest rating']

for i, col in enumerate(cols):
    _ = plt.subplot(2,3, i + 1)
    xlabel = xlabels[i]
    plot_distribution(
        data=df[col],
        bins=bins,
        title="Subject's {}".format(xlabel),
        xlabel="{}".format(xlabel),
        ylabel=ylabel
    )

We have a sightly imbalance data (for `decision` parameter). We will be doing a stratified split for equal proprtion in `train` and `test` datasets. You can check the official doc for `train_test_split` [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:
# keep the random state too, so you we can reproduce the results later as well
__random_state = 0

# let's do a 85% | 15% split
train_df, test_df = train_test_split(df, test_size=0.15, shuffle=True, random_state=__random_state, stratify=df['dec'])

# reset the index for train and test
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [None]:
_ = train_df.dec.replace(dec_options).value_counts().plot(kind = 'bar')

In [None]:
_ = test_df.dec.replace(dec_options).value_counts().plot(kind = 'bar')

Both `train` and `test` datasets have the same ratio of values  

In [None]:
# train_df.describe(include='all').T # check train dataset

Save the `train` & `test` dataset as a `.csv` file

In [None]:
train_file_path = join(data_dir, "train.csv")
dataframe_to_csv(train_df, train_file_path)

test_file_path = join(data_dir, "test.csv")
dataframe_to_csv(test_df, test_file_path)

#### 4. Check for missing data

We need to check for missing data and imputate or remove it. It is really important to deal with all the missing data to get better EDA and less incorrect results during model training

In [None]:
# a basic first step is to check if any data is missing in predicted value
# because if some labels are not there in y_train, there isn't any point to include those rows

train_df['dec'].isnull().any()

So basically we are good here. Now let's see how much missing values we have for each feature

In [None]:
((train_df.isna().sum()/len(train_df)) * 100).to_frame(name='missing %').sort_values(by=['missing %'], ascending=False).T

Okay! so we have lots of columns with missing values (_as also already shown in EDA too_). Let's see what are the typical ways to deal with them

There are multiple ways to compute the missing values. Although most of them are `distance based` and only work on _numeric_ features. Missing values are computed on training data only otherwise it can cause _Data Leakage_ (accidentally share the information between the `test` and `train` datasets) as well. List below are the typical ways we use to deal with missing values:

1. Check manually what values are missing and correct them. We can check if there is a pattern to the missing values, think about the possible reasons why it happened in the first place

2. Also, it is know practice to drop the columns if they have more than ~50% of the missing values

3. Deleting rows with missing values - Deleting rows (if we have enough data) or column (if column is not important intutive idea from domain knowledge)

4. Impute missing values for continuous variable - mean/median - without causing _data leakage_

5. Impute missing values for categorical variable - mode ('most_frequent' or 'constant' strategy)

6. Other Imputation Methods - interpolation (provide missing term using nearby trends, newton forward/backward formula, for multivariate - nearest neighbour interpolation, gaussian, polynomial (good for time series interpolation))

7. Using Algorithms that support missing values (k-NN - based on k, get the closest rows using euclidean distance and take the average of there missing value column values)

> Note: Imputation: Replace missing datas with statistical values. sklearn has a [`Imputation` module (sklearn.impute)](https://scikit-learn.org/stable/modules/impute.html#impute) which is quite helpful here.

#### How to impute the missing data ? What do we need ?

- A generalized model for missing values so that if values are missing in test data the model will not break
- Although the mordern tree based algorithms manages the missing values by default using techniques such as `fragments` and `surrogate splits` so while using these algos we do not need to worry about missing values
- We can make a gernalized imputation model which can deal with all the unexpected missing values if required

---

#### Some of the imputers being used from [_sklearn.impute_](https://scikit-learn.org/stable/modules/impute.html#impute) module

##### [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer)

- Replace `NaN` (`np.nan`) by the `mean`, `median`, `most_frequent` or `constant`


##### [KNNImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html#sklearn.impute.KNNImputer)

- Replace your missing values by closest ones
- Imputation for completing missing values using k-Nearest Neighbors
- Each sample’s missing values are imputed using the _mean_ value from `n_neighbors` nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close

##### [MissingIndicator](https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html#sklearn.impute.MissingIndicator)

- Position of missing values by a `boolean` mask

##### [IterativeImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html)

- Multivariate imputer that estimates each feature from all the others

In [None]:
# threshold for missing value - remove all columns which has greater than 50% of missing values
__missing_threshold = float(50)

missing_df = ((train_df.isna().sum()/len(train_df)) * 100).to_frame(name='missing').sort_values(by=['missing'], ascending=False).query("missing > {}".format(__missing_threshold))
missing_cols = missing_df.index.to_list()

In [None]:
# imputate missing values
def iterative_imputate_missing_features(data, random_state = 0, relevant_features=None):
    """
    Method to imputate missing values using IterativeImputer
    """
    imputer = IterativeImputer(
        missing_values=np.nan,
        sample_posterior=True, # sample from gaussian predictive posterior
        n_nearest_features=5,
        min_value=0,
        max_value=100,
        random_state=random_state
    )
    imputer.fit(data)
    data_imputed = np.around(imputer.transform(data))
    data = pd.DataFrame(data_imputed, columns=data.columns)
    if relevant_features:
        data = data.astype({feature: datatype if all(data[feature].notna().values) else 'float32' if datatype == 'int16' else datatype for (feature, datatype) in relevant_features})
    return data, imputer

def imputate_missing_features(data, missing_threshold = 50):
    """
    Method to imputate missing values and return a imputate model
    """

    # threshold for missing value - remove all columns which has greater than 50% of missing values
    __missing_threshold = missing_threshold

    missing_df = ((data.isna().sum()/len(data)) * 100).to_frame(name='missing').sort_values(by=['missing'], ascending=False).query("missing > {}".format(__missing_threshold))
    missing_cols = missing_df.index.to_list()

    cols_to_use = list(set(data.columns.to_list()) - set(missing_cols))
    cols_to_use = [x for x in cols_to_use if x != 'dec']
    
    #X, y = data[cols_to_use], data['dec']
    X, y = data, data['dec']
    X.drop(['dec'], axis=1, inplace=True)

    num_features = make_column_selector(dtype_include=np.number) # get all numeric data
    cat_features = make_column_selector(dtype_exclude=np.number)

    imputate_pipeline = make_column_transformer(
        (SimpleImputer(strategy='mean'), num_features),
        (MissingIndicator(missing_values=np.nan, features="all", error_on_new=False), num_features),

        remainder="drop", # drop the remaining columns
        n_jobs=-1, # run jobs using all available processors (for speedup computation)
    )

    num_pipe = make_pipeline(
        KNNImputer(n_neighbors=5, add_indicator=True),    # stack MissingIndicator on the output
        #imputate_pipeline,
        StandardScaler()
    )

    cat_pipe = make_pipeline(
        SimpleImputer(strategy = 'most_frequent'),
        OneHotEncoder(handle_unknown='ignore', sparse=False)
    )

    col_trans = make_column_transformer(
        (num_pipe, num_features),
        (cat_pipe, cat_features)
    )

    col_trans.fit(X)

    return X, cols_to_use, col_trans

In [None]:
train_df, col_trans = iterative_imputate_missing_features(train_df, __random_state, relevant_features)

In [None]:
train_df.shape
col_trans

In [None]:
#X, cols_to_use, col_trans = imputate_missing_features(train_df)

Save the imputated model as pickle (`.pkl`) file

In [None]:
save_model(col_trans, join(model_dir, "col_trans.pkl"))

Imputate the `train` dataset

In [None]:
train_df.isnull().any() # check missing values now

#### 5. Check for outliers

In order to check outliers in your dataset, there are various methods worth looking into:

1. _Box plots_
2. _Z-score test (normal distribution assumption)_
3. _Model based (One class SVM, density based algorithams, etc)_

Some useful resources:

- [Novelty and Outlier Detection](https://scikit-learn.org/stable/modules/outlier_detection.html)
- [Ways to Detect and Remove the Outliers](https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba)

> For our use case, we will be checking outliers with _box plots_ and _density based algoritham_ (`DBSCAN`)

##### Box Plots

In [None]:
def draw_box_plots(df, cols):
    """
    Draw Box Plot & Histogram for each column
    """
    for col in cols:
        fig, axes= plt.subplots(1,2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9,5))
        _ = df.boxplot(column=col,ax=axes[0]);
        _ = df.hist(column=col, ax=axes[1]);

In [None]:
# get all numeric columns
train_numeric = train_df.select_dtypes(include="number")
draw_box_plots(train_df, train_numeric.columns)
# train_numeric.shape
# train_numeric.head()
# total_num_cols = len(train_numeric.columns)
# for col in train_numeric.columns:
#     fig, axes= plt.subplots(1,2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9,5))
#     _ = train_df.boxplot(column=col,ax=axes[0]);
#     _ = train_df.hist(column=col, ax=axes[1]);

In [None]:
def remove_outlier(df, col, mode=1, times_std=3):
    """
    Basic way to remove outliers

    mode = 1 (via mean and std) 
    mode = 2 (via IQR)
    """
    if mode == 1: # with mean and std
        upper_limit = df[col].mean() + times_std * df[col].std()
        lower_limit = df[col].mean() - times_std * df[col].std()
        df[col] = np.where(
            df[col] > upper_limit,
            upper_limit,
            np.where(
                df[col] < lower_limit,
                lower_limit,
                df[col]
            )
        )
    elif mode == 2:

        p_25, p_75 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = p_75 - p_25
        upper_limit = p_75 + 1.5 * iqr
        lower_limit = p_25 - 1.5 * iqr

        df[col] = np.where(
            df[col] > upper_limit,
            upper_limit,
            np.where(
                df[col] < lower_limit,
                lower_limit,
                df[col]
            )
        )
        
    else:
        print("Unsupported mode")
    return df

In [None]:
#train_df = remove_outlier(train_df, train_numeric.columns, mode=2)
#draw_box_plots(train_df, train_numeric.columns)

##### Comment

- Removed all the outliers from the data

---

#### Clustering Algorithm

##### [DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)

- DBSCAN - Density-Based Spatial Clustering of Applications with Noise.


In [None]:
# keeping the attribute using domain knowledge
# outlier_detection = DBSCAN(min_samples = 4, eps = 0.5)

# make clusters
# normalized_df=(train_numeric - train_numeric.min()) / (train_numeric.max() - train_numeric.min())
# clusters = outlier_detection.fit_predict(normalized_df)

#### 6. Feature Engineering

Encode nominal features using one-hot encoding

In [214]:
features_nominal = train_df.dtypes[train_df.dtypes == 'category'].index.values
train_df = pd.get_dummies(train_df, prefix=features_nominal)

Calculate the average attribute ratings for each subject

In [223]:
data = train_df.copy()

subject_attractiveness_mean = data[['iid', 'attr_o']].groupby(['iid']).mean()['attr_o']
subject_sincerity_mean = data[['iid', 'sinc_o']].groupby(['iid']).mean()['sinc_o']
subject_intelligence_mean = data[['iid', 'intel_o']].groupby(['iid']).mean()['intel_o']
subject_fun_mean = data[['iid', 'fun_o']].groupby(['iid']).mean()['fun_o']
subject_ambition_mean = data[['iid', 'amb_o']].groupby(['iid']).mean()['amb_o']
subject_shared_interest_mean = data[['iid', 'shar_o']].groupby(['iid']).mean()['shar_o']

Insert average attribute ratings into dataframe

In [224]:
data = data.merge(
    right=subject_attractiveness_mean,
    how='inner',
    on='iid'
).rename(columns={
    'attr_o_x': 'attr_o',
    'attr_o_y': 'subject_attractiveness_mean'
})
data = data.merge(
    right=subject_sincerity_mean,
    how='inner',
    on='iid'
).rename(columns={
    'sinc_o_x': 'sinc_o',
    'sinc_o_y': 'subject_sincerity_mean'
})
data = data.merge(
    right=subject_intelligence_mean,
    how='inner',
    on='iid'
).rename(columns={
    'intel_o_x': 'intel_o',
    'intel_o_y': 'subject_intelligence_mean'
})
data = data.merge(
    right=subject_fun_mean,
    how='inner',
    on='iid'
).rename(columns={
    'fun_o_x': 'fun_o',
    'fun_o_y': 'subject_fun_mean'
})
data = data.merge(
    right=subject_ambition_mean,
    how='inner',
    on='iid'
).rename(columns={
    'amb_o_x': 'amb_o',
    'amb_o_y': 'subject_ambition_mean'
})
data = data.merge(
    right=subject_shared_interest_mean,
    how='inner',
    on='iid'
).rename(columns={
    'shar_o_x': 'shar_o',
    'shar_o_y': 'subject_shared_interest_mean'
})

Calculate difference between subject and partner's ages

In [225]:
data['age_difference'] = abs(data['age'] - data['age_o'])

Calculate difference between subject's attribute ratings and partner's attributes ratings

In [226]:
data['attractiveness_difference'] = abs(data['attr'] - data['attr_o'])
data['sincerity_difference'] = abs(data['sinc'] - data['sinc_o'])
data['intelligence_difference'] = abs(data['intel'] - data['intel_o'])
data['fun_difference'] = abs(data['fun'] - data['fun_o'])
data['ambition_difference'] = abs(data['amb'] - data['amb_o'])
data['shared_interest_difference'] = abs(data['shar'] - data['shar_o'])

Scale normal features to zero mean and unit variance

In [227]:
features_normal = [
    'attr_o',
    'sinc_o',
    'intel_o',
    'fun_o',
    'amb_o',
    'shar_o',
    'age_difference',
    'attractiveness_difference',
    'sincerity_difference',
    'intelligence_difference',
    'fun_difference',
    'ambition_difference',
    'shared_interest_difference'
]

data[features_normal] = data[features_normal].apply(lambda x: preprocessing.scale(x))

Drop some features

In [228]:
# Drop irrelevant features which contain no information about the target variable
features_no_information = [
    'iid',
    'pid',
    'wave',
    'position',
    'order'
]
# Drop features that are known in the future
features_future_information = [
    'dec',
    'dec_o',
    'like',
    'prob',
    'like_o',
    'prob_o'
]

# Drop features that have low variance
feature_variances = data.std().sort_values(ascending=True)
features_low_variance = feature_variances[feature_variances < 0.1].index.values.tolist()

# Drop features that have weak correlation with target variable
correlations = data.corr().abs().unstack().sort_values(ascending=False).drop_duplicates()
correlations = correlations[correlations != 1]
partner_decision_correlations = correlations.loc['dec_o']
features_weak_correlation = partner_decision_correlations[partner_decision_correlations < 0.1].axes[0].to_list()
features_weak_correlation = list(set(features_weak_correlation) - set(features_future_information) - set(features_no_information))

# Drop features that were used in interaction variables
features_interaction = [
    'age',
    'age_o',
]

features_remove = features_no_information + features_future_information + features_low_variance + features_weak_correlation + features_interaction
data.drop(columns=features_remove, inplace=True)

In [230]:
data.head()
data.memory_usage().sum()

Unnamed: 0,gender,pf_o_att,pf_o_sin,pf_o_int,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,imprelig,date,go_out,tvsports,exercise,dining,museums,art,hiking,gaming,reading,tv,theater,concerts,music,attr1_1,sinc1_1,intel1_1,shar1_1,intel3_1,attr,sinc,intel,fun,match_es,length,numdat_2,race_o_1.0,race_o_4.0,field_cd_2.0,field_cd_3.0,field_cd_4.0,field_cd_6.0,field_cd_7.0,field_cd_9.0,field_cd_11.0,field_cd_15.0,race_2.0,race_3.0,race_6.0,goal_5.0,career_c_1.0,career_c_3.0,career_c_4.0,career_c_5.0,career_c_7.0,career_c_10.0,career_c_11.0,subject_attractiveness_mean,subject_sincerity_mean,subject_intelligence_mean,subject_fun_mean,subject_ambition_mean,subject_shared_interest_mean,age_difference,attractiveness_difference,fun_difference,ambition_difference,shared_interest_difference
0,False,25,25,25,-0.093734,0.47719,0.405713,0.306851,0.129562,-0.215712,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,7,6,7,7,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.213498,-0.687122,-1.210467,-1.270431,-0.087847
1,False,25,20,15,-0.093734,0.47719,0.405713,-0.720215,-0.984028,-1.151207,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,6,6,6,5,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,0.113945,-1.261822,-1.210467,-0.630769,1.092457
2,False,30,15,20,1.958469,1.053427,1.055676,-0.206682,0.129562,0.252036,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,6,8,8,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.54094,1.036978,0.02066,-1.270431,-0.678
3,False,40,20,20,1.445418,1.629664,1.705638,1.333917,0.686357,-0.215712,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,7,7,7,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.868383,-0.112422,-0.594903,-0.630769,-0.087847
4,False,30,10,25,-0.606785,-1.251521,-1.544174,-0.720215,-0.984028,0.252036,7,4,2,6,7,8,6,8,8,4,7,4,7,7,7,16,19,16,17,7,9,7,8,8,3,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,6.357143,7.714286,8.071429,6.357143,6.785714,5.285714,-0.54094,1.036978,0.636223,0.648555,-1.268152


1609346

##### Comment

- After dropping more irrelevant data, the dataframe size has further decreased.

Save this pre-processed data for modelling

In [231]:
preprocessed_file_path = join(data_dir, "preprocessed.csv")
dataframe_to_csv(data, preprocessed_file_path)

Below mentioned are the articles, blogs, papers and other resources which we took inspiration from or we think are useful for future use case

#### References

- [_Getting Started with Data Preprocessing in Python_](https://www.section.io/engineering-education/data-preprocessing-python/)
- [_Data Preprocessing in Python_](https://medium.datadriveninvestor.com/data-preprocessing-3cd01eefd438)
- [_Data Pre-processing in Python for Beginner_](https://medium.com/data-science-indo/data-preparation-in-python-for-beginner-d3e1e60c03a6)
- [_Data Preprocessing using Python_](https://medium.com/@suneet.bhopal/data-preprocessing-using-python-1bfee9268fb3)
- [_Data preprocessing for Machine Learning in Python_](https://towardsdatascience.com/data-preprocessing-for-machine-learning-in-python-2d465f83f18c)
- [_Data Preprocessing in Machine Learning_](https://www.analytixlabs.co.in/blog/data-preprocessing-in-machine-learning/)
- [_Normalization vs Standardization — Quantitative analysis_](https://towardsdatascience.com/normalization-vs-standardization-quantitative-analysis-a91e8a79cebf)
- [__]()
- [__]()
- [__]()
- [__]()
- [__]()
- [__]()