In [1]:
%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import getcwd
from os.path import join, abspath, pardir, exists
import numpy as np
import pandas as pd

import pickle, json

import matplotlib.pyplot as plt
import seaborn as sns

# plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# scipy
from scipy.stats import ttest_ind, chi2_contingency, boxcox, skew
from scipy.stats.stats import pearsonr

# sklearn libraries
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.impute import KNNImputer, SimpleImputer, MissingIndicator
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector, make_column_transformer, make_column_transformer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier


from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model


# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning

# IPython
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell

##### Config settings

In [2]:
# Project folders are resolved two levels above the current working directory.
parent_dir = abspath(join(getcwd(), pardir, pardir))
data_dir = join(parent_dir, "data")
model_dir = join(parent_dir, "models")

# NOTE(review): the sections below are titled "Load test data" / "Test Analysis",
# yet the active path is train.csv; the test.csv alternative
# (join(data_dir, "test.csv")) was left commented out - confirm which is intended.
data_file = join(data_dir, "train.csv")

# IPython: echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# pandas: raise the display limits to 200 columns / 200 rows (defaults are 20 / 60).
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

#### Helper functions

In [3]:
# [column name, target dtype] pairs, listed in the column order of the data file.
# Cells further down cast every listed column to its target dtype, substituting
# 'float32' for 'int16' whenever the column still contains missing values.
relevant_features = [
    # identifiers and bookkeeping columns
    ['iid', 'int16'], ['gender', 'bool'], ['wave', 'int16'],
    ['position', 'int16'], ['order', 'int16'], ['pid', 'int16'],
    # `*_o` columns
    ['age_o', 'int16'], ['race_o', 'category'],
    ['pf_o_att', 'int16'], ['pf_o_sin', 'int16'], ['pf_o_int', 'int16'],
    ['pf_o_fun', 'int16'], ['pf_o_amb', 'int16'], ['pf_o_sha', 'int16'],
    ['dec_o', 'bool'],
    ['attr_o', 'int16'], ['sinc_o', 'int16'], ['intel_o', 'int16'],
    ['fun_o', 'int16'], ['amb_o', 'int16'], ['shar_o', 'int16'],
    ['like_o', 'int16'], ['prob_o', 'int16'], ['met_o', 'bool'],
    # background columns
    ['age', 'int16'], ['field_cd', 'category'], ['race', 'category'],
    ['imprace', 'int16'], ['imprelig', 'int16'], ['goal', 'category'],
    ['date', 'int16'], ['go_out', 'int16'], ['career_c', 'category'],
    # activity columns
    ['sports', 'int16'], ['tvsports', 'int16'], ['exercise', 'int16'],
    ['dining', 'int16'], ['museums', 'int16'], ['art', 'int16'],
    ['hiking', 'int16'], ['gaming', 'int16'], ['clubbing', 'int16'],
    ['reading', 'int16'], ['tv', 'int16'], ['theater', 'int16'],
    ['movies', 'int16'], ['concerts', 'int16'], ['music', 'int16'],
    ['shopping', 'int16'], ['yoga', 'int16'],
    ['exphappy', 'int16'], ['expnum', 'int16'],
    # `*1_1` columns
    ['attr1_1', 'int16'], ['sinc1_1', 'int16'], ['intel1_1', 'int16'],
    ['fun1_1', 'int16'], ['amb1_1', 'int16'], ['shar1_1', 'int16'],
    # `*3_1` columns
    ['attr3_1', 'int16'], ['sinc3_1', 'int16'], ['fun3_1', 'int16'],
    ['intel3_1', 'int16'], ['amb3_1', 'int16'],
    # decision and rating columns without the `_o` suffix
    ['dec', 'bool'],
    ['attr', 'int16'], ['sinc', 'int16'], ['intel', 'int16'],
    ['fun', 'int16'], ['amb', 'int16'], ['shar', 'int16'],
    ['like', 'int16'], ['prob', 'int16'], ['met', 'int16'],
    # remaining columns
    ['match_es', 'int16'], ['satis_2', 'int16'], ['length', 'int16'],
    ['numdat_2', 'int16'],
]

In [4]:
def save_model(model, file_path: str) -> None:
    """
    Pickle `model` and write it to `file_path`.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_path, mode="wb") as handle:
        pickle.dump(model, handle)

def load_model(file_path: str):
    """
    Read `file_path` in binary mode and return the object unpickled from it.

    Caution: unpickling can execute arbitrary code, so only use this on
    files that come from a trusted source.
    """
    with open(file_path, mode="rb") as handle:
        restored = pickle.load(handle)
    return restored

def dataframe_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Write `df` to `file_path` in CSV format, leaving out the index column.
    """
    df.to_csv(path_or_buf=file_path, index=False)

def plot_distribution(data, bins, title, xlabel, ylabel):
    """
    Plot a count histogram of `data` with black-edged, fully opaque bars.

    Parameters
    ----------
    data : values to bin (passed straight to seaborn)
    bins : bin specification forwarded to the seaborn histogram call
    title, xlabel, ylabel : text for the axes title and axis labels

    Returns nothing; the plot is drawn on the axes seaborn selects.
    """
    bar_style = {
        "linewidth": 1,
        'edgecolor': 'black',
        'alpha': 1.0
    }
    if hasattr(sns, "histplot"):
        # `distplot` is deprecated in current seaborn releases; `histplot`
        # is its replacement for a histogram without a KDE curve.
        ax = sns.histplot(x=data, bins=bins, **bar_style)
    else:
        # Older seaborn without `histplot`: keep the original call.
        ax = sns.distplot(
            data,
            bins=bins,
            hist_kws=bar_style,
            kde=False
        )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

def plot_relationship(x, y, title, xlabel, ylabel):
    """
    Draw a horizontal seaborn bar plot of `y` against `x` and label it
    with the given title and axis labels.
    """
    axes = sns.barplot(x=x, y=y, orient='h')
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)

def print_moments(title, feature):
    """
    Print `title`, then the mean, standard deviation, skewness and kurtosis
    of `feature`, each rounded to two decimals.

    `feature` must provide `mean()`, `std()`, `skew()` and `kurtosis()`.
    """
    print(title)
    print(f"Mean: {feature.mean():>18.2f}")
    print(f"Standard deviation: {feature.std():.2f}")
    print(f"Skewness: {feature.skew():>14.2f}")
    print(f"Kurtosis: {feature.kurtosis():>14.2f}")

#### Load test data

In [5]:
# Read the configured data file (decoded as ISO-8859-1) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=data_file, encoding='ISO-8859-1')
df.head(n=5)

Unnamed: 0,iid,gender,wave,position,order,pid,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field_cd,race,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,satis_2,length,numdat_2
0,477,False,19,4,11,494.0,30.0,2.0,25.0,25.0,25.0,25.0,0.0,0.0,False,6.0,8.0,8.0,7.0,7.0,5.0,5.0,5.0,True,27.0,5.0,3.0,5.0,7.0,1.0,4.0,2.0,5.0,9.0,6.0,7.0,8.0,6.0,8.0,8.0,4.0,7.0,7.0,4.0,7.0,8.0,7.0,7.0,6.0,4.0,7.0,,16.0,19.0,16.0,17.0,15.0,17.0,6.0,8.0,7.0,7.0,7.0,True,7.0,6.0,7.0,7.0,7.0,,6.0,5.0,0.0,3.0,8.0,1.0,3.0
1,382,False,15,17,2,409.0,23.0,2.0,20.0,15.0,20.0,20.0,10.0,15.0,False,3.0,6.0,7.0,4.0,7.0,2.0,3.0,4.0,True,28.0,14.0,2.0,8.0,1.0,2.0,7.0,3.0,6.0,6.0,5.0,10.0,10.0,9.0,9.0,6.0,8.0,3.0,8.0,10.0,8.0,10.0,9.0,9.0,9.0,1.0,6.0,,25.0,25.0,25.0,10.0,10.0,5.0,7.0,7.0,7.0,7.0,10.0,True,6.0,8.0,8.0,8.0,6.0,8.0,6.0,5.0,2.0,3.0,5.0,3.0,3.0
2,395,False,15,11,11,411.0,34.0,2.0,35.0,15.0,15.0,25.0,10.0,0.0,False,8.0,10.0,10.0,8.0,9.0,5.0,5.0,4.0,True,33.0,6.0,2.0,1.0,2.0,3.0,4.0,2.0,6.0,3.0,4.0,4.0,10.0,8.0,8.0,6.0,1.0,8.0,9.0,3.0,10.0,10.0,7.0,9.0,7.0,8.0,4.0,,20.0,20.0,20.0,10.0,10.0,20.0,8.0,8.0,7.0,9.0,7.0,False,7.0,8.0,9.0,7.0,,8.0,7.0,7.0,2.0,5.0,5.0,3.0,2.0
3,153,False,7,11,14,162.0,27.0,4.0,15.22,15.22,19.57,17.39,13.04,19.57,False,5.0,7.0,6.0,3.0,7.0,3.0,4.0,3.0,True,28.0,7.0,4.0,1.0,1.0,2.0,6.0,3.0,2.0,8.0,5.0,7.0,7.0,7.0,6.0,6.0,5.0,7.0,10.0,6.0,7.0,9.0,7.0,7.0,8.0,5.0,6.0,,9.76,19.51,19.51,14.63,19.51,17.07,7.0,7.0,6.0,7.0,6.0,True,5.0,5.0,7.0,6.0,7.0,6.0,5.0,5.0,0.0,4.0,5.0,1.0,3.0
4,41,True,2,10,15,24.0,27.0,2.0,10.0,20.0,20.0,15.0,20.0,15.0,False,7.0,8.0,,,8.0,,8.0,4.0,True,30.0,1.0,4.0,1.0,1.0,2.0,6.0,5.0,1.0,5.0,5.0,6.0,5.0,5.0,5.0,5.0,10.0,5.0,6.0,6.0,6.0,6.0,5.0,5.0,5.0,1.0,7.0,3.0,30.0,20.0,10.0,30.0,0.0,10.0,5.0,7.0,5.0,8.0,5.0,False,5.0,5.0,5.0,5.0,5.0,,5.0,,2.0,,5.0,3.0,3.0


## Test Analysis

#### Load models

In [6]:
# Unpickle the fitted classifiers and the column transformer from `model_dir`.
# The names on the left are bound, in order, to the files listed on the right.
(
    clf_gb,
    clf_knn,
    clf_logistic_regression,
    clf_stacking,
    clf_svc,
    clf_voting,
    col_trans,
) = (
    load_model(join(model_dir, file_name))
    for file_name in (
        "clf_gb.pkl",
        "clf_knn.pkl",
        "clf_logistic_regression.pkl",
        "clf_stacking.pkl",
        "clf_svc.pkl",
        "clf_voting.pkl",
        "col_trans.pkl",
    )
)

##### Basic checking

In [7]:
# Does the `dec` column contain any missing value?
df['dec'].isna().any()
# Share of missing values per column (in percent), largest first, shown as a single row.
(
    df.isna().sum().div(len(df)).mul(100)
    .to_frame(name='missing %')
    .sort_values(by=['missing %'], ascending=False)
    .T
)

False

Unnamed: 0,expnum,match_es,shar_o,shar,numdat_2,length,satis_2,amb_o,amb,met,fun_o,fun,prob_o,intel_o,prob,intel,sinc_o,sinc,like_o,like,attr_o,attr,career_c,shar1_1,pf_o_sha,amb1_1,exphappy,pf_o_amb,amb3_1,intel3_1,fun3_1,sinc3_1,attr3_1,date,age,age_o,pf_o_fun,fun1_1,field_cd,pf_o_int,pf_o_att,pf_o_sin,shopping,intel1_1,sinc1_1,attr1_1,yoga,art,music,museums,imprace,imprelig,goal,go_out,concerts,tvsports,exercise,dining,sports,gaming,clubbing,reading,tv,theater,movies,hiking,race,race_o,pid,wave,position,order,gender,dec_o,met_o,dec,iid
missing %,78.654683,13.888499,12.821233,12.736975,11.276506,10.883303,10.883303,8.636427,8.580256,4.578009,4.297149,4.142677,3.833731,3.777559,3.777559,3.651173,3.468614,3.440528,3.033282,2.906895,2.640079,2.485606,1.755371,1.55877,1.474512,1.263867,1.263867,1.249824,1.249824,1.249824,1.249824,1.249824,1.249824,1.221739,1.207696,1.193653,1.137481,1.123438,1.053223,1.025137,1.025137,1.025137,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.997051,0.800449,0.800449,0.14043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Cast every listed column to its target dtype. An 'int16' target falls back to
# 'float32' while the column still has missing values (NaN cannot be stored in int16).
df = df.astype({
    feature: 'float32' if df[feature].isna().any() and datatype == 'int16' else datatype
    for feature, datatype in relevant_features
})

In [9]:
# Display the (rows, columns) tuple of the loaded frame.
df.shape

(7121, 77)

##### Impute missing values

In [10]:
# Run the fitted column transformer over the frame and round its output to whole numbers.
scaled = np.round(col_trans.transform(df))
# Rebuild a labelled frame; assumes the transformer keeps the column order of `df` - TODO confirm.
test_df = pd.DataFrame(data=scaled, columns=df.columns)
# Restore the target dtypes ('int16' falls back to 'float32' if a column still has missing values).
test_df = test_df.astype({
    feature: 'float32' if test_df[feature].isna().any() and datatype == 'int16' else datatype
    for feature, datatype in relevant_features
})
test_df.shape
test_df

(7121, 77)

Unnamed: 0,iid,gender,wave,position,order,pid,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field_cd,race,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,satis_2,length,numdat_2
0,477,False,19,4,11,494,30,2.0,25,25,25,25,0,0,False,6,8,8,7,7,5,5,5,True,27,5.0,3.0,5,7,1.0,4,2,5.0,9,6,7,8,6,8,8,4,7,7,4,7,8,7,7,6,4,7,10,16,19,16,17,15,17,6,8,7,7,7,True,7,6,7,7,7,7,6,5,0,3,8,1,3
1,382,False,15,17,2,409,23,2.0,20,15,20,20,10,15,False,3,6,7,4,7,2,3,4,True,28,14.0,2.0,8,1,2.0,7,3,6.0,6,5,10,10,9,9,6,8,3,8,10,8,10,9,9,9,1,6,1,25,25,25,10,10,5,7,7,7,7,10,True,6,8,8,8,6,8,6,5,2,3,5,3,3
2,395,False,15,11,11,411,34,2.0,35,15,15,25,10,0,False,8,10,10,8,9,5,5,4,True,33,6.0,2.0,1,2,3.0,4,2,6.0,3,4,4,10,8,8,6,1,8,9,3,10,10,7,9,7,8,4,10,20,20,20,10,10,20,8,8,7,9,7,False,7,8,9,7,10,8,7,7,2,5,5,3,2
3,153,False,7,11,14,162,27,4.0,15,15,20,17,13,20,False,5,7,6,3,7,3,4,3,True,28,7.0,4.0,1,1,2.0,6,3,2.0,8,5,7,7,7,6,6,5,7,10,6,7,9,7,7,8,5,6,5,10,20,20,15,20,17,7,7,6,7,6,True,5,5,7,6,7,6,5,5,0,4,5,1,3
4,41,True,2,10,15,24,27,2.0,10,20,20,15,20,15,False,7,8,7,7,8,6,8,4,True,30,1.0,4.0,1,1,2.0,6,5,1.0,5,5,6,5,5,5,5,10,5,6,6,6,6,5,5,5,1,7,3,30,20,10,30,0,10,5,7,5,8,5,False,5,5,5,5,5,7,5,3,2,4,5,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7116,349,False,14,7,13,363,23,4.0,25,20,20,20,5,10,True,7,8,7,8,8,7,7,5,True,33,3.0,2.0,1,7,1.0,4,2,7.0,2,3,10,8,7,8,7,1,6,7,4,8,8,8,9,6,7,6,13,25,20,10,20,20,5,7,8,8,9,7,False,4,8,6,3,7,7,7,6,0,5,4,3,2
7117,384,False,15,16,14,413,26,2.0,10,10,30,30,5,15,False,8,8,7,8,8,7,8,7,True,23,9.0,2.0,9,6,2.0,5,3,10.0,8,5,7,9,10,10,3,3,7,6,7,10,10,5,6,6,3,8,10,20,20,20,20,20,0,8,8,9,8,9,False,7,10,10,8,10,7,6,8,2,10,7,1,3
7118,274,True,11,5,20,258,25,6.0,7,35,20,20,5,13,False,8,6,5,2,5,4,5,6,True,24,3.0,3.0,3,3,3.0,4,2,2.0,3,3,4,7,6,5,4,5,1,10,7,6,8,9,10,8,7,6,9,30,10,20,20,10,10,7,7,10,9,9,False,7,6,5,6,7,6,8,6,0,4,8,1,2
7119,270,False,11,2,8,280,25,2.0,25,20,30,15,5,5,True,8,7,8,8,7,7,8,4,True,23,10.0,4.0,2,4,2.0,5,1,2.0,8,4,6,9,8,7,8,4,6,9,2,7,5,6,7,6,4,5,11,16,16,19,17,14,18,5,4,7,8,6,True,6,7,8,7,6,6,7,6,0,6,6,1,2


In [11]:
test_df.isnull().any() # per column: True if any value is still missing after the transform

iid         False
gender      False
wave        False
position    False
order       False
pid         False
age_o       False
race_o      False
pf_o_att    False
pf_o_sin    False
pf_o_int    False
pf_o_fun    False
pf_o_amb    False
pf_o_sha    False
dec_o       False
attr_o      False
sinc_o      False
intel_o     False
fun_o       False
amb_o       False
shar_o      False
like_o      False
prob_o      False
met_o       False
age         False
field_cd    False
race        False
imprace     False
imprelig    False
goal        False
date        False
go_out      False
career_c    False
sports      False
tvsports    False
exercise    False
dining      False
museums     False
art         False
hiking      False
gaming      False
clubbing    False
reading     False
tv          False
theater     False
movies      False
concerts    False
music       False
shopping    False
yoga        False
exphappy    False
expnum      False
attr1_1     False
sinc1_1     False
intel1_1    False
fun1_1    

Imputation model transformed `test` data successfully

Encode nominal features using one-hot encoding

In [12]:
# Names of the columns whose dtype is 'category'.
features_nominal = test_df.select_dtypes(include='category').columns.values
# Replace each of them with indicator columns, prefixed with the original column name.
test_df = pd.get_dummies(data=test_df, prefix=features_nominal)
test_df.shape

(7121, 129)

Calculate the average attribute ratings for each subject

In [13]:
# Work on a copy so `test_df` itself is left untouched.
data = test_df.copy()

# Mean of each `*_o` rating column per `iid`; every result is a Series indexed by `iid`.
subject_attractiveness_mean = data.groupby('iid')['attr_o'].mean()
subject_sincerity_mean = data.groupby('iid')['sinc_o'].mean()
subject_intelligence_mean = data.groupby('iid')['intel_o'].mean()
subject_fun_mean = data.groupby('iid')['fun_o'].mean()
subject_ambition_mean = data.groupby('iid')['amb_o'].mean()
subject_shared_interest_mean = data.groupby('iid')['shar_o'].mean()

Insert average attribute ratings into dataframe

In [14]:
# Attach each per-`iid` mean to every row with that `iid`. Merging a named Series
# on 'iid' produces '<column>_x' (the row's own value) and '<column>_y' (the mean);
# the rename restores the original column name and gives the mean its final label.
for _column, _averages, _mean_name in (
    ('attr_o', subject_attractiveness_mean, 'subject_attractiveness_mean'),
    ('sinc_o', subject_sincerity_mean, 'subject_sincerity_mean'),
    ('intel_o', subject_intelligence_mean, 'subject_intelligence_mean'),
    ('fun_o', subject_fun_mean, 'subject_fun_mean'),
    ('amb_o', subject_ambition_mean, 'subject_ambition_mean'),
    ('shar_o', subject_shared_interest_mean, 'subject_shared_interest_mean'),
):
    data = data.merge(right=_averages, how='inner', on='iid').rename(columns={
        _column + '_x': _column,
        _column + '_y': _mean_name,
    })

Calculate difference between subject and partner's ages

In [15]:
# Absolute gap between the `age` and `age_o` columns.
data['age_difference'] = (data['age'] - data['age_o']).abs()

Calculate difference between subject's attribute ratings and partner's attribute ratings

In [16]:
# Absolute gap between each rating column and its `*_o` counterpart.
data['attractiveness_difference'] = data['attr'].sub(data['attr_o']).abs()
data['sincerity_difference'] = data['sinc'].sub(data['sinc_o']).abs()
data['intelligence_difference'] = data['intel'].sub(data['intel_o']).abs()
data['fun_difference'] = data['fun'].sub(data['fun_o']).abs()
data['ambition_difference'] = data['amb'].sub(data['amb_o']).abs()
data['shared_interest_difference'] = data['shar'].sub(data['shar_o']).abs()

Scale normal features to zero mean and unit variance

In [17]:
# Columns to standardise: the `*_o` ratings plus the derived difference columns.
features_normal = [
    'attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o',
    'age_difference',
    'attractiveness_difference', 'sincerity_difference', 'intelligence_difference',
    'fun_difference', 'ambition_difference', 'shared_interest_difference',
]

# Standardise column by column.
# NOTE(review): `preprocessing.scale` uses the mean/std of the frame at hand, not
# statistics saved from model fitting - confirm this is what the loaded models expect.
data[features_normal] = data[features_normal].apply(preprocessing.scale)

Drop some features

In [18]:
# Drop irrelevant features which contain no information about the target variable
features_no_information = [
    'iid',
    'pid',
    'wave',
    'position',
    'order'
]
# Drop features that are known in the future
# ('dec_o' is deliberately left out of this list: it stays in the frame and is the
# column the correlation filter below is computed against)
features_future_information = [
    'dec',
    #'dec_o',
    'like',
    'prob',
    'like_o',
    'prob_o'
]

# Drop features that have low variance
# (the 0.1 threshold is applied to the standard deviation of each column)
feature_variances = data.std().sort_values(ascending=True)
features_low_variance = feature_variances[feature_variances < 0.1].index.values.tolist()

# Drop features that have weak correlation with target variable
# Absolute pairwise correlations, flattened to a Series keyed by (column, column) and sorted descending.
# NOTE(review): unstack() yields both (a, b) and (b, a) with the same value, and
# drop_duplicates() keeps only the first entry of every repeated value. The
# .loc['dec_o'] lookup below therefore only sees pairs whose surviving entry has
# 'dec_o' as the first key, so some features may be missing from the weak-correlation
# filter - verify before changing, since the resulting column set feeds the loaded models.
correlations = data.corr().abs().unstack().sort_values(ascending=False).drop_duplicates()
# Remove entries equal to 1 (e.g. a column paired with itself)
correlations = correlations[correlations != 1]
partner_decision_correlations = correlations.loc['dec_o']
features_weak_correlation = partner_decision_correlations[partner_decision_correlations < 0.1].axes[0].to_list()
# Leave out names that are already scheduled for removal by the two lists above
features_weak_correlation = list(set(features_weak_correlation) - set(features_future_information) - set(features_no_information))

# Drop features that were used in interaction variables
features_interaction = [
    'age',
    'age_o',
]

# Remove everything collected above; `data` is modified in place, so re-running this
# cell on the already reduced frame does not repeat the same operation.
features_remove = features_no_information + features_future_information + features_low_variance + features_weak_correlation + features_interaction
data.drop(columns=features_remove, inplace=True)

In [19]:
# Display the (rows, columns) tuple of the reduced frame.
data.shape

(7121, 59)

In [None]:
# Display the (rows, columns) tuple of `test_df`.
test_df.shape

In [None]:
def preprocessing_pipeline(data):
    """
    Feature-engineering and feature-pruning pipeline for an already imputed,
    one-hot encoded frame.

    Steps, in order:
      1. average each `*_o` rating column per `iid` and attach the result as
         a `subject_*_mean` column;
      2. add the absolute `age` / `age_o` gap and the absolute gap between
         each rating column and its `*_o` counterpart;
      3. standardise the `*_o` rating columns and the gap columns;
      4. drop identifier columns, columns listed as future information,
         columns with a standard deviation below 0.1, columns whose absolute
         correlation with `dec_o` is below 0.1, and `age` / `age_o`.

    Returns the transformed frame. The frame passed in is not modified,
    because the first merge already produces a new object.
    """
    # (rating column, label of its per-`iid` mean)
    mean_columns = [
        ('attr_o', 'subject_attractiveness_mean'),
        ('sinc_o', 'subject_sincerity_mean'),
        ('intel_o', 'subject_intelligence_mean'),
        ('fun_o', 'subject_fun_mean'),
        ('amb_o', 'subject_ambition_mean'),
        ('shar_o', 'subject_shared_interest_mean'),
    ]
    # (new gap column, rating column, `*_o` counterpart)
    difference_columns = [
        ('attractiveness_difference', 'attr', 'attr_o'),
        ('sincerity_difference', 'sinc', 'sinc_o'),
        ('intelligence_difference', 'intel', 'intel_o'),
        ('fun_difference', 'fun', 'fun_o'),
        ('ambition_difference', 'amb', 'amb_o'),
        ('shared_interest_difference', 'shar', 'shar_o'),
    ]

    # Every mean is computed from the incoming frame before any merge takes place
    per_subject_means = [data.groupby('iid')[column].mean() for column, _ in mean_columns]

    # Merging a named Series on 'iid' yields '<column>_x' (row value) and
    # '<column>_y' (mean); the rename restores the original name and labels the mean
    for (column, mean_name), averages in zip(mean_columns, per_subject_means):
        data = data.merge(right=averages, how='inner', on='iid').rename(columns={
            column + '_x': column,
            column + '_y': mean_name,
        })

    # Absolute gaps (computed before the rating columns are standardised)
    data['age_difference'] = (data['age'] - data['age_o']).abs()
    for difference_name, left, right in difference_columns:
        data[difference_name] = (data[left] - data[right]).abs()

    # Standardise the `*_o` ratings and the gap columns, column by column
    features_normal = (
        [column for column, _ in mean_columns]
        + ['age_difference']
        + [difference_name for difference_name, _, _ in difference_columns]
    )
    data[features_normal] = data[features_normal].apply(preprocessing.scale)

    # Columns carrying no information about the target variable
    features_no_information = ['iid', 'pid', 'wave', 'position', 'order']
    # Columns only known in the future ('dec_o' is intentionally not listed)
    features_future_information = ['dec', 'like', 'prob', 'like_o', 'prob_o']

    # Columns whose standard deviation is below 0.1
    spread = data.std().sort_values(ascending=True)
    features_low_variance = spread[spread < 0.1].index.values.tolist()

    # Columns weakly correlated with `dec_o`; the statement order of this chain is kept as is
    pairwise = data.corr().abs().unstack().sort_values(ascending=False).drop_duplicates()
    pairwise = pairwise[pairwise != 1]
    with_partner_decision = pairwise.loc['dec_o']
    weakly_correlated = with_partner_decision[with_partner_decision < 0.1].axes[0].to_list()
    features_weak_correlation = list(set(weakly_correlated) - set(features_future_information) - set(features_no_information))

    # Columns already consumed by the interaction feature
    features_interaction = ['age', 'age_o']

    features_remove = (
        features_no_information
        + features_future_information
        + features_low_variance
        + features_weak_correlation
        + features_interaction
    )
    return data.drop(columns=features_remove)

In [None]:
# Cast every listed column to its target dtype; 'int16' falls back to 'float32'
# while the column still has missing values.
df = df.astype({
    feature: 'float32' if df[feature].isna().any() and datatype == 'int16' else datatype
    for feature, datatype in relevant_features
})

In [None]:
# Run the fitted column transformer and round its output to whole numbers, as the
# executed imputation cell earlier in this notebook does. Without the rounding,
# the integer cast in the following cell would truncate fractional values
# (e.g. 19.57 -> 19) instead of rounding them (19.57 -> 20).
scaled = np.around(col_trans.transform(df))
# Rebuild a labelled frame; assumes the transformer keeps the column order of `df` - TODO confirm.
test_df = pd.DataFrame(scaled, columns=df.columns)
test_df.shape
test_df.head(3)

In [None]:
# Restore the target dtypes on the transformed frame; 'int16' falls back to
# 'float32' while the column still has missing values.
test_df = test_df.astype({
    feature: 'float32' if test_df[feature].isna().any() and datatype == 'int16' else datatype
    for feature, datatype in relevant_features
})

In [None]:
# One-hot encode the nominal features: every column of dtype 'category' is replaced
# by indicator columns prefixed with the original column name.
features_nominal = test_df.select_dtypes(include='category').columns.values
test_df = pd.get_dummies(data=test_df, prefix=features_nominal)
test_df.shape

In [None]:
# Run the feature-engineering / pruning pipeline on the encoded frame and rebind
# `test_df` to its result, so re-running this cell alone feeds the already
# processed frame back into the pipeline.
test_df = preprocessing_pipeline(test_df)
test_df.shape 

In [None]:
# Features are every column except `dec_o`; `dec_o` itself is the target.
X_test = test_df.drop(columns=['dec_o'])
y_test = test_df['dec_o']

In [None]:
# List the columns that remain after the pipeline (this still includes `dec_o`).
test_df.columns

### 1. Testing on Baseline Models

#### 1.1. [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Predict with the loaded logistic-regression model and display the predictions.
# NOTE(review): assumes the columns of `X_test` match those the model was fitted on - confirm.
y_pred = clf_logistic_regression.predict(X_test)
y_pred