In [1]:
%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import getcwd
from os.path import join, abspath, pardir, exists
import numpy as np
import pandas as pd

import pickle, json

import matplotlib.pyplot as plt
import seaborn as sns

# plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# scipy
from scipy.stats import ttest_ind, chi2_contingency, boxcox, skew
from scipy.stats.stats import pearsonr

# sklearn libraries
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.impute import KNNImputer, SimpleImputer, MissingIndicator
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector, make_column_transformer, make_column_transformer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier


from sklearn.experimental import enable_iterative_imputer # enable experimental imputer
from sklearn.impute import IterativeImputer               # sample imputation
from sklearn import preprocessing                         # encoders, transformations
from sklearn.model_selection import cross_validate        # cross-validation, model evaluation
from sklearn.model_selection import GridSearchCV          # hyper-parameter tuning
from sklearn.linear_model import LogisticRegression       # logistic regression model
from sklearn.svm import SVC                               # support vector machine model
from sklearn.neighbors import KNeighborsClassifier        # k-nearest neighbours model
from sklearn.ensemble import GradientBoostingClassifier   # gradient boosting model
from sklearn.ensemble import VotingClassifier             # voting ensemble model
from sklearn.ensemble import StackingClassifier           # stacking ensemble model


# statsmodel
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning

# IPython
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell

##### Config settings

In [2]:
# Project directories: the root is two levels above the notebook's working directory
parent_dir = abspath(join(getcwd(), pardir, pardir))
data_dir = join(parent_dir, "data")
model_dir = join(parent_dir, "models")
data_file = join(data_dir, "test.csv")

# IPython: show the output of every top-level expression in a cell (instead of only the last one)
InteractiveShell.ast_node_interactivity = "all"

# pandas: display up to 200 columns (default 20) and up to 200 rows (default 60)
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

#### Helper functions

In [None]:
# [column name, target dtype] pairs consumed by the `astype` cells further down.
# Columns typed 'int16' are cast to 'float32' by those cells when they still contain NaN.
relevant_features = [
    # identifiers / design
    ['iid', 'int16'], ['gender', 'bool'], ['wave', 'int16'], ['position', 'int16'],
    ['order', 'int16'], ['pid', 'int16'],
    # `*_o` columns
    ['age_o', 'int16'], ['race_o', 'category'],
    ['pf_o_att', 'int16'], ['pf_o_sin', 'int16'], ['pf_o_int', 'int16'],
    ['pf_o_fun', 'int16'], ['pf_o_amb', 'int16'], ['pf_o_sha', 'int16'],
    ['dec_o', 'bool'],
    ['attr_o', 'int16'], ['sinc_o', 'int16'], ['intel_o', 'int16'],
    ['fun_o', 'int16'], ['amb_o', 'int16'], ['shar_o', 'int16'],
    ['like_o', 'int16'], ['prob_o', 'int16'], ['met_o', 'bool'],
    # subject profile
    ['age', 'int16'], ['field_cd', 'category'], ['race', 'category'],
    ['imprace', 'int16'], ['imprelig', 'int16'], ['goal', 'category'],
    ['date', 'int16'], ['go_out', 'int16'], ['career_c', 'category'],
    # interests
    ['sports', 'int16'], ['tvsports', 'int16'], ['exercise', 'int16'], ['dining', 'int16'],
    ['museums', 'int16'], ['art', 'int16'], ['hiking', 'int16'], ['gaming', 'int16'],
    ['clubbing', 'int16'], ['reading', 'int16'], ['tv', 'int16'], ['theater', 'int16'],
    ['movies', 'int16'], ['concerts', 'int16'], ['music', 'int16'], ['shopping', 'int16'],
    ['yoga', 'int16'],
    # expectations and self-reported ratings
    ['exphappy', 'int16'], ['expnum', 'int16'],
    ['attr1_1', 'int16'], ['sinc1_1', 'int16'], ['intel1_1', 'int16'],
    ['fun1_1', 'int16'], ['amb1_1', 'int16'], ['shar1_1', 'int16'],
    ['attr3_1', 'int16'], ['sinc3_1', 'int16'], ['fun3_1', 'int16'],
    ['intel3_1', 'int16'], ['amb3_1', 'int16'],
    # columns without the `_o` suffix
    # NOTE(review): 'met' is typed int16 while 'met_o' is typed bool — confirm this is intended
    ['dec', 'bool'],
    ['attr', 'int16'], ['sinc', 'int16'], ['intel', 'int16'],
    ['fun', 'int16'], ['amb', 'int16'], ['shar', 'int16'],
    ['like', 'int16'], ['prob', 'int16'], ['met', 'int16'],
    # remaining columns
    ['match_es', 'int16'], ['satis_2', 'int16'], ['length', 'int16'], ['numdat_2', 'int16'],
]

In [None]:
def save_model(model, file_path: str) -> None:
    """
    Pickle `model` into the file at `file_path`

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_path, mode="wb") as handle:
        pickle.dump(model, handle)

def load_model(file_path: str):
    """
    Read the pickle file at `file_path` and return the object stored in it

    NOTE(review): unpickling executes code embedded in the file, so only
    load files from a trusted source.
    """
    with open(file_path, mode="rb") as handle:
        model = pickle.load(handle)
    return model

def dataframe_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Write `df` to `file_path` in CSV format, leaving out the index column
    """
    df.to_csv(path_or_buf=file_path, index=False)

def plot_distribution(data, bins, title, xlabel, ylabel):
    """
    Draw a histogram of `data` (KDE curve disabled) and label the axes

    `bins` is forwarded to seaborn unchanged; `title`, `xlabel` and `ylabel`
    are applied to the axes seaborn returns. Nothing is returned.

    NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in
    favour of `histplot`/`displot` — consider migrating once the seaborn
    version used by this project is confirmed.
    """
    bar_style = {
        "linewidth": 1,
        'edgecolor': 'black',
        'alpha': 1.0
    }
    axes = sns.distplot(data, bins=bins, hist_kws=bar_style, kde=False)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)

def plot_relationship(x, y, title, xlabel, ylabel):
    """
    Draw a horizontal bar plot of `y` against `x` and label the axes

    `title`, `xlabel` and `ylabel` are applied to the axes seaborn returns.
    Nothing is returned.
    """
    axes = sns.barplot(x=x, y=y, orient='h')
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)

def print_moments(title, feature):
    """
    Print `title` followed by the mean, standard deviation, skewness and
    kurtosis of `feature`, each formatted with two decimals

    `feature` must expose `mean()`, `std()`, `skew()` and `kurtosis()`
    (e.g. a pandas Series).
    """
    print(title)
    # (label, number format, statistic) — each statistic is evaluated right before it is printed
    rows = (
        ('Mean: ', '{:>18.2f}', feature.mean),
        ('Standard deviation: ', '{:.2f}', feature.std),
        ('Skewness: ', '{:>14.2f}', feature.skew),
        ('Kurtosis: ', '{:>14.2f}', feature.kurtosis),
    )
    for label, number_format, statistic in rows:
        print(label + number_format.format(statistic()))

#### Load test data

In [None]:
# Read the raw test set; the file is decoded as ISO-8859-1 rather than pandas' default encoding
df = pd.read_csv(data_file, encoding= 'ISO-8859-1')
# Preview the first rows
df.head()

## Test Analysis

#### Load models

In [None]:
# Load the pickled estimators stored under model_dir, one file per object.
# NOTE(review): load_model unpickles these files, so they must come from a trusted source.
clf_gb = load_model(join(model_dir, "clf_gb.pkl"))
clf_knn = load_model(join(model_dir, "clf_knn.pkl"))
clf_logistic_regression = load_model(join(model_dir, "clf_logistic_regression.pkl"))
clf_stacking = load_model(join(model_dir, "clf_stacking.pkl"))
clf_svc = load_model(join(model_dir, "clf_svc.pkl"))
clf_voting = load_model(join(model_dir, "clf_voting.pkl"))
# Transformer applied to the raw frame via `col_trans.transform(df)` in the cells below
col_trans = load_model(join(model_dir, "col_trans.pkl"))

##### Basic checking

In [None]:
# Does the `dec` column contain any missing value?
df['dec'].isna().any()

# Percentage of missing values per column, largest first, displayed as a single wide row
(
    df.isna()
    .sum()
    .div(len(df))
    .mul(100)
    .to_frame(name='missing %')
    .sort_values(by='missing %', ascending=False)
    .T
)

##### Impute missing values

In [None]:
# Run the loaded column transformer over the raw test frame and round the result to whole numbers.
# NOTE(review): re-labelling with df.columns assumes col_trans returns its output columns in the
# same order as the input frame — confirm against how col_trans was built.
# NOTE(review): test_df is rebuilt from scratch (without rounding) in a later cell, so this result
# only feeds the checks immediately below — confirm this cell is still needed.
scaled = np.around(col_trans.transform(df))
test_df = pd.DataFrame(scaled, columns=df.columns)

# Cast every relevant feature to its target dtype; an 'int16' target falls back to 'float32'
# while the column still contains NaN, because NaN cannot be stored in an integer column.
test_df = test_df.astype({
    feature: 'float32' if datatype == 'int16' and test_df[feature].isna().any() else datatype
    for feature, datatype in relevant_features
})

test_df.shape
# Preview only the first rows instead of rendering the whole frame
test_df.head()

In [None]:
# Per-column flag: True when the column still holds at least one missing value
test_df.isna().any()

Imputation model transformed `test` data successfully

In [None]:
def preprocessing_pipeline(data, min_std=0.1, min_target_correlation=0.1):
    """
    Pre-processing Pipeline for testing data

    Steps, in order:
      1. append, for every `iid`, the mean of each `*_o` rating column
      2. add absolute age / rating difference features
      3. standardise the rating and difference features with `preprocessing.scale`
      4. drop identifier, future-information, low-spread, weakly-correlated
         and interaction-source columns

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the columns referenced below (`iid`, `pid`, `wave`,
        `position`, `order`, `age`, `age_o`, `dec`, `dec_o`, `like`, `prob`,
        `like_o`, `prob_o` and the six rating pairs). The caller's frame is
        not modified.
        NOTE(review): `std()` and `corr()` are called on the whole frame, so
        nominal columns are presumably one-hot encoded by the caller first
        (the in-function encoding was disabled) — confirm.
    min_std : float, default 0.1
        Columns whose standard deviation is below this value are dropped.
    min_target_correlation : float, default 0.1
        Columns whose absolute correlation with `dec_o` is below this value
        are dropped.

    Returns
    -------
    pd.DataFrame
        New frame with the engineered features added and the columns listed
        above removed; `dec_o` is kept unless it is itself low-spread.

    NOTE(review): scaling statistics and the low-spread / weak-correlation
    selections are derived from `data` itself, so the surviving column set
    depends on the frame passed in — confirm it matches the columns the
    models were fitted on.
    """
    # Rating column -> name of the per-`iid` mean column derived from it
    subject_mean_names = {
        'attr_o': 'subject_attractiveness_mean',
        'sinc_o': 'subject_sincerity_mean',
        'intel_o': 'subject_intelligence_mean',
        'fun_o': 'subject_fun_mean',
        'amb_o': 'subject_ambition_mean',
        'shar_o': 'subject_shared_interest_mean',
    }

    # Calculate the average attribute ratings for each subject (all from the untouched input)
    subject_means = [
        data.groupby('iid')[rating].mean().rename(mean_name)
        for rating, mean_name in subject_mean_names.items()
    ]

    # Insert average attribute ratings into dataframe. Each Series is already renamed,
    # so the merge appends one new column and needs no suffix clean-up. The first merge
    # returns a new frame, which keeps the caller's frame untouched.
    for subject_mean in subject_means:
        data = data.merge(right=subject_mean, how='inner', on='iid')

    # New difference feature -> (subject column, partner column)
    difference_sources = {
        'age_difference': ('age', 'age_o'),
        'attractiveness_difference': ('attr', 'attr_o'),
        'sincerity_difference': ('sinc', 'sinc_o'),
        'intelligence_difference': ('intel', 'intel_o'),
        'fun_difference': ('fun', 'fun_o'),
        'ambition_difference': ('amb', 'amb_o'),
        'shared_interest_difference': ('shar', 'shar_o'),
    }

    # Differences must be computed before the ratings are standardised below
    for difference_name, (subject_column, partner_column) in difference_sources.items():
        data[difference_name] = (data[subject_column] - data[partner_column]).abs()

    # Scale normal features to zero mean and unit variance:
    # the six `*_o` ratings followed by the seven difference features
    features_normal = list(subject_mean_names) + list(difference_sources)
    data[features_normal] = data[features_normal].apply(preprocessing.scale)

    # Drop irrelevant features which contain no information about the target variable
    features_no_information = ['iid', 'pid', 'wave', 'position', 'order']

    # Drop features that are known in the future ('dec_o' is the target, so it stays)
    features_future_information = ['dec', 'like', 'prob', 'like_o', 'prob_o']

    # Drop features that have low variance (measured as standard deviation)
    feature_spread = data.std()
    features_low_variance = feature_spread[feature_spread < min_std].index.tolist()

    # Drop features that have weak correlation with target variable.
    # Read the 'dec_o' column of the correlation matrix directly: de-duplicating the
    # unstacked matrix by value keeps only one orientation of each symmetric pair, so
    # selecting the 'dec_o' rows afterwards silently misses some features.
    target_correlations = data.corr()['dec_o'].abs().drop(labels='dec_o')
    already_removed = set(features_future_information) | set(features_no_information)
    features_weak_correlation = [
        feature
        for feature in target_correlations[target_correlations < min_target_correlation].index
        if feature not in already_removed
    ]

    # Drop features that were used in interaction variables
    features_interaction = ['age', 'age_o']

    # A column can qualify for several groups; keep each name once, in first-seen order
    features_remove = list(dict.fromkeys(
        features_no_information
        + features_future_information
        + features_low_variance
        + features_weak_correlation
        + features_interaction
    ))

    return data.drop(columns=features_remove)

In [None]:
# Cast every relevant feature of the raw frame to its target dtype; an 'int16' target
# falls back to 'float32' while the column still contains missing values.
# NOTE(review): 'bool' and 'category' targets are applied even when NaN is present
# (astype('bool') turns NaN into True) — confirm this matches the training notebook.
df = df.astype({
    feature: 'float32' if datatype == 'int16' and df[feature].isna().any() else datatype
    for feature, datatype in relevant_features
})

In [None]:
# Apply the loaded column transformer to the typed frame and wrap the result in a DataFrame.
# NOTE(review): re-labelling with df.columns assumes the transformer keeps the input
# column order — confirm against how col_trans was built.
scaled = col_trans.transform(df)
test_df = pd.DataFrame(data=scaled, columns=df.columns)

test_df.shape
test_df.head(n=3)

In [None]:
# Restore the target dtypes on the transformed frame; an 'int16' target falls back to
# 'float32' while the column still contains missing values.
test_df = test_df.astype({
    feature: 'float32' if datatype == 'int16' and test_df[feature].isna().any() else datatype
    for feature, datatype in relevant_features
})

In [None]:
# One-hot encode the nominal features: every column typed 'category' is replaced by
# indicator columns prefixed with the original column name.
# NOTE(review): the indicator columns depend on the categories present in this frame —
# confirm they line up with the columns used when the models were fitted.
features_nominal = test_df.select_dtypes(include='category').columns.values
test_df = pd.get_dummies(data=test_df, prefix=features_nominal)
test_df.shape

In [None]:
# Feature engineering, scaling and feature removal.
# NOTE(review): test_df is overwritten with the pipeline output, which no longer has
# the 'iid' column the pipeline needs, so this cell cannot be re-run on its own result.
test_df = preprocessing_pipeline(data=test_df)
test_df.shape

In [None]:
# Features are every column except 'dec_o'; 'dec_o' itself is the target
X_test = test_df.drop(columns='dec_o')
y_test = test_df['dec_o']

In [None]:
# List the columns of the preprocessed frame
test_df.columns

### 1. Testing on Baseline Models

#### 1.1. [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Predict a label for every row of X_test with the loaded logistic-regression model
y_pred = clf_logistic_regression.predict(X_test)
# Display the predicted labels
y_pred