In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from keras.layers import Dense, Input, Dropout
from keras import Sequential
from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

Using TensorFlow backend.


In [2]:
# Load the dataset from the CSV stored under ../datasets.
df = pd.read_csv(filepath_or_buffer='../datasets/rec_df.csv')

In [3]:
# Count nulls per column, then show the five columns with the most.
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False).head()

exphappy_w    81
shar1_1_w     81
amb3_1_w      67
intel3_1_w    67
fun3_1_w      67
dtype: int64

In [4]:
# Drop every row that contains at least one null (85 rows according to the
# original note — TODO confirm against the row count before/after).
# Rebinding instead of `inplace=True` avoids mutating the loaded frame
# behind the back of any earlier reference to it.
df = df.dropna()

In [5]:
# (rows, columns) of the frame after the null rows were dropped.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(3999, 84)

In [None]:
# Column names of the '<trait>_o_<w|m>' family: the five traits for the
# '_w' suffix first, then the same five for the '_m' suffix.
obj_traits = [
    trait + '_o_' + suffix
    for suffix in ('w', 'm')
    for trait in ('attr', 'fun', 'intel', 'sinc', 'amb')
]

In [6]:
# Feature matrix: every column except the target and the two decision columns.
# NOTE(review): dec_m / dec_f presumably are the individual decisions that
# determine `match` and would leak the target — confirm against the data dictionary.
# The labels must be passed as one list via `columns=`; as separate positional
# arguments 'dec_m' would be interpreted as `axis` and the call would raise.
X = df.drop(columns=['match', 'dec_m', 'dec_f'])
# Target: the `match` column.
y = df['match']

In [7]:
# Seeded split, stratified on the target so both parts keep the same
# class proportions.
split_options = dict(random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
ss = StandardScaler().fit(X_train)
X_train_ss, X_test_ss = ss.transform(X_train), ss.transform(X_test)

In [9]:
# Degree-2 expansion restricted to interaction terms, without a bias column.
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_pf = pf.fit_transform(X)

# Split the expanded matrix with the same seed and stratification as the
# split of the raw features.
pf_split = train_test_split(X_pf, y, random_state=42, stratify=y)
X_train_pf, X_test_pf, y_train_pf, y_test_pf = pf_split

In [10]:
# Standardise the polynomial-feature split. The scaler must be fitted on the
# expanded training matrix (X_train_pf); fitting on the raw X_train would just
# reproduce X_train_ss and leave the interaction features unscaled.
ss_pf = StandardScaler()
X_train_ss_pf = ss_pf.fit_transform(X_train_pf)
X_test_ss_pf = ss_pf.transform(X_test_pf)

### Logistic Regression

In [11]:
# Logistic-regression pipeline: interaction-only polynomial features ->
# standardisation -> LogisticRegressionCV (liblinear solver, fixed seed).
lr_steps = [
    ('pf', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('ss', StandardScaler()),
    ('lr', LogisticRegressionCV(random_state=42, solver='liblinear')),
]
pipe_lr = Pipeline(steps=lr_steps)

In [12]:
# Search space for the 'lr' pipeline step: regularisation penalty and the
# iteration cap.
lr_param_grid = dict(
    lr__penalty=['l1', 'l2'],
    lr__max_iter=[100, 200, 500, 1000, 2500, 5000, 10000],
)

In [13]:
# 3-fold grid search over the logistic-regression pipeline, fitted on the
# raw training split (the pipeline does its own expansion and scaling).
gs_lr = GridSearchCV(estimator=pipe_lr,
                     param_grid=lr_param_grid,
                     cv=3,
                     verbose=0)

gs_lr.fit(X_train, y_train)





GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pf', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, i...te=42,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'lr__penalty': ['l1', 'l2'], 'lr__max_iter': [100, 200, 500, 1000, 2500, 5000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
# (train score, test score) of the fitted logistic-regression grid search.
lr_train_score = gs_lr.score(X_train, y_train)
lr_test_score = gs_lr.score(X_test, y_test)
(lr_train_score, lr_test_score)

(0.8366122040680227, 0.836)

### K-Nearest Neighbors

In [15]:
# KNN pipeline: polynomial features (no bias column) -> standardisation ->
# k-nearest-neighbours classifier.
knn_steps = [
    ('pf', PolynomialFeatures(include_bias=False)),
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(steps=knn_steps)

In [16]:
# Search space for the 'knn' pipeline step: neighbourhood size and the
# Minkowski power parameter `p`.
knn_param_grid = dict(
    knn__n_neighbors=[5, 10, 20, 35, 50, 75, 100, 125],
    knn__p=[1, 2],
)

In [17]:
# 3-fold grid search over the KNN pipeline, fitted on the raw training split.
gs_knn = GridSearchCV(estimator=knn_pipeline,
                      param_grid=knn_param_grid,
                      cv=3,
                      verbose=0)
gs_knn.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pf', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'knn__n_neighbors': [5, 10, 20, 35, 50, 75, 100, 125], 'knn__p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
# (train score, test score) of the fitted KNN grid search.
knn_train_score = gs_knn.score(X_train, y_train)
knn_test_score = gs_knn.score(X_test, y_test)
(knn_train_score, knn_test_score)

(0.8489496498832945, 0.839)

### Neural Network

In [28]:
def create_model(optimizer='adam'):
    """Build and compile a small binary classifier.

    The network has one hidden Dense layer of 20 ReLU units followed by a
    single sigmoid output unit. It is compiled with binary cross-entropy
    loss and tracks accuracy.

    Parameters
    ----------
    optimizer : optimizer passed to ``model.compile`` (default ``'adam'``).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network_layers = [
        Dense(20, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(network_layers)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Pipeline: interaction-only polynomial features -> standardisation ->
# Keras classifier built by create_model.
estimators = [
    ('pf', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('ss', StandardScaler()),
    ('nn', KerasClassifier(build_fn=create_model,
                           batch_size=5,
                           epochs=10,
                           verbose=0)),
]
nn_pipe = Pipeline(steps=estimators)

# Grid over the number of epochs and the batch size of the 'nn' step.
nn_param_grid = dict(
    nn__epochs=[2, 3, 5, 10],
    nn__batch_size=[1, 3, 5],
)

# 3-fold grid search fitted on the raw training split.
gs_nn = GridSearchCV(estimator=nn_pipe,
                     param_grid=nn_param_grid,
                     cv=3,
                     verbose=0)
gs_nn.fit(X_train, y_train)

# (train score, test score) of the fitted grid search.
nn_train_score = gs_nn.score(X_train, y_train)
nn_test_score = gs_nn.score(X_test, y_test)
(nn_train_score, nn_test_score)

(0.8372790930310103, 0.836)

In [29]:
def create_model(optimizer='adam',
                 dropout=0.1):
    """Build and compile a small binary classifier with dropout.

    The network has one hidden Dense layer of 20 ReLU units, a Dropout
    layer, and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss and tracks accuracy.

    Parameters
    ----------
    optimizer : optimizer passed to ``model.compile`` (default ``'adam'``).
    dropout : rate handed to the ``Dropout`` layer (default ``0.1``).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network_layers = [
        Dense(20, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(network_layers)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Pipeline: interaction-only polynomial features -> standardisation ->
# Keras classifier built by create_model.
estimators = [
    ('pf', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('ss', StandardScaler()),
    ('nn', KerasClassifier(build_fn=create_model,
                           batch_size=5,
                           epochs=10,
                           verbose=0)),
]
nn_pipe = Pipeline(steps=estimators)

# Grid over the number of epochs and the batch size of the 'nn' step.
nn_param_grid = dict(
    nn__epochs=[2, 3, 5, 10],
    nn__batch_size=[1, 3, 5],
)

# 3-fold grid search fitted on the raw training split.
gs_nn = GridSearchCV(estimator=nn_pipe,
                     param_grid=nn_param_grid,
                     cv=3,
                     verbose=0)
gs_nn.fit(X_train, y_train)

# (train score, test score) of the fitted grid search.
nn_train_score = gs_nn.score(X_train, y_train)
nn_test_score = gs_nn.score(X_test, y_test)
(nn_train_score, nn_test_score)

(0.8409469823274425, 0.834)

In [30]:
def create_model(optimizer='adam',
                 dropout=0.3):
    """Build and compile a small binary classifier with dropout.

    The network has one hidden Dense layer of 20 ReLU units, a Dropout
    layer, and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss and tracks accuracy.

    Parameters
    ----------
    optimizer : optimizer passed to ``model.compile`` (default ``'adam'``).
    dropout : rate handed to the ``Dropout`` layer (default ``0.3``).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network_layers = [
        Dense(20, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(network_layers)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Pipeline: interaction-only polynomial features -> standardisation ->
# Keras classifier built by create_model.
estimators = [
    ('pf', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('ss', StandardScaler()),
    ('nn', KerasClassifier(build_fn=create_model,
                           batch_size=5,
                           epochs=10,
                           verbose=0)),
]
nn_pipe = Pipeline(steps=estimators)

# Grid over the number of epochs and the batch size of the 'nn' step.
nn_param_grid = dict(
    nn__epochs=[2, 3, 5, 10],
    nn__batch_size=[1, 3, 5],
)

# 3-fold grid search fitted on the raw training split.
gs_nn = GridSearchCV(estimator=nn_pipe,
                     param_grid=nn_param_grid,
                     cv=3,
                     verbose=0)
gs_nn.fit(X_train, y_train)

# (train score, test score) of the fitted grid search.
nn_train_score = gs_nn.score(X_train, y_train)
nn_test_score = gs_nn.score(X_test, y_test)
(nn_train_score, nn_test_score)

(0.8406135378459486, 0.829)

In [None]:
def create_model(optimizer='adam',
                 dropout=0.5):
    """Build and compile a small binary classifier with dropout.

    The network has one hidden Dense layer of 20 ReLU units, a Dropout
    layer, and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss and tracks accuracy.

    Parameters
    ----------
    optimizer : optimizer passed to ``model.compile`` (default ``'adam'``).
    dropout : rate handed to the ``Dropout`` layer (default ``0.5``).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network_layers = [
        Dense(20, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(network_layers)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Pipeline: interaction-only polynomial features -> standardisation ->
# Keras classifier built by create_model.
estimators = [
    ('pf', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('ss', StandardScaler()),
    ('nn', KerasClassifier(build_fn=create_model,
                           batch_size=5,
                           epochs=10,
                           verbose=0)),
]
nn_pipe = Pipeline(steps=estimators)

# Grid over the number of epochs and the batch size of the 'nn' step.
nn_param_grid = dict(
    nn__epochs=[2, 3, 5, 10],
    nn__batch_size=[1, 3, 5],
)

# 3-fold grid search fitted on the raw training split.
gs_nn = GridSearchCV(estimator=nn_pipe,
                     param_grid=nn_param_grid,
                     cv=3,
                     verbose=0)
gs_nn.fit(X_train, y_train)

# (train score, test score) of the fitted grid search.
nn_train_score = gs_nn.score(X_train, y_train)
nn_test_score = gs_nn.score(X_test, y_test)
(nn_train_score, nn_test_score)