In [33]:
#First let's import the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
from IPython.display import display, HTML

pd.set_option('display.max_columns', 500)

In [34]:
#Specifying the Data Path

cwd = os.getcwd()
file_path = os.path.join(cwd, 'cleaned_speed_dating.csv')

In [35]:
df=pd.read_csv(file_path)

In [36]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size = 0.2, random_state=42)

In [37]:
female_df = df.loc[df['gender'] == 0]
male_df = df.loc[df['gender'] == 1]

In [38]:
female_train, female_test = train_test_split(female_df, test_size = 0.2, random_state=42)
male_train, male_test = train_test_split(male_df, test_size = 0.2, random_state=42)

# Logistic Regression

In [39]:
from sklearn import linear_model
lr = linear_model.LogisticRegression()

In [40]:
predictors = ['attr_partner']
lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))

training set performance is 0.727696032781
test set performance is 0.729709605361


In fact, if you only use one variable, that is no different than just doing a cut off based on one of the previous graphs and make naive predictions based on that.

In [41]:
list(df.columns[1:-1])

['gender',
 'age',
 'date',
 'sports',
 'tvsports',
 'exercise',
 'dining',
 'museums',
 'art',
 'hiking',
 'gaming',
 'clubbing',
 'reading',
 'tv',
 'theater',
 'movies',
 'concerts',
 'music',
 'shopping',
 'yoga',
 'attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want',
 'attr_self',
 'sinc_self',
 'fun_self',
 'intel_self',
 'amb_self',
 'pid',
 'age_partner',
 'int_corr',
 'samerace',
 'attr_partner',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner',
 'prob']

In [42]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner']

lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))

training set performance is 0.752281616688
test set performance is 0.754281459419


In [43]:
# You can also see how important each of those factors is (sort of)
print(lr_model.coef_)

[[-0.02260688 -0.04643842  0.02284131 -0.0359313  -0.14527917  0.02399599
   0.24632643 -0.1914495   0.21257761  0.1772327   0.57646054]]


But it does seems like attractiveness is more indicative than anything else. Next you can try to combine them and repeat the same procedure. Is there an improvement to the performance? What can you infer from the coefficients this time? 

In [44]:
predictors = ['gender',
 'data',
 'age',
 'attr_self',
 'sinc_self',
 'fun_self',
 'intel_self',
 'amb_self',
 'age_partner',
 'int_corr',
 'samerace',
 'attr_partner',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner',
 'prob']

predictors = list(df.columns[1:-1])
lr_model = lr.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(lr_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(lr_model.score(test[predictors].values, test['dec'].values)))

training set performance is 0.782082324455
test set performance is 0.765450483991


Try to include also those variables that you think are important and repeat the same step again. Observe what happen to the performances when you add more and more predictors.

Based on the graphs on EDA, it seems that male and female make their decisions quite differently. Try to repeat the above with female_df and male_df and see if the results improve.

# Benchmark

In order to know how good our prediction performance is, we should at least compare it to the performances of some naive algorithms. 

In [50]:
# what if I just look at the training set and guess the most popular decisions?
no_female_train = female_train.query('dec == 0')
yes_female_train = female_train.query('dec == 1')
print('Proportion of rejection by female in training set is {}'\
      .format(yes_female_train.shape[0]/female_train.shape[0]))

no_female_test = female_test.query('dec == 0')
yes_female_test = female_test.query('dec == 1')
print('Proportion of rejection by female in test set is {}'\
      .format(float(yes_female_test.shape[0])/female_test.shape[0]))

Proportion of rejection by female in training set is 0
Proportion of rejection by female in test set is 0.376327769347


In [48]:
# what if I just look at the training set and guess the most popular decisions?
no_male_train = male_train.query('dec == 0')
yes_male_train = male_train.query('dec == 1')
print('Proportion of rejection by male in training set is {}'\
      .format(yes_male_train.shape[0]/male_train.shape[0]))

no_male_test = male_test.query('dec == 0')
yes_male_test = male_test.query('dec == 1')
print('Proportion of rejection by male in test set is {}'\
      .format(yes_male_test.shape[0]/male_test.shape[0]))

Proportion of rejection by male in training set is 0
Proportion of rejection by male in test set is 0


In [49]:
# what if I simply do a cut off at attr_partner and base my decision on that? (refer to graphs plotted in EDA)
male_test['attr_cut_predict'] = (male_test['attr_partner']>=7)
print('male test set performance is {}'\
      .format((male_test['attr_cut_predict'] == male_test['dec']).sum()/male_test.shape[0]))

male test set performance is 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [111]:
female_test['attr_cut_predict'] = (female_test['attr_partner']>=8)
print('female test set performance is {}'\
      .format((female_test['attr_cut_predict'] == female_test['dec']).sum()/female_test.shape[0]))

female test set performance is 0.7329286798179059


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Tree

Tree model allows combination of factors (as opposed to logistic regression model).

In [54]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(min_impurity_split=0.3)

In [55]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner','attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want']
dt_model = dt.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(dt_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(dt_model.score(test[predictors].values, test['dec'].values)))

training set performance is 0.8794933879679643
test set performance is 0.7267311988086373


In [56]:
dt_model.feature_importances_

array([ 0.02697355,  0.01483152,  0.04568818,  0.00881074,  0.01654893,
        0.01588581,  0.03525998,  0.02560737,  0.11024847,  0.05146821,
        0.42485679,  0.05590846,  0.03568541,  0.04481684,  0.03602042,
        0.02073288,  0.03065645])

# Ensemble Methods

In [58]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [59]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner','attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want']
rf_model = rf.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(rf_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(rf_model.score(test[predictors].values, test['dec'].values)))

training set performance is 0.987707208046191
test set performance is 0.7580044676098288


In [73]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth=7)

In [74]:
predictors = ['age','date','int_corr',
 'samerace',
 'sinc_partner',
 'intel_partner',
 'fun_partner',
 'amb_partner',
 'shar_partner','prob', 'attr_partner','attr_want',
 'sinc_want',
 'intel_want',
 'fun_want',
 'amb_want',
 'shar_want']
gb_model = gb.fit(train[predictors].values, train['dec'].values)
print('training set performance is {}'.format(gb_model.score(train[predictors].values, train['dec'].values)))
print('test set performance is {}'.format(gb_model.score(test[predictors].values, test['dec'].values)))

training set performance is 0.979698267833861
test set performance is 0.7944899478778853
