# Speed Dating Data
https://www.kaggle.com/annavictoria/speed-dating-experiment

# 3 Feature Engineering/Pre-processing & Training Data Development

## 3.1  Imports

In [8]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from pandas_profiling import ProfileReport
from scipy import stats

from sb_utils import save_file

In [9]:
# not sure if I need this
import datetime
import unicodedata
import re
from sklearn.preprocessing import scale

## 3.2 Objectives

In the data wrangling notebook, we have identified our target dependent variable as desicion of parner of the specific subject, dec_o (might also consider match, desicion from both the subject and the partner) and cleaned the data accordingly. In this notebook, we will conduct further EDA, hoping to answer following questions.

1. The difference of desirable attributes in a male partner vs female partner.
2. The difference of desirable attributes among  races.
3. The difference of desirable major of male partner vs female partner
4. The difference of desirable majors among races.  

**Learning Objectives**:
1. Understand the importance of creating a model training development data set.
2. Correctly identify when to create dummy features or one-hot encoded features.
3. Understand the importance of magnitude standardization.
4. Apply the train and test split to the development dataset effectively

Since speed dating data is relatively clean we may not need to perform 2&3 pre-processing

Here is possible workflow: TBD
- Use stats.model package for logistic regression model (sloves classification problem): for my model notebook
    - import statsmodels.api as sm (This model  is kind of doing the similar thing as ANOVA)
- Apply this on the whole data set including the dec_o
- Use ‘Speed_Dating_data_cleaned.csv’ from data wrangling output
- Fill the missing data (NaN) with mean to model
- Use PCA to choose features (but will loose interpretability)
- Keep components 0-5 for ~90% var. 
- Use stepwise selection, elastic-net (or L1/L2 regularizers) 
    - Statsmodels should have the code to run this.


## 3.3 Load The Data

In [10]:
# df = pd.read_csv('../data/ski_data_cleaned.csv')
spd = pd.read_csv('spd_data_wrangling_output/Speed_Dating_data_cleaned.csv') #spd1_2 in data wrangling notebook
spd_fp = pd.read_csv('spd_data_wrangling_output/Speed_Dating_data_FemaleRatingMale_cleaned.csv') # spd1_2fp in data wrangling notebook 
spd_mp = pd.read_csv('spd_data_wrangling_output/Speed_Dating_data_MaleRatingFemale_cleaned.csv') # spd1_2mp in data wrangling notebook

In [11]:
spd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6266 entries, 0 to 6265
Data columns (total 24 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   gender    6266 non-null   int64  
 1   match     6266 non-null   int64  
 2   age       6198 non-null   float64
 3   race      6208 non-null   float64
 4   field     6208 non-null   object 
 5   career    6182 non-null   object 
 6   from      6192 non-null   object 
 7   goal      6192 non-null   float64
 8   int_corr  6118 non-null   float64
 9   samerace  6266 non-null   int64  
 10  imprace   6192 non-null   float64
 11  imprelig  6192 non-null   float64
 12  age_o     6189 non-null   float64
 13  race_o    6198 non-null   float64
 14  dec_o     6266 non-null   int64  
 15  attr_o    6127 non-null   float64
 16  sinc_o    6064 non-null   float64
 17  intel_o   6054 non-null   float64
 18  fun_o     5999 non-null   float64
 19  amb_o     5709 non-null   float64
 20  shar_o    5399 non-null   floa

In [12]:
spd.head()

Unnamed: 0,gender,match,age,race,field,career,from,goal,int_corr,samerace,...,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o
0,0,0,21.0,4.0,Law,lawyer,Chicago,2.0,0.14,0,...,0,6.0,8.0,8.0,8.0,8.0,6.0,7.0,4.0,2.0
1,0,0,21.0,4.0,Law,lawyer,Chicago,2.0,0.54,0,...,0,7.0,8.0,10.0,7.0,7.0,5.0,8.0,4.0,2.0
2,0,1,21.0,4.0,Law,lawyer,Chicago,2.0,0.16,1,...,1,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0
3,0,1,21.0,4.0,Law,lawyer,Chicago,2.0,0.61,0,...,1,7.0,8.0,9.0,8.0,9.0,8.0,7.0,7.0,2.0
4,0,1,21.0,4.0,Law,lawyer,Chicago,2.0,0.21,0,...,1,8.0,7.0,9.0,6.0,9.0,7.0,8.0,6.0,2.0


## 3.4 Pre-processing data

### 3.4.1 filling NaN with mean

In [18]:
# make new df
spd_mean = spd.fillna(spd.mean())
spd_fp_mean = spd_fp.fillna(spd_fp.mean())
spd_mp_mean = spd_mp.fillna(spd_mp.mean())

In [19]:
# check for NaN
spd_mean.isna().sum()

gender       0
match        0
age          0
race         0
field       58
career      84
from        74
goal         0
int_corr     0
samerace     0
imprace      0
imprelig     0
age_o        0
race_o       0
dec_o        0
attr_o       0
sinc_o       0
intel_o      0
fun_o        0
amb_o        0
shar_o       0
like_o       0
prob_o       0
met_o        0
dtype: int64

In [23]:
spd_mean.shape

(6266, 24)

In [20]:
spd_fp_mean.isna().sum()

gender       0
match        0
age          0
race         0
field       20
career      20
from        20
goal         0
int_corr     0
samerace     0
imprace      0
imprelig     0
age_o        0
race_o       0
dec_o        0
attr_o       0
sinc_o       0
intel_o      0
fun_o        0
amb_o        0
shar_o       0
like_o       0
prob_o       0
met_o        0
dtype: int64

In [24]:
spd_fp_mean.shape

(3138, 24)

In [21]:
spd_mp_mean.isna().sum()

gender       0
match        0
age          0
race         0
field       38
career      64
from        54
goal         0
int_corr     0
samerace     0
imprace      0
imprelig     0
age_o        0
race_o       0
dec_o        0
attr_o       0
sinc_o       0
intel_o      0
fun_o        0
amb_o        0
shar_o       0
like_o       0
prob_o       0
met_o        0
dtype: int64

In [25]:
spd_mp_mean.shape

(3128, 24)

### 3.4.2 drop columns with NaN

In [32]:
# make new df
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
spd_mean1 = spd_mean.dropna(axis='columns')
spd_fp_mean1 = spd_fp_mean.dropna(axis='columns')
spd_mp_mean1 = spd_fp_mean.dropna(axis='columns')

In [33]:
# check for NaN
spd_mean1.isna().sum()

gender      0
match       0
age         0
race        0
goal        0
int_corr    0
samerace    0
imprace     0
imprelig    0
age_o       0
race_o      0
dec_o       0
attr_o      0
sinc_o      0
intel_o     0
fun_o       0
amb_o       0
shar_o      0
like_o      0
prob_o      0
met_o       0
dtype: int64

In [34]:
spd_mean1.shape

(6266, 21)

In [35]:
spd_fp_mean1.isna().sum()

gender      0
match       0
age         0
race        0
goal        0
int_corr    0
samerace    0
imprace     0
imprelig    0
age_o       0
race_o      0
dec_o       0
attr_o      0
sinc_o      0
intel_o     0
fun_o       0
amb_o       0
shar_o      0
like_o      0
prob_o      0
met_o       0
dtype: int64

In [36]:
spd_fp_mean1.shape

(3138, 21)

In [37]:
spd_mp_mean1.isna().sum()

gender      0
match       0
age         0
race        0
goal        0
int_corr    0
samerace    0
imprace     0
imprelig    0
age_o       0
race_o      0
dec_o       0
attr_o      0
sinc_o      0
intel_o     0
fun_o       0
amb_o       0
shar_o      0
like_o      0
prob_o      0
met_o       0
dtype: int64

In [38]:
spd_mp_mean1.shape

(3138, 21)

### 3.4.3 Extracting more seemingly relevant features

In [47]:
# refer to 'Speed dating_2_EDA_mk'
spd_mean1_mini = spd_mean1.loc[:, 'dec_o':'prob_o']

In [48]:
# check
spd_mean1_mini.isna().sum()

dec_o      0
attr_o     0
sinc_o     0
intel_o    0
fun_o      0
amb_o      0
shar_o     0
like_o     0
prob_o     0
dtype: int64

In [49]:
spd_mean1_mini.shape

(6266, 9)

In [51]:
spd_mean1_mini.describe()

Unnamed: 0,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o
count,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0
mean,0.427705,6.233132,7.223615,7.403204,6.418736,6.826152,5.554269,6.166317,5.233812
std,0.494785,1.912573,1.689306,1.502261,1.910492,1.689681,1.987716,1.826321,2.106286
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.0,6.0,7.0,5.0,6.0,5.0,5.0,4.0
50%,0.0,6.0,7.0,7.403204,6.418736,7.0,5.554269,6.0,5.0
75%,1.0,8.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0
max,1.0,10.5,10.0,10.0,11.0,10.0,10.0,10.0,10.0


In [None]:
### 3.4.4 Scale the whole data

### 3.4.5 Set up input data for logistic Regression Model (X and y)

In [40]:
# spd_mean1
X = spd_mean1.drop(columns='dec_o')
y = spd_mean1['dec_o']

In [41]:
# check
X.shape, y.shape

((6266, 20), (6266,))

In [45]:
X.describe()

Unnamed: 0,gender,match,age,race,goal,int_corr,samerace,imprace,imprelig,age_o,race_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o
count,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0
mean,0.500798,0.170763,26.25718,2.691205,2.152616,0.192596,0.405043,3.804748,3.734981,26.265148,2.690384,6.233132,7.223615,7.403204,6.418736,6.826152,5.554269,6.166317,5.233812,1.956145
std,0.500039,0.376332,3.506465,1.213586,1.421687,0.300382,0.49094,2.865457,2.797904,3.499901,1.212258,1.912573,1.689306,1.502261,1.910492,1.689681,1.987716,1.826321,2.106286,0.258137
min,0.0,0.0,18.0,1.0,1.0,-0.83,0.0,1.0,1.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,24.0,2.0,1.0,-0.01,0.0,1.0,1.0,24.0,2.0,5.0,6.0,7.0,5.0,6.0,5.0,5.0,4.0,2.0
50%,1.0,0.0,26.0,2.0,2.0,0.2,0.0,3.0,3.0,26.0,2.0,6.0,7.0,7.403204,6.418736,7.0,5.554269,6.0,5.0,2.0
75%,1.0,0.0,28.0,4.0,2.0,0.42,1.0,6.0,6.0,28.0,4.0,8.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,2.0
max,1.0,1.0,42.0,6.0,6.0,0.9,1.0,10.0,10.0,42.0,6.0,10.5,10.0,10.0,11.0,10.0,10.0,10.0,10.0,8.0


Need scaling to use X (whole) as input. Will try both methods: 1) extract relavant features from X (scale 10), 2) use whole feature X with scale.

In [54]:
# spd_mean1_mini
Xm = spd_mean1_mini.drop(columns='dec_o')
ym = spd_mean1_mini['dec_o']

In [55]:
# check
Xm.shape, ym.shape

((6266, 8), (6266,))

In [56]:
Xm.describe()

Unnamed: 0,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o
count,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0,6266.0
mean,6.233132,7.223615,7.403204,6.418736,6.826152,5.554269,6.166317,5.233812
std,1.912573,1.689306,1.502261,1.910492,1.689681,1.987716,1.826321,2.106286
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,6.0,7.0,5.0,6.0,5.0,5.0,4.0
50%,6.0,7.0,7.403204,6.418736,7.0,5.554269,6.0,5.0
75%,8.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0
max,10.5,10.0,10.0,11.0,10.0,10.0,10.0,10.0


## 3.5 Training Data Development

### 3.5.1 LogisticRegression via sklearn

#### 3.5.1.1 Use X, y as whole without scaling

In [60]:
# refer to '14.1.2_3_Supervised Learning_FineTuning'
# Import necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Split the data into a training and test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Setup the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Instantiate a logistic regression classifier: logreg
logreg = LogisticRegression()

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the data
logreg_cv.fit(X_train,y_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Tuned Logistic Regression Parameters: {'C': 100000000.0}
Best score is 0.8282119708738058


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

#### 3.5.1.2 Use Xm, ym: extracted relevant features

In [62]:
# refer to '14.1.2_3_Supervised Learning_FineTuning'
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Split the data into a training and test set.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.2, random_state=42)

# Setup the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Instantiate a logistic regression classifier: logreg
logreg = LogisticRegression()

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the data
logreg_cv.fit(Xm_train,ym_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

# apply best estimators to test set
# refer to '14.2.11 Logistic Regression Advanced Case Study_mk'
logreg_best = logreg_cv.best_estimator_
training_accuracy = logreg_best.score(Xm_train, ym_train)
test_accuracy = logreg_best.score(Xm_test, ym_test)
print("Accuracy on training data: {:0.2f}".format(training_accuracy))
print("Accuracy on test data:     {:0.2f}".format(test_accuracy))

Tuned Logistic Regression Parameters: {'C': 0.4393970560760795}
Best score is 0.7711496249773633
Accuracy on training data: 0.77
Accuracy on test data:     0.76


In [None]:
### should build a pipeline!?? fill nan with median..etc
# refer to '14.1.2_4_Supervised Learning with scikit-learn_Preprocessing and Pipeline'

#### 3.5.1.3 Use Xs, ys as whole with scaling

### 3.5.2 LogisticRegression via statsmodels

#### 3.5.2.1 Use Xm, ym: extracted relevant features

In [63]:
# Import the statsmodels module
# refer to 'https://www.geeksforgeeks.org/logistic-regression-using-statsmodels/'
import statsmodels.api as sm
log_reg = sm.Logit(ym_train, Xm_train).fit() #%%%%% how to perform cross validation fro statsmodels and pick best C??

Optimization terminated successfully.
         Current function value: 0.558129
         Iterations 6


In [64]:
# printing the summary table 
print(log_reg.summary()) 

                           Logit Regression Results                           
Dep. Variable:                  dec_o   No. Observations:                 5012
Model:                          Logit   Df Residuals:                     5004
Method:                           MLE   Df Model:                            7
Date:                Sun, 07 Feb 2021   Pseudo R-squ.:                  0.1822
Time:                        00:57:53   Log-Likelihood:                -2797.3
converged:                       True   LL-Null:                       -3420.4
Covariance Type:            nonrobust   LLR p-value:                7.472e-265
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
attr_o         0.2678      0.025     10.737      0.000       0.219       0.317
sinc_o        -0.2946      0.029    -10.325      0.000      -0.351      -0.239
intel_o       -0.3439      0.033    -10.393      0.0

In [65]:
# Import necessary modules
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV

# Split the data into a training and test set.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.2, random_state=42)

# Setup the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Instantiate a logistic regression classifier: logreg
log_reg = sm.Logit(ym_train, Xm_train).fit() 

# Instantiate the GridSearchCV object: logreg_cv
log_reg_cv = GridSearchCV(log_reg, param_grid, cv=5)

# Fit it to the data
log_reg_cv.fit(Xm_train,ym_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(log_reg_cv.best_params_)) 
print("Best score is {}".format(log_reg_cv.best_score_))

Optimization terminated successfully.
         Current function value: 0.558129
         Iterations 6


TypeError: estimator should be an estimator implementing 'fit' method, <statsmodels.discrete.discrete_model.BinaryResultsWrapper object at 0x0000023CA9F053D0> was passed