# EE379K Final Project
# An Experiment on Speed Dating
### Abigail Johnson, Jeremy Shahan, Raymond Wen, Jacob Williamson

##### I. Some preliminary exploration...

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

df = pd.read_csv('data/speed_dating.csv', encoding="ISO-8859-1")
print (df.head())
print (df.describe())


   iid   id  gender  idg  condtn  wave  round  position  positin1  order  \
0    1  1.0       0    1       1     1     10         7       NaN      4   
1    1  1.0       0    1       1     1     10         7       NaN      3   
2    1  1.0       0    1       1     1     10         7       NaN     10   
3    1  1.0       0    1       1     1     10         7       NaN      5   
4    1  1.0       0    1       1     1     10         7       NaN      7   

   partner   pid  match  int_corr  samerace  age_o  race_o  pf_o_att  \
0        1  11.0      0      0.14         0   27.0     2.0      35.0   
1        2  12.0      0      0.54         0   22.0     2.0      60.0   
2        3  13.0      1      0.16         1   22.0     4.0      19.0   
3        4  14.0      1      0.61         0   23.0     2.0      30.0   
4        5  15.0      1      0.21         0   24.0     3.0      30.0   

   pf_o_sin  pf_o_int  pf_o_fun  pf_o_amb  pf_o_sha  dec_o  attr_o  sinc_o  \
0      20.0      20.0      20.0 



               iid           id       gender          idg       condtn  \
count  8378.000000  8377.000000  8378.000000  8378.000000  8378.000000   
mean    283.675937     8.960248     0.500597    17.327166     1.828837   
std     158.583367     5.491329     0.500029    10.940735     0.376673   
min       1.000000     1.000000     0.000000     1.000000     1.000000   
25%     154.000000          NaN     0.000000     8.000000     2.000000   
50%     281.000000          NaN     1.000000    16.000000     2.000000   
75%     407.000000          NaN     1.000000    26.000000     2.000000   
max     552.000000    22.000000     1.000000    44.000000     2.000000   

              wave        round     position     positin1        order  \
count  8378.000000  8378.000000  8378.000000  6532.000000  8378.000000   
mean     11.350919    16.872046     9.042731     9.295775     8.927668   
std       5.995903     4.358458     5.514939     5.650199     5.477009   
min       1.000000     5.000000     1

##### Females: which features are most significant ?

In [2]:
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

input_vars = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob']

# female model
f = df.loc[df.gender == 0, :]
f_data = f.copy()
f_data = f.dropna(subset=input_vars)
f_model = sm.OLS(f_data.dec, sm.add_constant(f_data.loc[:, input_vars]))
f_results = f_model.fit()
print(f_results.params, "\n")
print("P-values (Women) :\n" , f_results.pvalues, "\n" ) 
# all p-values are less than 0.05 which means all of features are significant in the model.

# Now let's rank the features based on their importance in the model using RFE method, for women.
estimator = SVC(kernel= "linear", C=0.1)
selector = RFE(estimator,1)
cfl=selector.fit(f_data.loc[:, input_vars], f_data.dec)
ranking_f=cfl.ranking_
feature_ranking_f=zip(ranking_f, input_vars)
print("Ranked Feature Importance (Women):\n")
list(feature_ranking_f)

(const   -0.395832
attr     0.049571
sinc    -0.024828
intel    0.010104
fun      0.020365
amb     -0.020752
shar     0.023560
like     0.061468
prob     0.018177
dtype: float64, '\n')
('P-values (Women) :\n', const    3.024043e-28
attr     3.585211e-24
sinc     4.757680e-06
intel    1.412131e-01
fun      1.578416e-04
amb      5.948697e-05
shar     4.867489e-07
like     5.096468e-21
prob     1.299559e-06
dtype: float64, '\n')
Ranked Feature Importance (Women):



[(2, 'attr'),
 (6, 'sinc'),
 (8, 'intel'),
 (3, 'fun'),
 (4, 'amb'),
 (5, 'shar'),
 (1, 'like'),
 (7, 'prob')]

##### Males: which features are most significant ?

In [3]:
# female model
f = df.loc[df.gender == 1, :]
f_data = f.copy()
f_data = f.dropna(subset=input_vars)
f_model = sm.OLS(f_data.dec, sm.add_constant(f_data.loc[:, input_vars]))
f_results = f_model.fit()
print(f_results.params, "\n")
print("P-values (Men) :\n" , f_results.pvalues, "\n" ) 
# all p-values are less than 0.05 which means all of features are significant in the model.

# Now let's rank the features based on their importance in the model using RFE method, for women.
estimator = SVC(kernel= "linear", C=0.1)
selector = RFE(estimator,1)
cfl=selector.fit(f_data.loc[:, input_vars], f_data.dec)
ranking_f=cfl.ranking_
feature_ranking_f=zip(ranking_f, input_vars)
print("Ranked Feature Importance (Men):\n")
list(feature_ranking_f)

(const   -0.430886
attr     0.081242
sinc    -0.038042
intel   -0.009979
fun      0.018040
amb     -0.021523
shar     0.011099
like     0.091558
prob     0.025312
dtype: float64, '\n')
('P-values (Men) :\n', const    4.115154e-31
attr     1.052945e-56
sinc     6.215313e-11
intel    1.436562e-01
fun      1.034067e-03
amb      3.929744e-05
shar     1.463562e-02
like     2.415622e-43
prob     6.211989e-11
dtype: float64, '\n')
Ranked Feature Importance (Men):



[(2, 'attr'),
 (3, 'sinc'),
 (8, 'intel'),
 (5, 'fun'),
 (6, 'amb'),
 (7, 'shar'),
 (1, 'like'),
 (4, 'prob')]

Good Script:
https://www.kaggle.com/piecurus/d/annavictoria/speed-dating-experiment/a-gender-perspective-of-participants

In [5]:
"""
Standardizes preference scores to correct collection variations introduced by experiment
From data description : 
Waves 1-5 and 10-21: You have 100 points to distribute among the following attributes -- 
give more points to those attributes that are more important in a potential date, 
and fewer points to those attributes that are less important in a potential date.  Total points must equal 100.
Waves 6-9: Please rate the importance of the following attributes in a potential 
date on a scale of 1-10 (1=not at all important, 10=extremely important): 
"""
def standardize_pref_ratings(df):    
    print(df.head())
    
    # TODO: gather list of all preference cols
    # pref attributes : pf_o_att, attr_o
    pref_attr = df.filter( regex='attr|sinc|intel|fun|amb|shar' ).columns.tolist() 
    
    # locate rows that have been scaled 1-10 (waves 6 - 10)
    off = df.loc[ df['wave'].isin( range(6,10 ) ) ]
    print(off.head())
    
    for index, row in off.iterrows():
        row_attr_tot_points = np.sum( [row[pref] for pref in pref_attr ] )
        #print(row_attr_tot_points)
        for pref in pref_attr:
            # scale cell value to relative percentage considering provided preference scores, mult by 100 
            df.loc[index, pref] = ( row[pref] / row_attr_tot_points ) * 100        
    print(df)
    return df

# =========================================================
# Below used as test dataframe for development of above
# =========================================================

d =  {'wave' : pd.Series([ 1, 8, 6, 7 ]) ,
    'attr1_2': pd.Series([25, 2, 1, 5 ]) ,
    'attr4_3': pd.Series([25, 2, 1, 10 ]) ,
    'fun8675309': pd.Series([50, 2, 1, 0 ] )
}

df = pd.DataFrame(d)
df = standardize_pref_ratings(df)


   attr1_2  attr4_3  fun8675309  wave
0       25       25          50     1
1        2        2           2     8
2        1        1           1     6
3        5       10           0     7
   attr1_2  attr4_3  fun8675309  wave
1        2        2           2     8
2        1        1           1     6
3        5       10           0     7
   attr1_2  attr4_3  fun8675309  wave
0       25       25          50     1
1        0        0           0     8
2        0        0           0     6
3        0        0           0     7


In [21]:
df = pd.read_csv('data/speed_dating.csv', encoding="ISO-8859-1")
df = df.dropna(axis=1,how="all")

In [22]:
lookup = pd.DataFrame(index=range(1,553), columns=df.columns.values)

In [31]:
for x in range(1,553):
    lookup.loc[x]['gender']=np.average(df.loc[df["iid"]==x]["gender"])
    lookup.loc[x]['match']=np.average(df.loc[df["iid"]==x]["match"])
    lookup.loc[x]['age']=np.average(df.loc[df["iid"]==x]["age"])
    lookup.loc[x]['career_c']=np.average(df.loc[df["iid"]==x]["career_c"])
    lookup.loc[x]['field_cd']=np.average(df.loc[df["iid"]==x]["field_cd"])
    #lookup.loc[x]['income']=np.average(df.loc[df["iid"]==x]["income"])
    lookup.loc[x]['imprace']=np.average(df.loc[df["iid"]==x]["imprace"])
    lookup.loc[x]['imprelig']=np.average(df.loc[df["iid"]==x]["imprelig"])
    lookup.loc[x]['goal']=np.average(df.loc[df["iid"]==x]["goal"])
    lookup.loc[x]['date']=np.average(df.loc[df["iid"]==x]["date"])
    lookup.loc[x]['go_out']=np.average(df.loc[df["iid"]==x]["go_out"])
    lookup.loc[x]['date']=np.average(df.loc[df["iid"]==x]["date"])
    lookup.loc[x]['sports']=np.average(df.loc[df["iid"]==x]["sports"])
    lookup.loc[x]['tvsports']=np.average(df.loc[df["iid"]==x]["tvsports"])
    lookup.loc[x]['exercise']=np.average(df.loc[df["iid"]==x]["exercise"])
    lookup.loc[x]['museums']=np.average(df.loc[df["iid"]==x]["museums"])
    lookup.loc[x]['dining']=np.average(df.loc[df["iid"]==x]["dining"])
    lookup.loc[x]['art']=np.average(df.loc[df["iid"]==x]["art"])
    lookup.loc[x]['hiking']=np.average(df.loc[df["iid"]==x]["hiking"])
    lookup.loc[x]['gaming']=np.average(df.loc[df["iid"]==x]["gaming"])
    lookup.loc[x]['clubbing']=np.average(df.loc[df["iid"]==x]["clubbing"])
    lookup.loc[x]['reading']=np.average(df.loc[df["iid"]==x]["reading"])
    lookup.loc[x]['tv']=np.average(df.loc[df["iid"]==x]["tv"])
    lookup.loc[x]['theater']=np.average(df.loc[df["iid"]==x]["theater"])
    lookup.loc[x]['movies']=np.average(df.loc[df["iid"]==x]["movies"])
    lookup.loc[x]['concerts']=np.average(df.loc[df["iid"]==x]["concerts"])
    lookup.loc[x]['music']=np.average(df.loc[df["iid"]==x]["music"])
    lookup.loc[x]['shopping']=np.average(df.loc[df["iid"]==x]["shopping"])
    lookup.loc[x]['yoga']=np.average(df.loc[df["iid"]==x]["yoga"])
    lookup.loc[x]['attr3_1']=np.average(df.loc[df["iid"]==x]["attr3_1"])
    lookup.loc[x]['sinc3_1']=np.average(df.loc[df["iid"]==x]["sinc3_1"])
    lookup.loc[x]['intel3_1']=np.average(df.loc[df["iid"]==x]["intel3_1"])
    lookup.loc[x]['fun3_1']=np.average(df.loc[df["iid"]==x]["fun3_1"])
    lookup.loc[x]['amb3_1']=np.average(df.loc[df["iid"]==x]["amb3_1"])
    lookup.loc[x]['attr5_1']=np.average(df.loc[df["iid"]==x]["attr5_1"])
    lookup.loc[x]['sinc5_1']=np.average(df.loc[df["iid"]==x]["sinc5_1"])
    lookup.loc[x]['intel5_1']=np.average(df.loc[df["iid"]==x]["intel5_1"])
    lookup.loc[x]['fun5_1']=np.average(df.loc[df["iid"]==x]["fun5_1"])
    lookup.loc[x]['attr_o']=np.nanmean(df.loc[df["iid"]==x]["attr_o"])
    lookup.loc[x]['sinc_o']=np.nanmean(df.loc[df["iid"]==x]["sinc_o"])
    lookup.loc[x]['intel_o']=np.nanmean(df.loc[df["iid"]==x]["intel_o"])
    lookup.loc[x]['fun_o']=np.nanmean(df.loc[df["iid"]==x]["fun_o"])
    lookup.loc[x]['amb_o']=np.nanmean(df.loc[df["iid"]==x]["amb_o"])
    lookup.loc[x]['shar_o']=np.nanmean(df.loc[df["iid"]==x]["shar_o"])
    



In [33]:
lookup=lookup.dropna(axis=1,how="all")
lookup.head()

Unnamed: 0,gender,match,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,field_cd,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1
1,0,0.4,6.7,7.4,8.0,7.2,8.0,7.1,21,1,2,4,2,7,1,,9,2,8,9,1,1,5,1,5,6,9,1,10,10,9,8,1,6,8,8,8,7,,,,
2,0,0.2,7.7,7.1,7.9,7.5,7.5,6.5,24,1,2,5,1,5,1,,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,7,5,10,8,3,,,,
3,0,0.0,6.5,7.1,7.3,6.2,7.11111,6.0,25,2,8,4,6,3,1,,3,8,7,8,5,5,8,4,5,7,8,7,7,7,5,8,7,8,9,8,9,8,,,,
4,0,0.2,7.0,7.1,7.7,7.5,7.7,7.2,23,1,1,1,1,5,1,1.0,1,1,6,7,6,7,7,5,7,7,7,9,7,8,7,1,8,7,8,9,7,8,,,,
5,0,0.2,5.3,7.7,7.6,7.2,7.8,6.2,21,1,8,1,2,4,1,1.0,7,4,7,7,6,8,6,6,8,6,8,6,6,3,7,8,3,6,3,6,10,8,,,,
