# EE379K Final Project
# An Experiment on Speed Dating
### Abigail Johnson, Jeremy Shahan, Raymond Wen, Jacob Williamson

##### I. Some preliminary exploration...

In [121]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

df = pd.read_csv('data/speed_dating.csv', encoding="ISO-8859-1")
print (df.head())
print (df.describe())


   iid   id  gender  idg  condtn  wave  round  position  positin1  order  \
0    1  1.0       0    1       1     1     10         7       NaN      4   
1    1  1.0       0    1       1     1     10         7       NaN      3   
2    1  1.0       0    1       1     1     10         7       NaN     10   
3    1  1.0       0    1       1     1     10         7       NaN      5   
4    1  1.0       0    1       1     1     10         7       NaN      7   

   partner   pid  match  int_corr  samerace  age_o  race_o  pf_o_att  \
0        1  11.0      0      0.14         0   27.0     2.0      35.0   
1        2  12.0      0      0.54         0   22.0     2.0      60.0   
2        3  13.0      1      0.16         1   22.0     4.0      19.0   
3        4  14.0      1      0.61         0   23.0     2.0      30.0   
4        5  15.0      1      0.21         0   24.0     3.0      30.0   

   pf_o_sin  pf_o_int  pf_o_fun  pf_o_amb  pf_o_sha  dec_o  attr_o  sinc_o  \
0      20.0      20.0      20.0 

##### Females: which features are most significant ?

In [17]:
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

input_vars = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob']

# female model
f = df.loc[df.gender == 0, :]
f_data = f.copy()
f_data = f.dropna(subset=input_vars)
f_model = sm.OLS(f_data.dec, sm.add_constant(f_data.loc[:, input_vars]))
f_results = f_model.fit()
print(f_results.params, "\n")
print("P-values (Women) :\n" , f_results.pvalues, "\n" ) 
# all p-values are less than 0.05 which means all of features are significant in the model.

# Now let's rank the features based on their importance in the model using RFE method, for women.
estimator = SVC(kernel= "linear", C=0.1)
selector = RFE(estimator,1)
cfl=selector.fit(f_data.loc[:, input_vars], f_data.dec)
ranking_f=cfl.ranking_
feature_ranking_f=zip(ranking_f, input_vars)
print("Ranked Feature Importance (Women):\n")
list(feature_ranking_f)

const   -0.395832
attr     0.049571
sinc    -0.024828
intel    0.010104
fun      0.020365
amb     -0.020752
shar     0.023560
like     0.061468
prob     0.018177
dtype: float64 

P-values (Women) :
 const    3.024043e-28
attr     3.585211e-24
sinc     4.757680e-06
intel    1.412131e-01
fun      1.578416e-04
amb      5.948697e-05
shar     4.867489e-07
like     5.096468e-21
prob     1.299559e-06
dtype: float64 

Ranked Feature Importance (Women):



[(2, 'attr'),
 (6, 'sinc'),
 (8, 'intel'),
 (3, 'fun'),
 (4, 'amb'),
 (5, 'shar'),
 (1, 'like'),
 (7, 'prob')]

##### Males: which features are most significant ?

In [18]:
# female model
f = df.loc[df.gender == 1, :]
f_data = f.copy()
f_data = f.dropna(subset=input_vars)
f_model = sm.OLS(f_data.dec, sm.add_constant(f_data.loc[:, input_vars]))
f_results = f_model.fit()
print(f_results.params, "\n")
print("P-values (Men) :\n" , f_results.pvalues, "\n" ) 
# all p-values are less than 0.05 which means all of features are significant in the model.

# Now let's rank the features based on their importance in the model using RFE method, for women.
estimator = SVC(kernel= "linear", C=0.1)
selector = RFE(estimator,1)
cfl=selector.fit(f_data.loc[:, input_vars], f_data.dec)
ranking_f=cfl.ranking_
feature_ranking_f=zip(ranking_f, input_vars)
print("Ranked Feature Importance (Men):\n")
list(feature_ranking_f)

const   -0.430886
attr     0.081242
sinc    -0.038042
intel   -0.009979
fun      0.018040
amb     -0.021523
shar     0.011099
like     0.091558
prob     0.025312
dtype: float64 

P-values (Men) :
 const    4.115154e-31
attr     1.052945e-56
sinc     6.215313e-11
intel    1.436562e-01
fun      1.034067e-03
amb      3.929744e-05
shar     1.463562e-02
like     2.415622e-43
prob     6.211989e-11
dtype: float64 

Ranked Feature Importance (Men):



[(2, 'attr'),
 (3, 'sinc'),
 (8, 'intel'),
 (5, 'fun'),
 (6, 'amb'),
 (7, 'shar'),
 (1, 'like'),
 (4, 'prob')]

In [None]:
Good Script:
https://www.kaggle.com/piecurus/d/annavictoria/speed-dating-experiment/a-gender-perspective-of-participants

In [66]:
"""
Standardizes preference scores to correct collection variations introduced by experiment
From data description : 
Waves 1-5 and 10-21: You have 100 points to distribute among the following attributes -- 
give more points to those attributes that are more important in a potential date, 
and fewer points to those attributes that are less important in a potential date.  Total points must equal 100.
Waves 6-9: Please rate the importance of the following attributes in a potential 
date on a scale of 1-10 (1=not at all important, 10=extremely important): 
"""
def standardize_pref_ratings(df):    
    print(df.head())
    
    # TODO: gather list of all preference cols
    # pref attributes : pf_o_att, attr_o
    pref_attr = df.filter( regex='attr|sinc|intel|fun|amb|shar' ).columns.tolist() 
    
    # locate rows that have been scaled 1-10 (waves 6 - 10)
    off = df.loc[ df['wave'].isin( range(6,10 ) ) ]
    print(off.head())
    
    for index, row in off.iterrows():
        row_attr_tot_points = np.sum( [row[pref] for pref in pref_attr ] )
        #print(row_attr_tot_points)
        for pref in pref_attr:
            # scale cell value to relative percentage considering provided preference scores, mult by 100 
            df.loc[index, pref] = ( row[pref] / row_attr_tot_points ) * 100        
    print(df)
    return df

# =========================================================
# Below used as test dataframe for development of above
# =========================================================

d =  {'wave' : pd.Series([ 1, 8, 6, 7 ]) ,
    'attr1_2': pd.Series([25, 2, 1, 5 ]) ,
    'attr4_3': pd.Series([25, 2, 1, 10 ]) ,
    'fun8675309': pd.Series([50, 2, 1, 0 ] )
}

df = pd.DataFrame(d)
df = standardize_pref_ratings(df)


   attr1_2  attr4_3  fun8675309  wave
0       25       25          50     1
1        2        2           2     8
2        1        1           1     6
3        5       10           0     7
   attr1_2  attr4_3  fun8675309  wave
1        2        2           2     8
2        1        1           1     6
3        5       10           0     7
     attr1_2    attr4_3  fun8675309  wave
0  25.000000  25.000000   50.000000     1
1  33.333333  33.333333   33.333333     8
2  33.333333  33.333333   33.333333     6
3  33.333333  66.666667    0.000000     7


In [122]:
df = pd.read_csv('data/speed_dating.csv', encoding="ISO-8859-1")
df = df.dropna(axis=1,how="all")

In [126]:
lookup = pd.DataFrame(index=range(1,553), columns=df.columns.values)

In [150]:
for x in range(1,553):
    lookup.loc[x]['gender']=np.average(df.loc[df["iid"]==x]["gender"])
    lookup.loc[x]['match']=np.average(df.loc[df["iid"]==x]["match"])
    lookup.loc[x]['age']=np.average(df.loc[df["iid"]==x]["age"])
    lookup.loc[x]['career_c']=np.average(df.loc[df["iid"]==x]["career_c"])
    #lookup.loc[x]['income']=np.average(df.loc[df["iid"]==x]["income"])
    lookup.loc[x]['imprace']=np.average(df.loc[df["iid"]==x]["imprace"])
    lookup.loc[x]['imprelig']=np.average(df.loc[df["iid"]==x]["imprelig"])
    lookup.loc[x]['goal']=np.average(df.loc[df["iid"]==x]["goal"])

TypeError: unsupported operand type(s) for /: 'unicode' and 'int'

In [147]:
lookup.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,partner,pid,match,int_corr,samerace,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field,field_cd,undergra,mn_sat,tuition,race,imprace,imprelig,from,zipcode,income,goal,date,go_out,career,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,amb5_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,attr1_s,sinc1_s,intel1_s,fun1_s,amb1_s,shar1_s,attr3_s,sinc3_s,intel3_s,fun3_s,amb3_s,satis_2,length,numdat_2,attr7_2,sinc7_2,intel7_2,fun7_2,amb7_2,shar7_2,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,attr4_2,sinc4_2,intel4_2,fun4_2,amb4_2,shar4_2,attr2_2,sinc2_2,intel2_2,fun2_2,amb2_2,shar2_2,attr3_2,sinc3_2,intel3_2,fun3_2,amb3_2,attr5_2,sinc5_2,intel5_2,fun5_2,amb5_2,you_call,them_cal,date_3,numdat_3,num_in_3,attr1_3,sinc1_3,intel1_3,fun1_3,amb1_3,shar1_3,attr7_3,sinc7_3,intel7_3,fun7_3,amb7_3,shar7_3,attr4_3,sinc4_3,intel4_3,fun4_3,amb4_3,shar4_3,attr2_3,sinc2_3,intel2_3,fun2_3,amb2_3,shar2_3,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
1,,,0,,,,,,,,,,0.4,,,,,,,,,,,,,,,,,,,,,21,,,,,,,,,,,,,,,lawyer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,0,,,,,,,,,,0.2,,,,,,,,,,,,,,,,,,,,,24,,,,,,,,,,,,,,,law,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,0,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,25,,,,,,,,,,,,,,,Economist,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,0,,,,,,,,,,0.2,,,,,,,,,,,,,,,,,,,,,23,,,,,,,,,,,,,,,lawyer/policy work,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,0,,,,,,,,,,0.2,,,,,,,,,,,,,,,,,,,,,21,,,,,,,,,,,,,,,lawyer,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
