# EE379K Final Project
# An Experiment on Speed Dating
### Abigail Johnson, Jeremy Shahan, Raymond Wen, Jacob Williamson

In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.regression.linear_model as regression
import numpy as np
import pandas as pd

##### I. Some preliminary exploration...

In [2]:
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
df = pd.read_csv('data/speed_dating.csv', encoding="ISO-8859-1")
print (df.head())
print (df.describe())


   iid   id  gender  idg  condtn  wave  round  position  positin1  order  \
0    1  1.0       0    1       1     1     10         7       NaN      4   
1    1  1.0       0    1       1     1     10         7       NaN      3   
2    1  1.0       0    1       1     1     10         7       NaN     10   
3    1  1.0       0    1       1     1     10         7       NaN      5   
4    1  1.0       0    1       1     1     10         7       NaN      7   

   partner   pid  match  int_corr  samerace  age_o  race_o  pf_o_att  \
0        1  11.0      0      0.14         0   27.0     2.0      35.0   
1        2  12.0      0      0.54         0   22.0     2.0      60.0   
2        3  13.0      1      0.16         1   22.0     4.0      19.0   
3        4  14.0      1      0.61         0   23.0     2.0      30.0   
4        5  15.0      1      0.21         0   24.0     3.0      30.0   

   pf_o_sin  pf_o_int  pf_o_fun  pf_o_amb  pf_o_sha  dec_o  attr_o  sinc_o  \
0      20.0      20.0      20.0 



               iid           id       gender          idg       condtn  \
count  8378.000000  8377.000000  8378.000000  8378.000000  8378.000000   
mean    283.675937     8.960248     0.500597    17.327166     1.828837   
std     158.583367     5.491329     0.500029    10.940735     0.376673   
min       1.000000     1.000000     0.000000     1.000000     1.000000   
25%     154.000000          NaN     0.000000     8.000000     2.000000   
50%     281.000000          NaN     1.000000    16.000000     2.000000   
75%     407.000000          NaN     1.000000    26.000000     2.000000   
max     552.000000    22.000000     1.000000    44.000000     2.000000   

              wave        round     position     positin1        order  \
count  8378.000000  8378.000000  8378.000000  6532.000000  8378.000000   
mean     11.350919    16.872046     9.042731     9.295775     8.927668   
std       5.995903     4.358458     5.514939     5.650199     5.477009   
min       1.000000     5.000000     1

##### Females: which features are most significant ?

In [3]:
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

input_vars = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob']

# female model
f = df.loc[df.gender == 0, :]
f_data = f.copy()
f_data = f.dropna(subset=input_vars)
f_model = sm.OLS(f_data.dec, sm.add_constant(f_data.loc[:, input_vars]))
f_results = f_model.fit()
print(f_results.params, "\n")
print("P-values (Women) :\n" , f_results.pvalues, "\n" ) 
# all p-values are less than 0.05 which means all of features are significant in the model.

# Now let's rank the features based on their importance in the model using RFE method, for women.
estimator = SVC(kernel= "linear", C=0.1)
selector = RFE(estimator,1)
cfl=selector.fit(f_data.loc[:, input_vars], f_data.dec)
ranking_f=cfl.ranking_
feature_ranking_f=zip(ranking_f, input_vars)
print("Ranked Feature Importance (Women):\n")
list(feature_ranking_f)

(const   -0.395832
attr     0.049571
sinc    -0.024828
intel    0.010104
fun      0.020365
amb     -0.020752
shar     0.023560
like     0.061468
prob     0.018177
dtype: float64, '\n')
('P-values (Women) :\n', const    3.024043e-28
attr     3.585211e-24
sinc     4.757680e-06
intel    1.412131e-01
fun      1.578416e-04
amb      5.948697e-05
shar     4.867489e-07
like     5.096468e-21
prob     1.299559e-06
dtype: float64, '\n')
Ranked Feature Importance (Women):



[(2, 'attr'),
 (6, 'sinc'),
 (8, 'intel'),
 (3, 'fun'),
 (4, 'amb'),
 (5, 'shar'),
 (1, 'like'),
 (7, 'prob')]

##### Males: which features are most significant ?

In [4]:
# female model
f = df.loc[df.gender == 1, :]
f_data = f.copy()
f_data = f.dropna(subset=input_vars)
f_model = sm.OLS(f_data.dec, sm.add_constant(f_data.loc[:, input_vars]))
f_results = f_model.fit()
print(f_results.params, "\n")
print("P-values (Men) :\n" , f_results.pvalues, "\n" ) 
# all p-values are less than 0.05 which means all of features are significant in the model.

# Now let's rank the features based on their importance in the model using RFE method, for women.
estimator = SVC(kernel= "linear", C=0.1)
selector = RFE(estimator,1)
cfl=selector.fit(f_data.loc[:, input_vars], f_data.dec)
ranking_f=cfl.ranking_
feature_ranking_f=zip(ranking_f, input_vars)
print("Ranked Feature Importance (Men):\n")
list(feature_ranking_f)

(const   -0.430886
attr     0.081242
sinc    -0.038042
intel   -0.009979
fun      0.018040
amb     -0.021523
shar     0.011099
like     0.091558
prob     0.025312
dtype: float64, '\n')
('P-values (Men) :\n', const    4.115154e-31
attr     1.052945e-56
sinc     6.215313e-11
intel    1.436562e-01
fun      1.034067e-03
amb      3.929744e-05
shar     1.463562e-02
like     2.415622e-43
prob     6.211989e-11
dtype: float64, '\n')
Ranked Feature Importance (Men):



[(2, 'attr'),
 (3, 'sinc'),
 (8, 'intel'),
 (5, 'fun'),
 (6, 'amb'),
 (7, 'shar'),
 (1, 'like'),
 (4, 'prob')]

Good Script:
https://www.kaggle.com/piecurus/d/annavictoria/speed-dating-experiment/a-gender-perspective-of-participants

In [5]:
"""
Standardizes preference scores to correct collection variations introduced by experiment
From data description : 
Waves 1-5 and 10-21: You have 100 points to distribute among the following attributes -- 
give more points to those attributes that are more important in a potential date, 
and fewer points to those attributes that are less important in a potential date.  Total points must equal 100.
Waves 6-9: Please rate the importance of the following attributes in a potential 
date on a scale of 1-10 (1=not at all important, 10=extremely important): 
"""
def standardize_pref_ratings(df):    
    print(df.head())
    
    # TODO: gather list of all preference cols
    # pref attributes : pf_o_att, attr_o
    pref_attr = df.filter( regex='attr|sinc|intel|fun|amb|shar' ).columns.tolist() 
    
    # locate rows that have been scaled 1-10 (waves 6 - 10)
    off = df.loc[ df['wave'].isin( range(6,10 ) ) ]
    print(off.head())
    
    for index, row in off.iterrows():
        row_attr_tot_points = np.sum( [row[pref] for pref in pref_attr ] )
        #print(row_attr_tot_points)
        for pref in pref_attr:
            # scale cell value to relative percentage considering provided preference scores, mult by 100 
            df.loc[index, pref] = ( row[pref] / row_attr_tot_points ) * 100        
    return df

# =========================================================
# Below used as test dataframe for development of above
# =========================================================

d =  {'wave' : pd.Series([ 1, 8, 6, 7 ]) ,
    'attr1_2': pd.Series([25, 2, 1, 5 ]) ,
    'attr4_3': pd.Series([25, 2, 1, 10 ]) ,
    'fun8675309': pd.Series([50, 2, 1, 0 ] )
}

df = pd.DataFrame(d)
df = standardize_pref_ratings(df)


   attr1_2  attr4_3  fun8675309  wave
0       25       25          50     1
1        2        2           2     8
2        1        1           1     6
3        5       10           0     7
   attr1_2  attr4_3  fun8675309  wave
1        2        2           2     8
2        1        1           1     6
3        5       10           0     7


In [6]:
df = pd.read_csv('data/speed_dating.csv', encoding="ISO-8859-1")
df = df.dropna(axis=1,how="all")
df=standardize_pref_ratings(df)

   iid   id  gender  idg  condtn  wave  round  position  positin1  order  \
0    1  1.0       0    1       1     1     10         7       NaN      4   
1    1  1.0       0    1       1     1     10         7       NaN      3   
2    1  1.0       0    1       1     1     10         7       NaN     10   
3    1  1.0       0    1       1     1     10         7       NaN      5   
4    1  1.0       0    1       1     1     10         7       NaN      7   

   partner   pid  match  int_corr  samerace  age_o  race_o  pf_o_att  \
0        1  11.0      0      0.14         0   27.0     2.0      35.0   
1        2  12.0      0      0.54         0   22.0     2.0      60.0   
2        3  13.0      1      0.16         1   22.0     4.0      19.0   
3        4  14.0      1      0.61         0   23.0     2.0      30.0   
4        5  15.0      1      0.21         0   24.0     3.0      30.0   

   pf_o_sin  pf_o_int  pf_o_fun  pf_o_amb  pf_o_sha  dec_o  attr_o  sinc_o  \
0      20.0      20.0      20.0 

In [7]:
lookup = pd.DataFrame(index=range(1,553), columns=df.columns.values)

In [8]:
for x in range(1,553):
    lookup.loc[x]['gender']=np.average(df.loc[df["iid"]==x]["gender"])
    lookup.loc[x]['match']=np.average(df.loc[df["iid"]==x]["match"])
    lookup.loc[x]['age']=np.average(df.loc[df["iid"]==x]["age"])
    lookup.loc[x]['career_c']=np.average(df.loc[df["iid"]==x]["career_c"])
    lookup.loc[x]['field_cd']=np.average(df.loc[df["iid"]==x]["field_cd"])
    #lookup.loc[x]['income']=np.average(df.loc[df["iid"]==x]["income"])
    lookup.loc[x]['imprace']=np.average(df.loc[df["iid"]==x]["imprace"])
    lookup.loc[x]['imprelig']=np.average(df.loc[df["iid"]==x]["imprelig"])
    lookup.loc[x]['goal']=np.average(df.loc[df["iid"]==x]["goal"])
    lookup.loc[x]['date']=np.average(df.loc[df["iid"]==x]["date"])
    lookup.loc[x]['go_out']=np.average(df.loc[df["iid"]==x]["go_out"])
    lookup.loc[x]['date']=np.average(df.loc[df["iid"]==x]["date"])
    lookup.loc[x]['sports']=np.average(df.loc[df["iid"]==x]["sports"])
    lookup.loc[x]['tvsports']=np.average(df.loc[df["iid"]==x]["tvsports"])
    lookup.loc[x]['exercise']=np.average(df.loc[df["iid"]==x]["exercise"])
    lookup.loc[x]['museums']=np.average(df.loc[df["iid"]==x]["museums"])
    lookup.loc[x]['dining']=np.average(df.loc[df["iid"]==x]["dining"])
    lookup.loc[x]['art']=np.average(df.loc[df["iid"]==x]["art"])
    lookup.loc[x]['hiking']=np.average(df.loc[df["iid"]==x]["hiking"])
    lookup.loc[x]['gaming']=np.average(df.loc[df["iid"]==x]["gaming"])
    lookup.loc[x]['clubbing']=np.average(df.loc[df["iid"]==x]["clubbing"])
    lookup.loc[x]['reading']=np.average(df.loc[df["iid"]==x]["reading"])
    lookup.loc[x]['tv']=np.average(df.loc[df["iid"]==x]["tv"])
    lookup.loc[x]['theater']=np.average(df.loc[df["iid"]==x]["theater"])
    lookup.loc[x]['movies']=np.average(df.loc[df["iid"]==x]["movies"])
    lookup.loc[x]['concerts']=np.average(df.loc[df["iid"]==x]["concerts"])
    lookup.loc[x]['music']=np.average(df.loc[df["iid"]==x]["music"])
    lookup.loc[x]['shopping']=np.average(df.loc[df["iid"]==x]["shopping"])
    lookup.loc[x]['yoga']=np.average(df.loc[df["iid"]==x]["yoga"])
    lookup.loc[x]['attr3_1']=np.average(df.loc[df["iid"]==x]["attr3_1"])
    lookup.loc[x]['sinc3_1']=np.average(df.loc[df["iid"]==x]["sinc3_1"])
    lookup.loc[x]['intel3_1']=np.average(df.loc[df["iid"]==x]["intel3_1"])
    lookup.loc[x]['fun3_1']=np.average(df.loc[df["iid"]==x]["fun3_1"])
    lookup.loc[x]['amb3_1']=np.average(df.loc[df["iid"]==x]["amb3_1"])
    lookup.loc[x]['attr5_1']=np.average(df.loc[df["iid"]==x]["attr5_1"])
    lookup.loc[x]['sinc5_1']=np.average(df.loc[df["iid"]==x]["sinc5_1"])
    lookup.loc[x]['intel5_1']=np.average(df.loc[df["iid"]==x]["intel5_1"])
    lookup.loc[x]['fun5_1']=np.average(df.loc[df["iid"]==x]["fun5_1"])
    lookup.loc[x]['attr_o']=np.nanmean(df.loc[df["iid"]==x]["attr_o"])
    lookup.loc[x]['sinc_o']=np.nanmean(df.loc[df["iid"]==x]["sinc_o"])
    lookup.loc[x]['intel_o']=np.nanmean(df.loc[df["iid"]==x]["intel_o"])
    lookup.loc[x]['fun_o']=np.nanmean(df.loc[df["iid"]==x]["fun_o"])
    lookup.loc[x]['amb_o']=np.nanmean(df.loc[df["iid"]==x]["amb_o"])
    lookup.loc[x]['shar_o']=np.nanmean(df.loc[df["iid"]==x]["shar_o"])
    



In [9]:
lookup=lookup.dropna(axis=1,how="all")
lookup=lookup.fillna(lookup.mean())

In [15]:
lookup_men=lookup.loc[lookup["gender"]==1]
lookup_women=lookup.loc[lookup["gender"]==0]

## How attributes contribute to match success rate in men and women

In [11]:
est = smf.ols('match ~ attr_o + sinc_o + intel_o+fun_o+amb_o+shar_o', lookup).fit()
print est.summary().tables[0]
print est.summary().tables[1]

                            OLS Regression Results                            
Dep. Variable:                  match   R-squared:                       0.201
Model:                            OLS   Adj. R-squared:                  0.192
Method:                 Least Squares   F-statistic:                     22.88
Date:                Wed, 30 Nov 2016   Prob (F-statistic):           4.05e-24
Time:                        00:48:06   Log-Likelihood:                 327.95
No. Observations:                 552   AIC:                            -641.9
Df Residuals:                     545   BIC:                            -611.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -0.2926      0.081     -3.613      0.0

## How attributes contribute to match success in men

In [16]:
est = smf.ols('match ~ attr_o + sinc_o + intel_o+fun_o+amb_o+shar_o', lookup_men).fit()
print est.summary().tables[0]
print est.summary().tables[1]

                            OLS Regression Results                            
Dep. Variable:                  match   R-squared:                       0.238
Model:                            OLS   Adj. R-squared:                  0.221
Method:                 Least Squares   F-statistic:                     14.02
Date:                Wed, 30 Nov 2016   Prob (F-statistic):           6.94e-14
Time:                        01:07:06   Log-Likelihood:                 170.19
No. Observations:                 277   AIC:                            -326.4
Df Residuals:                     270   BIC:                            -301.0
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -0.2354      0.104     -2.270      0.0

## How attributes contribute to the match success of women

In [17]:
est = smf.ols('match ~ attr_o + sinc_o + intel_o+fun_o+amb_o+shar_o', lookup_women).fit()
print est.summary().tables[0]
print est.summary().tables[1]

                            OLS Regression Results                            
Dep. Variable:                  match   R-squared:                       0.178
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     9.621
Date:                Wed, 30 Nov 2016   Prob (F-statistic):           1.38e-09
Time:                        01:07:56   Log-Likelihood:                 159.15
No. Observations:                 274   AIC:                            -304.3
Df Residuals:                     267   BIC:                            -279.0
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -0.3415      0.139     -2.450      0.0

## How hobbies/activities contribute to match success of men and women

In [12]:
est = smf.ols('match ~ tvsports+exercise+dining+museums+art+hiking+gaming+clubbing+reading+tv+theater+movies+concerts+music+shopping+yoga', lookup).fit()
print est.summary().tables[0]
print est.summary().tables[1]

                            OLS Regression Results                            
Dep. Variable:                  match   R-squared:                       0.063
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     2.254
Date:                Wed, 30 Nov 2016   Prob (F-statistic):            0.00355
Time:                        00:48:07   Log-Likelihood:                 283.96
No. Observations:                 552   AIC:                            -533.9
Df Residuals:                     535   BIC:                            -460.6
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.0557      0.050      1.107      0.2

## How hobbies/activities contribute to match success of men

In [18]:
est = smf.ols('match ~ tvsports+exercise+dining+museums+art+hiking+gaming+clubbing+reading+tv+theater+movies+concerts+music+shopping+yoga', lookup_men).fit()
print est.summary().tables[0]
print est.summary().tables[1]

                            OLS Regression Results                            
Dep. Variable:                  match   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     1.702
Date:                Wed, 30 Nov 2016   Prob (F-statistic):             0.0464
Time:                        01:10:31   Log-Likelihood:                 146.43
No. Observations:                 277   AIC:                            -258.9
Df Residuals:                     260   BIC:                            -197.2
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.0330      0.069      0.475      0.6

## How hobbies/activities contribute to match success of women

In [19]:
est = smf.ols('match ~ tvsports+exercise+dining+museums+art+hiking+gaming+clubbing+reading+tv+theater+movies+concerts+music+shopping+yoga', lookup_women).fit()
print est.summary().tables[0]
print est.summary().tables[1]

                            OLS Regression Results                            
Dep. Variable:                  match   R-squared:                       0.099
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     1.759
Date:                Wed, 30 Nov 2016   Prob (F-statistic):             0.0371
Time:                        01:10:42   Log-Likelihood:                 146.57
No. Observations:                 274   AIC:                            -259.1
Df Residuals:                     257   BIC:                            -197.7
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.0675      0.078      0.869      0.3

In [22]:
lookup.head(20)

Unnamed: 0,gender,match,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,field_cd,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1
1,0.0,0.4,6.7,7.4,8.0,7.2,8.0,7.1,21.0,1.0,2.0,4.0,2.0,7.0,1.0,5.250464,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,6.0,8.0,8.0,8.0,7.0,6.929936,7.933121,8.257962,7.38535
2,0.0,0.2,7.7,7.1,7.9,7.5,7.5,6.5,24.0,1.0,2.0,5.0,1.0,5.0,1.0,5.250464,3.0,2.0,7.0,10.0,8.0,6.0,3.0,5.0,8.0,10.0,1.0,9.0,8.0,7.0,8.0,3.0,1.0,7.0,5.0,10.0,8.0,3.0,6.929936,7.933121,8.257962,7.38535
3,0.0,0.0,6.5,7.1,7.3,6.2,7.111111,6.0,25.0,2.0,8.0,4.0,6.0,3.0,1.0,5.250464,3.0,8.0,7.0,8.0,5.0,5.0,8.0,4.0,5.0,7.0,8.0,7.0,7.0,7.0,5.0,8.0,7.0,8.0,9.0,8.0,9.0,8.0,6.929936,7.933121,8.257962,7.38535
4,0.0,0.2,7.0,7.1,7.7,7.5,7.7,7.2,23.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,6.0,7.0,6.0,7.0,7.0,5.0,7.0,7.0,7.0,9.0,7.0,8.0,7.0,1.0,8.0,7.0,8.0,9.0,7.0,8.0,6.929936,7.933121,8.257962,7.38535
5,0.0,0.2,5.3,7.7,7.6,7.2,7.8,6.2,21.0,1.0,8.0,1.0,2.0,4.0,1.0,1.0,7.0,4.0,7.0,7.0,6.0,8.0,6.0,6.0,8.0,6.0,8.0,6.0,6.0,3.0,7.0,8.0,3.0,6.0,3.0,6.0,10.0,8.0,6.929936,7.933121,8.257962,7.38535
6,0.0,0.2,6.8,7.8,8.6,7.0,8.0,6.3,23.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,10.0,8.0,9.0,7.0,8.0,7.0,9.0,2.0,6.0,9.0,2.0,5.0,6.0,6.0,4.0,1.0,1.0,5.0,7.0,8.0,9.0,5.0,6.929936,7.933121,8.257962,7.38535
7,0.0,0.2,7.9,7.6,8.2,7.0,7.5,6.7,22.0,1.0,2.0,4.0,1.0,5.0,1.0,1.0,5.0,3.0,4.0,10.0,10.0,10.0,2.0,3.0,8.0,8.0,8.0,10.0,10.0,10.0,10.0,10.0,10.0,6.0,6.0,5.0,7.0,7.0,6.929936,7.933121,8.257962,7.38535
8,0.0,0.8,8.2,7.8,8.1,7.9,7.444444,6.8,25.0,13.0,1.0,1.0,1.0,5.0,1.0,6.0,2.0,2.0,1.0,10.0,9.0,9.0,3.0,2.0,10.0,8.0,10.0,9.0,9.0,6.0,6.0,8.0,6.0,7.0,4.0,8.0,8.0,8.0,6.929936,7.933121,8.257962,7.38535
9,0.0,0.7,7.0,7.4,7.5,7.8,6.9,6.3,26.0,13.0,1.0,1.0,1.0,4.0,1.0,9.0,4.0,3.0,1.0,8.0,6.0,7.0,2.0,2.0,10.0,8.0,8.0,10.0,10.0,9.0,9.0,8.0,3.0,7.0,6.0,10.0,7.0,7.0,6.929936,7.933121,8.257962,7.38535
10,0.0,0.2,6.333333,7.0,6.4,6.4,6.2,5.777778,26.0,13.0,4.0,4.0,2.0,6.0,1.0,9.0,9.0,9.0,9.0,7.0,6.0,6.0,7.0,1.0,7.0,8.0,5.0,6.0,7.0,7.0,8.0,7.0,7.0,6.0,8.0,10.0,6.0,9.0,6.929936,7.933121,8.257962,7.38535


In [27]:
lookup["percieved_difference"]=(lookup["attr_o"]-lookup["attr5_1"])**2+(lookup["sinc_o"]-lookup["sinc3_1"])**2+(lookup["intel_o"]-lookup["intel3_1"])**2+(lookup["fun_o"]-lookup["fun3_1"])**2+(lookup["amb_o"]-lookup["amb3_1"])**2

In [28]:
lookup.head(20)

Unnamed: 0,gender,match,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,field_cd,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,percieved_difference
1,0.0,0.4,6.7,7.4,8.0,7.2,8.0,7.1,21.0,1.0,2.0,4.0,2.0,7.0,1.0,5.250464,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,6.0,8.0,8.0,8.0,7.0,6.929936,7.933121,8.257962,7.38535,2.052871
2,0.0,0.2,7.7,7.1,7.9,7.5,7.5,6.5,24.0,1.0,2.0,5.0,1.0,5.0,1.0,5.250464,3.0,2.0,7.0,10.0,8.0,6.0,3.0,5.0,8.0,10.0,1.0,9.0,8.0,7.0,8.0,3.0,1.0,7.0,5.0,10.0,8.0,3.0,6.929936,7.933121,8.257962,7.38535,31.512998
3,0.0,0.0,6.5,7.1,7.3,6.2,7.111111,6.0,25.0,2.0,8.0,4.0,6.0,3.0,1.0,5.250464,3.0,8.0,7.0,8.0,5.0,5.0,8.0,4.0,5.0,7.0,8.0,7.0,7.0,7.0,5.0,8.0,7.0,8.0,9.0,8.0,9.0,8.0,6.929936,7.933121,8.257962,7.38535,10.714969
4,0.0,0.2,7.0,7.1,7.7,7.5,7.7,7.2,23.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,6.0,7.0,6.0,7.0,7.0,5.0,7.0,7.0,7.0,9.0,7.0,8.0,7.0,1.0,8.0,7.0,8.0,9.0,7.0,8.0,6.929936,7.933121,8.257962,7.38535,3.644909
5,0.0,0.2,5.3,7.7,7.6,7.2,7.8,6.2,21.0,1.0,8.0,1.0,2.0,4.0,1.0,1.0,7.0,4.0,7.0,7.0,6.0,8.0,6.0,6.0,8.0,6.0,8.0,6.0,6.0,3.0,7.0,8.0,3.0,6.0,3.0,6.0,10.0,8.0,6.929936,7.933121,8.257962,7.38535,31.986692
6,0.0,0.2,6.8,7.8,8.6,7.0,8.0,6.3,23.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,10.0,8.0,9.0,7.0,8.0,7.0,9.0,2.0,6.0,9.0,2.0,5.0,6.0,6.0,4.0,1.0,1.0,5.0,7.0,8.0,9.0,5.0,6.929936,7.933121,8.257962,7.38535,10.816883
7,0.0,0.2,7.9,7.6,8.2,7.0,7.5,6.7,22.0,1.0,2.0,4.0,1.0,5.0,1.0,1.0,5.0,3.0,4.0,10.0,10.0,10.0,2.0,3.0,8.0,8.0,8.0,10.0,10.0,10.0,10.0,10.0,10.0,6.0,6.0,5.0,7.0,7.0,6.929936,7.933121,8.257962,7.38535,9.191024
8,0.0,0.8,8.2,7.8,8.1,7.9,7.444444,6.8,25.0,13.0,1.0,1.0,1.0,5.0,1.0,6.0,2.0,2.0,1.0,10.0,9.0,9.0,3.0,2.0,10.0,8.0,10.0,9.0,9.0,6.0,6.0,8.0,6.0,7.0,4.0,8.0,8.0,8.0,6.929936,7.933121,8.257962,7.38535,16.381704
9,0.0,0.7,7.0,7.4,7.5,7.8,6.9,6.3,26.0,13.0,1.0,1.0,1.0,4.0,1.0,9.0,4.0,3.0,1.0,8.0,6.0,7.0,2.0,2.0,10.0,8.0,8.0,10.0,10.0,9.0,9.0,8.0,3.0,7.0,6.0,10.0,7.0,7.0,6.929936,7.933121,8.257962,7.38535,7.064909
10,0.0,0.2,6.333333,7.0,6.4,6.4,6.2,5.777778,26.0,13.0,4.0,4.0,2.0,6.0,1.0,9.0,9.0,9.0,9.0,7.0,6.0,6.0,7.0,1.0,7.0,8.0,5.0,6.0,7.0,7.0,8.0,7.0,7.0,6.0,8.0,10.0,6.0,9.0,6.929936,7.933121,8.257962,7.38535,22.315935
