In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
# Show up to 200 columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", 200)

In [10]:
# Import data
# Attribute columns copied from each partner's row into the pair table.
attribute_cols = ["race", "goal", "field_cd", "from", "career_c"]

df = pd.read_csv("data/SpeedDating.csv", encoding="Latin-1")
# Keep only the rows flagged as a match for which every attribute above is known.
df_match = (
    df[df.match == 1]
    .dropna(subset=attribute_cols)
    .reset_index(drop=True)
)

# Build one key per date: "<larger id>_<smaller id>_<wave>". Ordering the two ids makes
# the key identical on both partners' rows, so the rows can be joined on it below.
pair_ids = df_match[["iid", "pid"]].astype(int)
id_high = pair_ids.max(axis=1).astype(str)
id_low = pair_ids.min(axis=1).astype(str)
list_key_date = (id_high + "_" + id_low + "_" + df_match["wave"].astype(str)).tolist()
df_match["key_date"] = list_key_date

# Split men/women to take attributes from the two tables
# (gender == 1 rows get the "_H" suffix, gender == 0 rows get the "_F" suffix).
pair_cols = ["key_date"] + attribute_cols
df_men = df_match.loc[df_match.gender == 1, pair_cols].copy()
df_women = df_match.loc[df_match.gender == 0, pair_cols].copy()
# Inner join: a date is kept only when both partners' rows survived the dropna above.
df_res = df_men.merge(df_women, how="inner", on="key_date", suffixes=('_H', '_F'))

In [11]:
# Process cities
# Encode every distinct "from" value (both partners pooled) as an integer code.
# The values are sorted before numbering: iterating a bare set of strings follows hash
# order, which changes between kernel sessions, so unsorted codes are not reproducible.
# NOTE: from_H / from_F are overwritten in place, so this cell must be run exactly once
# after the merge cell that creates df_res.
cities = set(df_res["from_H"]) | set(df_res["from_F"])
dict_from = {city: code for code, city in enumerate(sorted(cities, key=str))}
df_res["from_H"] = df_res["from_H"].map(dict_from)
df_res["from_F"] = df_res["from_F"].map(dict_from)
# Process all fields
# Every column except the first one (key_date) holds a numeric code: cast them all to int.
df_res = df_res.astype({column: int for column in df_res.columns[1:]})

In [12]:
# Preview the first five rows of the pair table.
df_res.head()

Unnamed: 0,key_date,race_H,goal_H,field_cd_H,from_H,career_c_H,race_F,goal_F,field_cd_F,from_F,career_c_F
0,12_8_1,2,1,1,177,1,2,1,13,18,6
1,12_9_1,2,1,1,177,1,6,1,13,129,9
2,13_8_1,4,2,1,29,1,2,1,13,18,6
3,13_9_1,4,2,1,29,1,6,1,13,129,9
4,13_10_1,4,2,1,29,1,2,2,13,172,9


In [13]:
# Code -> label dictionaries
# dico_from is the inverse of dict_from: integer code -> original "from" value.
dico_from = {code: city for city, code in dict_from.items()}
# Each dictionary maps an integer code (numbered from 1, in list order) to its label.
dico_race = dict(enumerate([
    "Black/African American",
    "European/Caucasian-American",
    "Latino/Hispanic American",
    "Asian/Pacific Islander/Asian-American",
    "Native American",
    "Other",
], start=1))

dico_career = dict(enumerate([
    'Lawyer',
    'Academic/Research',
    'Psychologist',
    'Doctor/Medicine',
    'Engineer',
    'Creative Arts/Entertainment',
    'Banking/Consulting/Finance/Marketing/Business/CEO/Entrepreneur/Admin',
    'Real Estate',
    'International/Humanitarian Affairs',
    'Undecided',
    'Social Work',
    'Speech Pathology',
    'Politics',
    'Pro sports/Athletics',
    'Other',
    'Journalism',
    'Architecture',
], start=1))

dico_goal = dict(enumerate([
    "Seemed like a fun night out",
    "To meet new people",
    "To get a date",
    "Looking for a serious relationship",
    "To say I did it",
    "Other",
], start=1))

dico_field = dict(enumerate([
    'Law',
    'Math',
    'Social Science, Psychologist',
    'Medical Science, Pharmaceuticals, and Bio Tech',
    'Engineering',
    'English/Creative Writing/ Journalism',
    'History/Religion/Philosophy',
    'Business/Econ/Finance',
    'Education, Academia',
    'Biological Sciences/Chemistry/Physics',
    'Social Work',
    'Undergrad/undecided',
    'Political Science/International Affairs',
    'Film',
    'Fine Arts/Arts Administration',
    'Languages',
    'Architecture',
    'Other',
], start=1))
# Attribute name (column name without its _H/_F suffix) -> its code-to-label dictionary.
dico_global = {
    "race": dico_race,
    "goal": dico_goal,
    "field_cd": dico_field,
    "from": dico_from,
    "career_c": dico_career,
}

In [14]:
def initialize_data(attribute_H, attribute_F, df=None, dico=None):
    """Build the node and link lists describing how two attributes pair up.

    Rows of ``df`` are grouped by the columns ``attribute_H + "_H"`` and
    ``attribute_F + "_F"``; every distinct value of each column becomes a node and
    every observed (H value, F value) combination becomes a weighted link.

    Parameters
    ----------
    attribute_H : str
        Attribute name for the "_H" side (a key of ``dico``), e.g. ``"career_c"``.
    attribute_F : str
        Attribute name for the "_F" side (a key of ``dico``).
    df : pandas.DataFrame, optional
        Table holding the two suffixed columns. Defaults to the notebook-level
        ``df_res``, looked up when the function is called.
    dico : dict, optional
        Maps an attribute name to its ``{code: label}`` dictionary. Defaults to the
        notebook-level ``dico_global``, looked up when the function is called.

    Returns
    -------
    nodes : list of dict
        ``{"node": index, "name": label}`` entries; "_H" nodes are numbered from 0 and
        "_F" nodes continue right after them.
    links : list of dict
        ``{"source": H node index, "target": F node index, "value": row count}``.

    Raises
    ------
    KeyError
        If an attribute is missing from ``dico`` or a code has no label in it.
    """
    # Resolve the defaults at call time so that re-running the cells that build
    # df_res / dico_global is picked up without having to re-run this definition.
    if df is None:
        df = df_res
    if dico is None:
        dico = dico_global

    column_H = attribute_H + "_H"
    column_F = attribute_F + "_F"
    # size() counts the rows of each group directly, independently of other columns.
    pair_counts = df.groupby([column_H, column_F]).size()

    # INITIALIZATION: give every distinct code a node index, F indices after the H ones.
    codes_H = pair_counts.index.levels[0].tolist()
    codes_F = pair_counts.index.levels[1].tolist()
    index_H = {code: position for position, code in enumerate(codes_H)}
    offset = len(index_H)
    index_F = {code: position + offset for position, code in enumerate(codes_F)}

    # NODES: use the caller-supplied label dictionaries, not the global one.
    labels_H = dico[attribute_H]
    labels_F = dico[attribute_F]
    nodes = [{"node": index_H[code], "name": labels_H[code]} for code in codes_H]
    nodes += [{"node": index_F[code], "name": labels_F[code]} for code in codes_F]

    # LINKS: one per observed (H code, F code) pair; int() keeps the value JSON-serialisable.
    links = [
        {"source": index_H[code_H], "target": index_F[code_F], "value": int(count)}
        for (code_H, code_F), count in pair_counts.items()
    ]
    return nodes, links

In [15]:
# Pair the career code of the "_H" partner with the career code of the "_F" partner.
nodes, links = initialize_data(attribute_H='career_c', attribute_F='career_c')

In [16]:
# Write the nodes and links to disk as a single JSON object.
sankey_payload = {"nodes" : nodes, "links" : links}
with open('data/data_sankey.json', 'w') as out_file:
    json.dump(sankey_payload, out_file)