In [146]:
import json
import pandas as pd

DATA_PATH = '../data/1832_v4_preprocessed.csv'
SANKEY_PATH = '../data/sankey_oj.json'
SANKEY_NORMALIZED_PATH = '../data/sankey_oj_normalized.json'
SANKEY_BINARY_PATH = '../data/sankey_oj_binary.json'
SANKEY_DJO_PATH = '../data/sankey_djo.json'

In [147]:
rec = pd.read_csv(DATA_PATH)
rec.head()

Unnamed: 0,nom_rue,no_maison,chef_annee_naissance,chef_origine,chef_annee_arrivee,chef_vocation,page,division,chef_vocation_class,chef_origine_class
0,marterey,1,1764.0,tolochenaz,1786.0,charon,1,1-2,,morges
1,marterey,2,,rolle,1821.0,marechal-ferrant,1,1-2,artisanat,rolle
2,marterey,3,1785.0,pailly,,lingere,1,1-2,service,moudon
3,marterey,3,,gingins,,rentier,1,1-2,rente,
4,marterey,3,,gingins,,rentier,1,1-2,rente,


## **Origin Job**

In [72]:
# Keep TOP_N most populated chef_origine_class 
TOP_N = 5

top_origins = rec['chef_origine_class'].value_counts().head(TOP_N).index.tolist()
rec['chef_origine_class'] = rec['chef_origine_class'].apply(lambda x: x if x in top_origins else 'Other')

# Delete Other entries
rec = rec[rec['chef_origine_class'] != 'Other']
rec.head()

Unnamed: 0,nom_rue,no_maison,chef_annee_naissance,chef_origine,chef_annee_arrivee,chef_vocation,page,division,chef_vocation_class,chef_origine_class
0,marterey,1,1764.0,tolochenaz,1786.0,charon,1,1-2,,morges
5,marterey,3,1802.0,lutry,,garcon voiturier,1,1-2,,lavaux
8,marterey,3,1787.0,lausanne,,rentier,1,1-2,rente,lausanne
9,marterey,3,1787.0,lausanne,,rentier,1,1-2,rente,lausanne
10,marterey,3,,saint-saphorin,1784.0,garde arsenal,1,1-2,,lavaux


In [73]:
rec = rec.groupby(['chef_origine_class', 'chef_vocation_class']) \
    .agg({'chef_origine_class': 'count'}) \
    .rename(columns={'chef_origine_class': 'count'}) \
    .reset_index()

rec.head()

Unnamed: 0,chef_origine_class,chef_vocation_class,count
0,france,administration,1
1,france,agricole,20
2,france,artisanat,16
3,france,commerce,7
4,france,construction,1


In [74]:
data = []

# for each entry put source target and value
for index, row in rec.iterrows():
    data.append({
        'source': row['chef_origine_class'],
        'target': row['chef_vocation_class'],
        'value': row['count']
    })

In [75]:
# Save data
with open(SANKEY_PATH, 'w') as f:
    json.dump(data, f, indent=4)

## **Origin Job Normalized**

In [85]:
# Keep TOP_N most populated chef_origine_class 
TOP_N = 5

top_origins = rec['chef_origine_class'].value_counts().head(TOP_N).index.tolist()
rec['chef_origine_class'] = rec['chef_origine_class'].apply(lambda x: x if x in top_origins else 'Other')

# Delete Other entries
rec = rec[rec['chef_origine_class'] != 'Other']
rec.head()

Unnamed: 0,nom_rue,no_maison,chef_annee_naissance,chef_origine,chef_annee_arrivee,chef_vocation,page,division,chef_vocation_class,chef_origine_class
0,marterey,1,1764.0,tolochenaz,1786.0,charon,1,1-2,,morges
5,marterey,3,1802.0,lutry,,garcon voiturier,1,1-2,,lavaux
8,marterey,3,1787.0,lausanne,,rentier,1,1-2,rente,lausanne
9,marterey,3,1787.0,lausanne,,rentier,1,1-2,rente,lausanne
10,marterey,3,,saint-saphorin,1784.0,garde arsenal,1,1-2,,lavaux


In [86]:
rec = rec.groupby(['chef_origine_class', 'chef_vocation_class']) \
    .agg({'chef_origine_class': 'count'}) \
    .rename(columns={'chef_origine_class': 'count'}) \
    .reset_index()

rec.head()

Unnamed: 0,chef_origine_class,chef_vocation_class,count
0,france,administration,1
1,france,agricole,20
2,france,artisanat,16
3,france,commerce,7
4,france,construction,1


In [87]:
rec_sum = rec.groupby(['chef_origine_class']) \
    .agg({'count': 'sum'}) \
    .rename(columns={'count': 'sum'}) \
    .reset_index()

rec_sum.head()

Unnamed: 0,chef_origine_class,sum
0,france,98
1,lausanne,902
2,lavaux,207
3,morges,140
4,suisse_allemande,150


In [88]:
# Normalize
rec = rec.merge(rec_sum, on='chef_origine_class')
rec['count'] = rec['count'] / rec['sum']
rec = rec.drop(columns=['sum'])

In [89]:
data = []

# for each entry put source target and value
for index, row in rec.iterrows():
    data.append({
        'source': row['chef_origine_class'],
        'target': row['chef_vocation_class'],
        'value': row['count']
    })

In [90]:
# Save data
with open(SANKEY_NORMALIZED_PATH, 'w') as f:
    json.dump(data, f, indent=4)

## **Binary**

In [50]:
rec.loc[rec['chef_origine_class'] != 'lausanne', 'chef_origine_class'] = 'hors_lausanne'

In [52]:
rec = rec.groupby(['chef_origine_class', 'chef_vocation_class']) \
    .agg({'chef_origine_class': 'count'}) \
    .rename(columns={'chef_origine_class': 'count'}) \
    .reset_index()

rec.head()

Unnamed: 0,chef_origine_class,chef_vocation_class,count
0,hors_lausanne,administration,49
1,hors_lausanne,agricole,389
2,hors_lausanne,artisanat,536
3,hors_lausanne,commerce,174
4,hors_lausanne,construction,199


In [53]:
rec_sum = rec.groupby(['chef_origine_class']) \
    .agg({'count': 'sum'}) \
    .rename(columns={'count': 'sum'}) \
    .reset_index()

rec_sum.head()

Unnamed: 0,chef_origine_class,sum
0,hors_lausanne,2060
1,lausanne,902


In [54]:
# Normalize
rec = rec.merge(rec_sum, on='chef_origine_class')
rec['count'] = rec['count'] / rec['sum']
rec = rec.drop(columns=['sum'])

In [55]:
data = []

# for each entry put source target and value
for index, row in rec.iterrows():
    data.append({
        'source': row['chef_origine_class'],
        'target': row['chef_vocation_class'],
        'value': row['count']
    })

In [56]:
# Save data
with open(SANKEY_BINARY_PATH, 'w') as f:
    json.dump(data, f, indent=4)

## **Division Job Origin**

In [148]:
rec_dj = rec.copy()
rec_dj = rec_dj.groupby(['division', 'chef_vocation_class']) \
    .agg({'chef_vocation_class': 'count'}) \
    .rename(columns={'chef_vocation_class': 'count'}) \
    .reset_index()

rec_dj.head()

Unnamed: 0,division,chef_vocation_class,count
0,1-2,administration,5
1,1-2,agricole,82
2,1-2,artisanat,93
3,1-2,commerce,17
4,1-2,construction,39


In [149]:
data = []

# for each entry put source target and value
for index, row in rec_dj.iterrows():
    data.append({
        'source': row['division'],
        'target': row['chef_vocation_class'],
        'value': row['count']
    })

In [150]:
rec_jo = rec.copy()
rec_jo = rec_jo.groupby(['chef_vocation_class', 'chef_origine_class']) \
    .agg({'chef_origine_class': 'count'}) \
    .rename(columns={'chef_origine_class': 'count'}) \
    .reset_index()

rec_jo.head()

Unnamed: 0,chef_vocation_class,chef_origine_class,count
0,administration,angleterre,1
1,administration,france,1
2,administration,fribourg,1
3,administration,la_vallee,6
4,administration,lausanne,14


In [151]:
# for each entry put source target and value
for index, row in rec_jo.iterrows():
    print(index, row['chef_origine_class'])
    data.append({
        'source': row['chef_vocation_class'],
        'target': row['chef_origine_class'],
        'value': row['count']
    })

0 angleterre
1 france
2 fribourg
3 la_vallee
4 lausanne
5 lavaux
6 moudon
7 nyon
8 orbe
9 oron
10 pays_d_enhaut
11 rolle
12 suisse_allemande
13 vevey
14 yverdon
15 aigle
16 angleterre
17 aubonne
18 cossonay
19 echallens
20 france
21 fribourg
22 geneve
23 grandson
24 la_vallee
25 lausanne
26 lavaux
27 morges
28 moudon
29 neuchatel
30 nyon
31 orbe
32 oron
33 pays_d_enhaut
34 rolle
35 suisse_allemande
36 vaud
37 vevey
38 yverdon
39 aigle
40 angleterre
41 aubonne
42 avenches
43 cossonay
44 echallens
45 france
46 fribourg
47 la_vallee
48 lausanne
49 lavaux
50 morges
51 moudon
52 neuchatel
53 nyon
54 orbe
55 oron
56 payerne
57 pays_d_enhaut
58 rolle
59 suisse_allemande
60 vaud
61 vevey
62 yverdon
63 aigle
64 aubonne
65 cossonay
66 echallens
67 france
68 fribourg
69 grandson
70 la_vallee
71 lausanne
72 lavaux
73 morges
74 moudon
75 neuchatel
76 nyon
77 orbe
78 oron
79 pays_d_enhaut
80 rolle
81 suisse_allemande
82 vaud
83 vevey
84 yverdon
85 aigle
86 aubonne
87 avenches
88 cossonay
89 echallen

In [139]:
# Save data
with open(SANKEY_DJO_PATH, 'w') as f:
    json.dump(data, f, indent=4)