In [38]:
from collections import defaultdict
from tqdm import tqdm
import itertools
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [39]:
# Spreadsheet column -> working column name. The keys are also the only
# columns kept; entries mapping to themselves are retained unrenamed.
selected_columns = {
    "nom_rue_norm_2": "nom_rue",
    "no_maison": "no_maison",
    "proprietaire_nom_norm_2": "proprietaire_nom",
    "chef_prenom_norm": "chef_prenom",
    "chef_nom_norm_2": "chef_nom",
    "chef_annee_naissance": "chef_annee_naissance",
    "epouse_nom_norm": "epouse_nom",
    "epouse_annee_naissance": "epouse_annee_naissance",
    "enfants_dans_la_commune_prenom_norm": "enfants_dans_la_commune_prenom",
    "enfants_annee_naissance": "enfants_annee_naissance",
    "chef_origine_norm_2": "chef_origine",
    "chef_annee_arrivee": "chef_annee_arrivee",
    "chef_vocation_norm_2": "chef_vocation",
    "pensionnaires_prenom_norm": "pensionnaires_prenom",
    "pensionnaires_nom_norm": "pensionnaires_nom",
    "pensionnaires_origine_norm": "pensionnaires_origine",
    "Page": "page",
}

# Load the sheet (first spreadsheet column becomes the index), keep the
# selected columns in the order listed above, and apply the shorter names.
rec = (
    pd.read_excel("data/1832_v4.xlsx", index_col=0)
    .loc[:, list(selected_columns)]
    .rename(columns=selected_columns)
)

In [40]:
# Preview the first rows to check the column selection and renaming.
rec.head()

Unnamed: 0,nom_rue,no_maison,proprietaire_nom,chef_prenom,chef_nom,chef_annee_naissance,epouse_nom,epouse_annee_naissance,enfants_dans_la_commune_prenom,enfants_annee_naissance,chef_origine,chef_annee_arrivee,chef_vocation,pensionnaires_prenom,pensionnaires_nom,pensionnaires_origine,page
1,marterey,1,weidmann,georges henri,weidmann,1764,·,·,·,·,tolochenaz,1786,charon,louise|anna|francois|jn adam,weidmann|la maurer|kirsner|reverber,tolochenaz|frutigen|cirol|bavarois,1
2,marterey,2,collioud,louis,collioud,17936,blanchoud,1800,marie|henri|charles,1822|1823|1825,rolle,1821,marechal-ferrant,louis|marc,henny|bourgeois,montherod|rolle,1
3,marterey,3,fiaux,louise,piot,1785,neissance,·,benjamin,1815,pailly,nee,lingere,·,·,·,1
4,marterey,·,fiaux,monsieur,de gingins,·,seigneux,·,·,·,gingins,·,rentier,louise|marianne|louis,oswald|thelin|vannod,thoune|mex|orny,1
5,marterey,·,fiaux,louis vincent,lavanchy,1802,baudet,1805,·,·,lutry,ne,garcon voiturier,·,·,·,1


In [41]:
# Inspect the dtype pandas inferred for each column; year-like columns may
# come back as `object` when they contain non-numeric entries.
rec.dtypes

nom_rue                           object
no_maison                         object
proprietaire_nom                  object
chef_prenom                       object
chef_nom                          object
chef_annee_naissance              object
epouse_nom                        object
epouse_annee_naissance            object
enfants_dans_la_commune_prenom    object
enfants_annee_naissance           object
chef_origine                      object
chef_annee_arrivee                object
chef_vocation                     object
pensionnaires_prenom              object
pensionnaires_nom                 object
pensionnaires_origine             object
page                               int64
dtype: object

In [42]:
# Number of records loaded, before the street/vocation filtering below.
len(rec)

3674

In [43]:
# Minimum number of records a street / a vocation needs in order to be kept.
min_road_count = 10
min_chief_vocation_count = 10

# Vocation labels that are excluded even when frequent enough
# (presumably placeholders for unknown or empty entries -- TODO confirm).
invalid_chief_vocations = ["?", "·"]


def _values_with_min_count(frame, key_col, counted_col, min_count):
    """Return the values of `key_col` that occur often enough in `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing both `key_col` and `counted_col`.
    key_col : str
        Column whose distinct values are being screened.
    counted_col : str
        Column whose non-null entries are counted within each `key_col` group.
    min_count : int
        Inclusive lower bound on the per-group count.

    Returns
    -------
    list
        Values of `key_col` with at least `min_count` non-null `counted_col`
        entries, in groupby key order (sorted).
    """
    counts = frame.groupby(key_col)[counted_col].count()
    return counts[counts >= min_count].index.tolist()


# Streets with at least `min_road_count` records that have a vocation entry.
valid_roads = _values_with_min_count(
    rec, "nom_rue", "chef_vocation", min_road_count
)

# Vocations with at least `min_chief_vocation_count` records that have a
# street entry, minus the excluded labels.
valid_chief_vocations = [
    vocation
    for vocation in _values_with_min_count(
        rec, "chef_vocation", "nom_rue", min_chief_vocation_count
    )
    if vocation not in invalid_chief_vocations
]

In [44]:
# Keep only records whose street and head-of-household vocation both passed
# the frequency thresholds.
# NOTE: this rebinds `rec`; re-running the threshold cell after this one would
# compute its counts on the already filtered frame.
on_valid_road = rec["nom_rue"].isin(valid_roads)
has_valid_vocation = rec["chef_vocation"].isin(valid_chief_vocations)
rec = rec.loc[on_valid_road & has_valid_vocation]
len(rec)

2271

In [45]:
# One edge per (street, vocation) pair, weighted by the number of records in
# that pair. Group keys are never null inside a group, so the group size equals
# the count of non-null `chef_vocation` values.
road_vocation_edges = (
    rec
    .groupby(["nom_rue", "chef_vocation"])
    .size()
    .reset_index(name="weight")
    .rename(columns={"nom_rue": "source", "chef_vocation": "target"})
)
road_vocation_edges.head()

Unnamed: 0,source,target,weight
0,ale,agriculteur,1
1,ale,aubergiste,1
2,ale,boulanger,1
3,ale,carrier,2
4,ale,charpentier,8


In [46]:
# One node per distinct street appearing as an edge source. The label mirrors
# the id, and every street node gets the same fixed size of 1.
road_ids = road_vocation_edges["source"].unique()
road_nodes = pd.DataFrame({
    "id": road_ids,
    "label": road_ids,
    "type": "road",
    "size": 1,
})
road_nodes.head()

Unnamed: 0,id,label,type,size
0,ale,ale,road,1
1,barre,barre,road,1
2,bourg,bourg,road,1
3,calvaire,calvaire,road,1
4,chailly,chailly,road,1


In [48]:
# One node per distinct vocation appearing as an edge target. The label mirrors
# the id, and the size is the vocation's total edge weight across all streets.
vocation_ids = road_vocation_edges["target"].unique()
vocation_sizes = road_vocation_edges.groupby("target")["weight"].sum().to_dict()
vocation_nodes = pd.DataFrame({
    "id": vocation_ids,
    "label": vocation_ids,
    "type": "vocation",
})
vocation_nodes["size"] = vocation_nodes["id"].map(vocation_sizes)
vocation_nodes.head()

Unnamed: 0,id,label,type,size
0,agriculteur,agriculteur,vocation,28
1,aubergiste,aubergiste,vocation,15
2,boulanger,boulanger,vocation,32
3,carrier,carrier,vocation,15
4,charpentier,charpentier,vocation,71


In [49]:
# Stack street nodes and vocation nodes into a single node table.
# `ignore_index=True` renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based labels and the combined index contains duplicates.
# NOTE(review): node ids are raw names, so a street and a vocation sharing the
# same name would yield two nodes with the same id -- confirm this cannot occur.
road_vocation_nodes = pd.concat([road_nodes, vocation_nodes], ignore_index=True)

## Write the node and edge tables to separate CSV files

In [None]:
# Export the edge list and the node table as two CSV files, omitting the
# DataFrame index from the output.
csv_exports = {
    "data/road_vocation_edges.csv": road_vocation_edges,
    "data/road_vocation_nodes.csv": road_vocation_nodes,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)

## Write the graph data to a single JSON file

In [50]:
# Convert both tables to lists of row dicts (one dict per row, keyed by column).
road_vocation_edges_json, road_vocation_nodes_json = (
    frame.to_dict(orient="records")
    for frame in (road_vocation_edges, road_vocation_nodes)
)

In [51]:
# Bundle node and edge records into one payload: nodes under "nodes",
# edges under "links".
road_vocation_data = dict(
    nodes=road_vocation_nodes_json,
    links=road_vocation_edges_json,
)

In [52]:
# Serialize the combined nodes/links payload and write it to a single JSON file.
JSON_FILE = "data/road_vocation_data.json"
with open(JSON_FILE, "w") as json_out:
    json_out.write(json.dumps(road_vocation_data))