In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm
import pickle

In [87]:
# Loading data
recensement = pd.read_excel(io="data/1832_v4.xlsx", na_values=["·","?"])
classes = pd.read_excel(io="data/histoire urbaine digitale classification metiers.xlsx", sheet_name="data",  na_values="·")

# Adding the 'chef_vocation_categorie' column
classes = classes.rename(columns={"JOB": "chef_vocation_norm_2"})
classes = classes.rename(columns={"CLASS": "chef_vocation_categorie"})
recensement = recensement.merge(classes, on="chef_vocation_norm_2", how="outer")

# Split the values on the '/' character
split_vals = recensement['chef_vocation_categorie'].str.split('/')

# Create a new dataframe from the split values
recensement = recensement.assign(chef_vocation_categorie=split_vals).explode('chef_vocation_categorie')

Only a subset is taken to make the graph. Comment this line to study the full graph.

In [88]:
recensement = recensement.query("not chef_vocation_categorie.isna()").head(1000)

A `networkx` graph is made. If the file `graphs_metiers.pkl` already exists, the graph is loaded from there. Else it is recalculated (can take up to 5 minutes if the full graph is being used).

In [89]:
try:
    G = pickle.load(file=open("graphs_metiers.pkl", "rb"))
except:
    G = nx.Graph()
    for index, row in recensement.iterrows():
        G.add_node(
            index,
            nom_rue=row["nom_rue_norm_2"],
            proprietaire=row["proprietaire_nom_norm_2"],
            chef_nom=row["chef_nom_norm_2"],
            chef_annee_naissance=row["chef_annee_naissance"],
            chef_origine=row["chef_origine_norm_2"],
            chef_vocation=row["chef_vocation_norm_2"],
            chef_vocation_categorie=row["chef_vocation_categorie"],
        )
    for index1, row1 in tqdm(recensement.iterrows(), total=len(recensement)):
        if pd.isna(row1["chef_vocation_categorie"]): continue
        for index2, row2 in recensement.iterrows():
            if pd.isna(row2["chef_vocation_categorie"]): continue
            if index2 < index1: continue

            score = 0

            #if row1["chef_vocation_categorie"] == row2["chef_vocation_categorie"]: score += 1
            if row1["chef_vocation_norm_2"] == row2["chef_vocation_norm_2"]: score += 1
            if row1["proprietaire_nom_norm_2"] == row2["proprietaire_nom_norm_2"]: score += 1
            if row1["chef_origine_norm_2"] == row2["chef_origine_norm_2"]: score += 1
            #if row1["nom_rue_norm_2"] == row2["nom_rue_norm_2"]: score += 1

            if score > 0: G.add_edge(index1, index2, weight=score)
            
    pickle.dump(obj=G, file=open("graphs_metiers.pkl", "wb"))

100%|██████████| 1000/1000 [00:19<00:00, 51.18it/s]


In [None]:
nx.write_gexf(G, 'graphs_metiers.gexf')