# Problème rencontré : beaucoup de liens sont créés, car tous les membres d'un même quartier sont reliés entre eux, et tous les membres d'un quartier sont reliés à tous les autres membres des quartiers proches.

Importing modules.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm
import pickle

Defining a function that gives the approximate distance for a lattitude and longitude difference. This function does **not** take into account local curvature.

In [2]:
def approx_dist(lat: float, lon: float) -> float:
    dx = lat * 110.574 # km
    dy = lon * 111.320 * np.cos(lat) # km
    return np.sqrt(dx**2 + dy**2)

Imorting data and adding informations for later.

In [3]:
# Loading data
recensement = pd.read_excel(io="../../1832_v4.xlsx", na_values=["·","?"])
classes = pd.read_excel(io="../../histoire urbaine digitale classification metiers.xlsx", sheet_name="data",  na_values="·")
rues = pd.read_csv("../../nom_rues_et_coor.csv", sep=",")

# Adding the 'chef_vocation_categorie' column
classes = classes.rename(columns={"JOB": "chef_vocation_norm_2"})
classes = classes.rename(columns={"CLASS": "chef_vocation_categorie"})
recensement = recensement.merge(classes, on="chef_vocation_norm_2", how="outer")

# Split the values on the '/' character
split_vals = recensement['chef_vocation_categorie'].str.split('/')

# Create a new dataframe from the split values
recensement = recensement.assign(chef_vocation_categorie=split_vals).explode('chef_vocation_categorie')

# Adding coordinates
correspondances = {
    "nom_rue": [
        "ale", 
        "etraz", 
        "rue du pre", 
        "chaucrau", 
        "st laurent", 
        "st pierre", 
        "st etienne", 
        "bourg", 
        "st francois", 
        "georgette",
    ],
    "nom_rue_norm_2": [
        "rue de l'ale",
        "rue d etraz",
        "rue du pre",
        "rue de chaucrau",
        "place de st laurent",
        "rue de st pierre",
        "st etienne",
        "rue de bourg",
        "place de st froncois",
        "chemin de georgette",
    ]
}
for index, row in tqdm(recensement.iterrows(), total=len(recensement), desc="Adding coordinates"):
    nom_rue = row["nom_rue_norm_2"]
    if type(nom_rue) == float: continue # if nom_rue is NaN
    if nom_rue in correspondances["nom_rue"]:
        index = correspondances["nom_rue"].index(nom_rue)
        nom_rue = correspondances["nom_rue_norm_2"][index]
        corresponding_streets = rues.query("Nom_rue == @nom_rue")
    else:
        corresponding_streets = rues.query("Nom_rue.str.contains(@nom_rue)")
    occurences = len(corresponding_streets)
    if occurences == 1:
        recensement.at[index, "rue_x"] = corresponding_streets["X"].values[0]
        recensement.at[index, "rue_y"] = corresponding_streets["Y"].values[0]
        continue
print(f'{len(recensement.query("~rue_x.isnull()")) / len(recensement) * 100:.2f}% des personnes positionnées')

Adding coordinates: 100%|██████████| 3862/3862 [00:04<00:00, 836.13it/s]

49.30% des personnes positionnées





Filtering data. Mainly remove lines that have no vocation & no coordinates.

In [4]:
original_size = len(recensement)

# Keep only the persons who have a classified vocation
recensement = recensement.query("not chef_vocation_categorie.isna()")
print(f"Size is now {len(recensement)/original_size*100:.0f}% of the original.")

recensement = recensement.query("not rue_x.isna()")
print(f"Size is now {len(recensement)/original_size*100:.0f}% of the original.")


Size is now 80% of the original.
Size is now 39% of the original.


A `networkx` graph is made.

In [5]:
G = nx.Graph()
for index, row in recensement.iterrows():
    G.add_node(
        index,
        nom_rue=row["nom_rue_norm_2"],
        proprietaire=row["proprietaire_nom_norm_2"],
        chef_nom=row["chef_nom_norm_2"],
        chef_annee_naissance=row["chef_annee_naissance"],
        chef_origine=row["chef_origine_norm_2"],
        chef_vocation=row["chef_vocation_norm_2"],
        chef_vocation_categorie=row["chef_vocation_categorie"],
    )
for index1, row1 in tqdm(recensement.iterrows(), total=len(recensement)):
    if pd.isna(row1["chef_vocation_categorie"]): continue
    for index2, row2 in recensement.iterrows():
        if pd.isna(row2["chef_vocation_categorie"]): continue
        if index2 < index1: continue

        score = 0

        #if row1["nom_rue_norm_2"] == row2["nom_rue_norm_2"]: score += 1
        #if row1["chef_origine_norm_2"] == row2["chef_origine_norm_2"]: score += 1

        distance = approx_dist(lat=row1["rue_x"] - row2["rue_x"], lon=row1["rue_y"] - row2["rue_y"])
        if distance > 1: continue
        score = 1/(distance + 1)
        if score > 0: G.add_edge(index1, index2, weight=score)

100%|██████████| 1495/1495 [01:46<00:00, 14.02it/s]


Show some stats.

In [6]:
print(f"Number of nodes: {len(G.nodes)}")
print(f"Number of edges: {len(G.edges)}")

Number of nodes: 1396
Number of edges: 693926


Write `.gexf` file.

In [7]:
nx.write_gexf(G, 'individus_origine_rue.gexf')