In [170]:
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

from tqdm import tqdm

In [171]:
# Source column of the spreadsheet -> shorter name used in this notebook.
# The key order also fixes the column order of `rec`.
column_names = {
    "nom_rue_norm_2": "nom_rue",
    "no_maison": "no_maison",
    "proprietaire_nom_norm_2": "proprietaire_nom",
    "chef_prenom_norm": "chef_prenom",
    "chef_nom_norm_2": "chef_nom",
    "chef_annee_naissance": "chef_annee_naissance",
    "epouse_nom_norm": "epouse_nom",
    "epouse_annee_naissance": "epouse_annee_naissance",
    "enfants_dans_la_commune_prenom_norm": "enfants_dans_la_commune_prenom",
    "enfants_annee_naissance": "enfants_annee_naissance",
    "chef_origine_norm_2": "chef_origine",
    "chef_annee_arrivee": "chef_annee_arrivee",
    "chef_vocation_norm_2": "chef_vocation",
    "pensionnaires_prenom_norm": "pensionnaires_prenom",
    "pensionnaires_nom_norm": "pensionnaires_nom",
    "pensionnaires_origine_norm": "pensionnaires_origine",
    "Page": "page",
}

# Read the records (first spreadsheet column becomes the index), keep only the
# columns listed above and give them their short names.
rec = (
    pd.read_excel("data/1832_v4.xlsx", index_col=0)
    [list(column_names)]
    .rename(columns=column_names)
)

In [172]:
# Preview the first rows of the selected and renamed columns
rec.head()

Unnamed: 0,nom_rue,no_maison,proprietaire_nom,chef_prenom,chef_nom,chef_annee_naissance,epouse_nom,epouse_annee_naissance,enfants_dans_la_commune_prenom,enfants_annee_naissance,chef_origine,chef_annee_arrivee,chef_vocation,pensionnaires_prenom,pensionnaires_nom,pensionnaires_origine,page
1,marterey,1,weidmann,georges henri,weidmann,1764,·,·,·,·,tolochenaz,1786,charon,louise|anna|francois|jn adam,weidmann|la maurer|kirsner|reverber,tolochenaz|frutigen|cirol|bavarois,1
2,marterey,2,collioud,louis,collioud,17936,blanchoud,1800,marie|henri|charles,1822|1823|1825,rolle,1821,marechal-ferrant,louis|marc,henny|bourgeois,montherod|rolle,1
3,marterey,3,fiaux,louise,piot,1785,neissance,·,benjamin,1815,pailly,nee,lingere,·,·,·,1
4,marterey,·,fiaux,monsieur,de gingins,·,seigneux,·,·,·,gingins,·,rentier,louise|marianne|louis,oswald|thelin|vannod,thoune|mex|orny,1
5,marterey,·,fiaux,louis vincent,lavanchy,1802,baudet,1805,·,·,lutry,ne,garcon voiturier,·,·,·,1


In [173]:
# Inspect the dtype pandas inferred for each column
rec.dtypes

nom_rue                           object
no_maison                         object
proprietaire_nom                  object
chef_prenom                       object
chef_nom                          object
chef_annee_naissance              object
epouse_nom                        object
epouse_annee_naissance            object
enfants_dans_la_commune_prenom    object
enfants_annee_naissance           object
chef_origine                      object
chef_annee_arrivee                object
chef_vocation                     object
pensionnaires_prenom              object
pensionnaires_nom                 object
pensionnaires_origine             object
page                               int64
dtype: object

In [174]:
# Number of records (rows) loaded
len(rec)

3674

## Matching house nodes by the chief's vocation

In this section, every node represents a house, and an edge connects two houses whose chiefs have the same vocation. The graph is unweighted and undirected.

### Get the current data's proximity index

In [90]:
def get_matching_records(rec, on_column):
    """
    List every unordered pair of records that share the same value of
    `on_column`.

    Args:
        rec (pd.DataFrame): population records
        on_column (str or list of str): column(s) to group by
    Returns:
        pd.DataFrame: one row per matching pair, with the index labels of the
            two records in the columns "id_1" and "id_2"; rows are numbered
            0..n-1. Empty (but with both columns) when no two records match.
    """
    # Collect the pairs of all groups first and build the frame once: calling
    # pd.concat inside the loop re-copies the accumulated frame at every step.
    pairs = []
    for _, group in rec.groupby(on_column):
        pairs.extend(itertools.combinations(group.index, 2))

    return pd.DataFrame(pairs, columns=["id_1", "id_2"])

In [94]:
def merge_matching_records(rec, matching, on_column):
    """
    Attach record data to each matching pair.

    Args:
        rec (pd.DataFrame): population records
        matching (pd.DataFrame): dataframe with matching records' ids
            (columns "id_1" and "id_2", referring to the index of `rec`)
        on_column (str or list of str): column(s) the pairs were matched on
    Returns:
        pd.DataFrame: one row per pair with the street of both records
            ("nom_rue_1", "nom_rue_2") and the first record's value of each
            matched column, named without suffix
    """
    # A single column name is handled like a one-element list.
    on_columns = on_column if isinstance(on_column, list) else [on_column]
    first_record_columns = {f"{on_col}_1": on_col for on_col in on_columns}

    # The first merge brings in the data of record id_1; the second one brings
    # in record id_2 and suffixes the now-overlapping columns with _1 / _2.
    with_first = matching.merge(rec, left_on="id_1", right_index=True)
    with_both = with_first.merge(
        rec, left_on="id_2", right_index=True, suffixes=("_1", "_2")
    )

    kept_columns = ["nom_rue_1", "nom_rue_2"] + list(first_record_columns)
    return with_both[kept_columns].rename(columns=first_record_columns)

In [104]:
def clean_matching_data(matching_data, on_column, min_cluster_size_thd=10, invalid_values=("?", "·", "ne", "nee")):
    """
    Remove pairs with placeholder values and pairs from clusters that are too
    small.

    Args:
        matching_data (pd.DataFrame): dataframe with matching records' ids and data
        on_column (str or list of str): column(s) to group by
        min_cluster_size_thd (int): minimum number of rows (pairs) a group of
            `on_column` must have to be kept
        invalid_values (iterable): values of `on_column` that mark a row as
            invalid; such rows are dropped before the size filter
    Returns:
        pd.DataFrame: dataframe with matching records' ids and data after
            cleaning; the input frame is not modified
    """
    invalid = list(invalid_values)
    on_columns = on_column if isinstance(on_column, list) else [on_column]

    new_matching_data = matching_data.copy()
    # A row is dropped as soon as any of the grouping columns is invalid.
    for on_col in on_columns:
        new_matching_data = new_matching_data[~new_matching_data[on_col].isin(invalid)]

    # The size filter counts the rows left after the invalid ones are gone.
    new_matching_data = new_matching_data.groupby(on_column).filter(
        lambda cluster: len(cluster) >= min_cluster_size_thd
    )

    return new_matching_data

In [61]:
def get_proximity_index(matching_data, on_column):
    """
    Compute the proximity index of a set of matching pairs.

    For every value (cluster) of `on_column`, the cluster's proximity index is
    the fraction of its pairs whose two records are on the same road. The
    returned index is the average of these per-cluster values, weighted by the
    number of nodes of each cluster.

    Args:
        matching_data (pd.DataFrame): dataframe with matching records' ids and
            data; must contain "nom_rue_1", "nom_rue_2" and `on_column`
        on_column (str or list of str): column(s) to group by
    Returns:
        float: proximity index (NaN when `matching_data` holds no pairs)
    """
    # 1.0 when both records of the pair are on the same road, else 0.0
    same_road = (matching_data["nom_rue_1"] == matching_data["nom_rue_2"]).astype(float)

    # One pass per cluster: number of same-road pairs and total number of pairs.
    # `assign` works on a copy, so the caller's frame is left untouched.
    clusters = (
        matching_data.assign(same_road=same_road)
        .groupby(on_column)["same_road"]
        .agg(["sum", "size"])
    )
    if clusters.empty:
        return np.nan

    num_edges = clusters["size"].to_numpy()
    proximity_indexes = clusters["sum"].to_numpy() / num_edges

    # A cluster of k nodes has k * (k - 1) / 2 pairs; invert that relation with
    # the quadratic formula to recover k from the number of edges.
    num_nodes = ((1 + np.sqrt(1 + 8 * num_edges)) / 2).astype(int)

    # average the proximity index by the number of nodes
    return np.sum(proximity_indexes * num_nodes) / num_nodes.sum()


In [63]:
def get_proximity_index_from_rec(rec, on_column):
    """
    Run the whole pipeline on raw records: build the matching pairs, attach
    their data, clean them and compute the proximity index.

    Args:
        rec (pd.DataFrame): population records
        on_column (str): column to group by
    Returns:
        float: proximity index
    """
    pairs = get_matching_records(rec, on_column)
    pairs_with_data = merge_matching_records(rec, pairs, on_column)
    cleaned_pairs = clean_matching_data(pairs_with_data, on_column)
    return get_proximity_index(cleaned_pairs, on_column)

# Proximity index observed on the actual records, matching on the vocation
proximity_index = get_proximity_index_from_rec(rec=rec, on_column="chef_vocation")
print(f"Proximity index: {proximity_index}")

Proximity index: 0.07506469380301693


### Get the proximity index on randomly shuffled data

In [66]:
def shuffle_chief_vocations(rec, on_column, random_state=None):
    """
    Randomly permute the values of `on_column` across the records, leaving
    every other column (and the index) unchanged.

    Args:
        rec (pd.DataFrame): population records
        on_column (str): column whose values are shuffled
        random_state (int, optional): seed forwarded to `sample`; pass a value
            to make the shuffle reproducible. With the default (None) every
            call draws a new permutation.
    Returns:
        pd.DataFrame: copy of the population records with the values of
            `on_column` shuffled
    """
    new_rec = rec.copy()
    # `.values` drops the shuffled index so that the values are assigned by
    # position; assigning the Series itself would realign it on the index and
    # undo the shuffle.
    new_rec[on_column] = new_rec[on_column].sample(frac=1, random_state=random_state).values

    return new_rec

In [70]:
def get_random_proximity_index(rec, on_column):
    """
    Compute the proximity index of a population in which the values of
    `on_column` have been randomly reassigned to the records.

    Args:
        rec (pd.DataFrame): population records
        on_column (str): column to group by
    Returns:
        float: proximity index of the shuffled population
    """
    shuffled_rec = shuffle_chief_vocations(rec, on_column)
    return get_proximity_index_from_rec(shuffled_rec, on_column)

In [71]:
def get_bootstrapped_ci_proximity_index(rec, on_column, n=100, alpha=0.05):
    """
    Estimate the range of proximity indexes obtained when the values of
    `on_column` are randomly shuffled across the records.

    Args:
        rec (pd.DataFrame): population records
        on_column (str): column to group by (its values are shuffled)
        n (int): number of shuffled populations to evaluate
        alpha (float): the interval spans the alpha / 2 and 1 - alpha / 2
            percentiles of the simulated indexes (0.05 gives a 95% interval)
    Returns:
        tuple: (lower_bound, upper_bound) of the interval
    """
    proximity_indexes = np.array([
        get_random_proximity_index(rec, on_column) for _ in tqdm(range(n))
    ])

    # get the confidence interval: both percentiles in a single call
    lower_bound, upper_bound = np.percentile(
        proximity_indexes, [alpha / 2 * 100, (1 - alpha / 2) * 100]
    )

    return lower_bound, upper_bound

In [52]:
# Interval of the proximity index when vocations are shuffled at random
lower_bound, upper_bound = get_bootstrapped_ci_proximity_index(rec=rec, on_column="chef_vocation")
print(f" The 95% confidence interval for the proximity index is [{lower_bound}, {upper_bound}]")

100%|██████████| 100/100 [03:19<00:00,  1.99s/it]

 The 95% confidence interval for the proximity index is [0.03137840435947604, 0.03465384562253532]





In summary, the proximity index describes how much houses whose chiefs share the same vocation tend to be close to each other (on the same road). With a proximity index of $1$, all houses with the same chief's vocation are on the same road. With a proximity index of $0$, all houses with the same chief's vocation are on different roads. The proximity index of our dataset is about $0.075$, whereas a random shuffle of the dataset (still preserving the same prevalence of each vocation) has a confidence interval of $[0.031, 0.035]$.

This finding suggests that there is some degree of non-randomness in the distribution of houses with the same chief's vocation. However, it's important to be cautious when interpreting the results of the proximity index. This measure only considers one aspect of the spatial distribution of houses, and it's possible that other factors could be influencing the clustering patterns that are observed. Additionally, the proximity index only considers the distribution of houses with the same chief's vocation, and it's possible that other factors could also be clustering in the same way. Therefore, it's important to consider these potential confounding factors when interpreting the results of the proximity index.

## Matching house nodes by the chief's origin

In this section, every node represents a house, and an edge connects two houses whose chiefs have the same origin. The graph is unweighted and undirected.

### Get the current data's proximity index

In [65]:
# Proximity index observed on the actual records, matching on the origin
proximity_index = get_proximity_index_from_rec(rec=rec, on_column="chef_origine")
print(f"Proximity index: {proximity_index}")

Proximity index: 0.052897171650312456


### Get the proximity index on randomly shuffled data

In [74]:
# Interval of the proximity index when origins are shuffled at random
lower_bound, upper_bound = get_bootstrapped_ci_proximity_index(rec=rec, on_column="chef_origine")
print(f" The 95% confidence interval for the proximity index is [{lower_bound}, {upper_bound}]")

100%|██████████| 100/100 [10:21<00:00,  6.22s/it]

 The 95% confidence interval for the proximity index is [0.02982299106572898, 0.03457017794819724]





There also is a significant difference when taking into account the chief's origin. This might lead us to believe that people with the same origin tend to live in the same area.

## Matching house nodes by the chief's year of arrival

### Get the current data's proximity index

In [75]:
# Proximity index observed on the actual records, matching on the year of arrival
proximity_index = get_proximity_index_from_rec(rec=rec, on_column="chef_annee_arrivee")
print(f"Proximity index: {proximity_index}")

Proximity index: 0.06773890872022503


### Get the proximity index on randomly shuffled data

In [77]:
# Interval of the proximity index when years of arrival are shuffled at random
lower_bound, upper_bound = get_bootstrapped_ci_proximity_index(rec=rec, on_column="chef_annee_arrivee")
print(f" The 95% confidence interval for the proximity index is [{lower_bound}, {upper_bound}]")

100%|██████████| 100/100 [15:50<00:00,  9.50s/it]

 The 95% confidence interval for the proximity index is [0.029857567328123336, 0.0363242971366465]





The same happens when we consider the chief's year of arrival. This might lead us to believe that people who arrived in the same year tend to live in the same area. But this might also be due to the fact that Lausanne was expanding, and people were moving to the new areas at about the same time.

## Matching house nodes by the chief's year of birth

### Get the current data's proximity index

In [85]:
# Proximity index observed on the actual records, matching on the chief's year of birth
proximity_index = get_proximity_index_from_rec(rec=rec, on_column="chef_annee_naissance")
print(f"Proximity index: {proximity_index}")

Proximity index: 0.03618268612132012


### Get the proximity index on randomly shuffled data

In [86]:
# Interval of the proximity index when the chiefs' years of birth are shuffled at random
lower_bound, upper_bound = get_bootstrapped_ci_proximity_index(rec=rec, on_column="chef_annee_naissance", n=100)
print(f" The 95% confidence interval for the proximity index is [{lower_bound}, {upper_bound}]")

100%|██████████| 100/100 [03:39<00:00,  2.20s/it]

 The 95% confidence interval for the proximity index is [0.031240412052650275, 0.034157239499055116]





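When taking into account the chief's year of birth, the difference is much smaller: the observed proximity index (about $0.036$) lies only slightly above the upper bound of the interval obtained from shuffled data (about $0.034$). People born in the same year may live in the same area slightly more often than chance would predict, but the effect is far weaker than for the vocation, the origin, or the year of arrival.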

## Matching house nodes by the chief's wife year of birth

### Get the current data's proximity index

In [87]:
# Proximity index observed on the actual records, matching on the wife's year of birth
proximity_index = get_proximity_index_from_rec(rec=rec, on_column="epouse_annee_naissance")
print(f"Proximity index: {proximity_index}")

Proximity index: 0.03327712832001548


### Get the proximity index on randomly shuffled data

In [88]:
# Interval of the proximity index when the wives' years of birth are shuffled at random
lower_bound, upper_bound = get_bootstrapped_ci_proximity_index(rec=rec, on_column="epouse_annee_naissance", n=100)
print(f" The 95% confidence interval for the proximity index is [{lower_bound}, {upper_bound}]")

100%|██████████| 100/100 [03:41<00:00,  2.21s/it]

 The 95% confidence interval for the proximity index is [0.030988005992089115, 0.034623336483340456]





To conclude, the proximity index for the year of birth of the chief's wife (about $0.033$) falls inside the interval obtained from shuffled data, so this is not a significant factor in the clustering of houses, and we cannot use this information to cluster houses.

## Using the three most significant factors to cluster houses

In [None]:
# get the matching records
matching_columns = ['chef_vocation', 'chef_origine', 'chef_annee_arrivee']
matching_columns_weights = [1, 1, 1]
assert len(matching_columns) == len(matching_columns_weights), \
    "one weight is needed per matching column"

# Enumerate every unordered pair of records exactly once, as positions into
# `rec`; the pairs come in the same order as itertools.combinations(rec.index, 2).
record_ids = rec.index.to_numpy()
first_pos, second_pos = np.triu_indices(len(record_ids), k=1)

# The score of a pair is the sum of the weights of the columns on which its two
# records carry the same value. Comparing integer codes for all pairs at once
# replaces one Python-level `.loc` update per matching pair.
weights = np.asarray(matching_columns_weights)
score = np.zeros(len(first_pos), dtype=np.result_type(weights.dtype, np.int64))
for matching_column, weight in zip(tqdm(matching_columns), weights):
    # equal values get equal codes; missing values get -1 and never match,
    # like groupby, which drops them
    codes, _ = pd.factorize(rec[matching_column])
    same_value = (codes[first_pos] == codes[second_pos]) & (codes[first_pos] >= 0)
    score += weight * same_value

matching = pd.DataFrame({
    "id_1": record_ids[first_pos],
    "id_2": record_ids[second_pos],
    "score": score,
})
matching = matching \
    .sort_values(by="score", ascending=False) \
    .reset_index(drop=True)
matching.head()

In [207]:
# get the matching data: join the records of both ends of every pair, then keep
# the ids, the score and the first record's values of the matching columns
first_record_columns = {f"{on_col}_1": on_col for on_col in matching_columns}
matching_data = (
    matching
    .merge(rec, left_on="id_1", right_index=True)
    .merge(rec, left_on="id_2", right_index=True, suffixes=("_1", "_2"))
    [["id_1", "id_2", "score", *first_record_columns]]
    .rename(columns=first_record_columns)
)
matching_data.head()

Unnamed: 0,id_1,id_2,score,chef_vocation,chef_origine,chef_annee_arrivee
0,937,2746,3,rentiere,lausanne,·
162182,265,2746,2,lingere,lausanne,·
402861,920,2746,1,·,·,·
1022,2398,2746,3,rentiere,lausanne,·
1002941,1311,2746,1,·,·,·


In [208]:
# clean the matching data
invalid_values = ["?", "·", "ne", "nee"]
min_cluster_size_thd = 10

# drop the pairs with an invalid value in any of the matching columns
has_invalid_value = matching_data[matching_columns].isin(invalid_values).any(axis=1)
matching_data = matching_data[~has_invalid_value]
# keep only the value combinations that are shared by enough pairs
matching_data = matching_data.groupby(matching_columns).filter(
    lambda cluster: len(cluster) >= min_cluster_size_thd
)
# a pair is an edge only if its records match on at least one column
matching_data = matching_data[matching_data["score"] > 0]
edge_list = matching_data[["id_1", "id_2", "score"]]
edge_list.head()

Unnamed: 0,id_1,id_2,score
1963065,333,2746,1
1967052,332,2746,1
2350001,219,2746,1
1938429,2315,2746,1
2547337,2086,2746,1


In [209]:
# One row per record: its id (taken from the index of `rec`) and its street
node_list = rec.reset_index().rename(columns={"index": "id"})
node_list = node_list[["id", "nom_rue"]]
node_list.head()

Unnamed: 0,id,nom_rue
0,1,marterey
1,2,marterey
2,3,marterey
3,4,marterey
4,5,marterey


In [210]:
# Export the graph as two CSV files (nodes and weighted edges), without the
# DataFrame index
node_list.to_csv(path_or_buf="data/node_list.csv", index=False)
edge_list.to_csv(path_or_buf="data/edge_list.csv", index=False)