# French Demographics Data

In [1]:
import pandas as pd

## *Statistiques locales* by department. [INSEE](https://www.insee.fr/fr/statistiques/6013867) 2022

In [5]:
# Read CSV without setting a header
stats_locale = pd.read_csv("Demographics/INSEE/stats_locales_2023.csv", sep=';', header=None)
# Use the third row as column headers
stats_locale.columns = stats_locale.iloc[2]
# Drop the metadata rows
stats_locale = stats_locale.drop([0, 1, 2]).reset_index(drop=True)

stats_locale.head()

2,Code,Libellé,Nb de pers. non scolarisées de 15 ans ou + 2021,Salaire net horaire moyen 2022,Population municipale 2022,Densité de population (historique depuis 1876) 2021,Taux de pauvreté 2021,Taux de chômage annuel moyen 2023
0,1,Ain,485536,15.96,671289,115.1,10.8,5.5
1,2,Aisne,394405,14.46,525558,71.7,18.8,10.5
2,3,Allier,263288,14.19,334715,45.6,16.2,7.8
3,4,Alpes-de-Haute-Provence,129945,14.6,167179,24.0,17.1,8.1
4,5,Hautes-Alpes,110528,13.99,141677,25.4,14.7,6.6


In [6]:
stats_locale.columns.tolist()

['Code',
 'Libellé',
 'Nb de pers. non scolarisées de 15 ans ou + 2021',
 'Salaire net horaire moyen 2022',
 'Population municipale 2022',
 'Densité de population (historique depuis 1876) 2021',
 'Taux de pauvreté 2021',
 'Taux de chômage annuel moyen 2023']

In [11]:
stats_locale = stats_locale.rename(columns={
    'Code': 'department_num',
    'Libellé': 'department',
    'Taux de pauvreté 2021': 'poverty_rate(%)',
    'Taux de chômage annuel moyen 2023': 'average_annual_unemployment_rate(%)',
    'Salaire net horaire moyen 2022': 'average_net_hourly_wage(€)',
    'Population municipale 2022': 'municipal_population',
    'Densité de population (historique depuis 1876) 2021': 'population_density(inhabitants/sq_km)',
    'Nb de pers. non scolarisées de 15 ans ou + 2021': 'non_schooled_persons_15_and_over'
})

In [12]:
stats_locale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 8 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   department_num                         101 non-null    object
 1   department                             101 non-null    object
 2   non_schooled_persons_15_and_over       101 non-null    object
 3   average_net_hourly_wage(€)             101 non-null    object
 4   municipal_population                   101 non-null    object
 5   population_density(inhabitants/sq_km)  101 non-null    object
 6   poverty_rate(%)                        101 non-null    object
 7   average_annual_unemployment_rate(%)    101 non-null    object
dtypes: object(8)
memory usage: 6.4+ KB


In [13]:
# Remove leading and trailing whitespace for all string columns
stats_locale = stats_locale.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

In [14]:
# Define columns to be converted to numeric type
numeric_cols = ['average_annual_unemployment_rate(%)',
                'average_net_hourly_wage(€)',
                'municipal_population',
                'population_density(inhabitants/sq_km)',
                'non_schooled_persons_15_and_over']

# Convert the columns to numeric, setting any errors to NaN
for col in numeric_cols:
    stats_locale[col] = pd.to_numeric(stats_locale[col], errors='coerce')

In [15]:
# Check if there are any NaN values in the DataFrame
nan_values = stats_locale.isnull().sum()
print(nan_values[nan_values > 0])

2
non_schooled_persons_15_and_over         1
average_net_hourly_wage(€)               1
municipal_population                     1
population_density(inhabitants/sq_km)    1
average_annual_unemployment_rate(%)      1
dtype: int64


In [16]:
# Return that row of data
nan_rows = stats_locale[stats_locale.isna().any(axis=1)]
nan_rows

2,department_num,department,non_schooled_persons_15_and_over,average_net_hourly_wage(€),municipal_population,population_density(inhabitants/sq_km),poverty_rate(%),average_annual_unemployment_rate(%)
100,976,Mayotte,,,,,N/A - résultat non disponible,


*`Mayotte`* is an overseas department which will be removed from the dataset as we are focusing our analysis on mainland France

In [17]:
stats_locale.tail(10)

2,department_num,department,non_schooled_persons_15_and_over,average_net_hourly_wage(€),municipal_population,population_density(inhabitants/sq_km),poverty_rate(%),average_annual_unemployment_rate(%)
91,91,Essonne,911594.0,18.42,1324546.0,728.1,13.9,6.4
92,92,Hauts-de-Seine,1148994.0,26.79,1647435.0,9312.1,12.4,5.9
93,93,Seine-Saint-Denis,1121318.0,15.47,1681725.0,7064.6,28.4,10.3
94,94,Val-de-Marne,982682.0,19.45,1419531.0,5776.3,17.2,7.1
95,95,Val-d'Oise,855651.0,17.56,1270845.0,1008.6,17.7,8.0
96,971,Guadeloupe,286147.0,15.21,383569.0,236.0,N/A - résultat non disponible,18.6
97,972,Martinique,277245.0,15.02,361019.0,319.8,26.8,10.8
98,973,Guyane,166297.0,15.43,288382.0,3.4,N/A - résultat non disponible,14.0
99,974,La Réunion,602199.0,14.26,881348.0,347.9,36.1,19.0
100,976,Mayotte,,,,,N/A - résultat non disponible,


We remove the overseas departments

In [18]:
# Remove the last 5 entries from the DataFrame (Overseas territories)
stats_locale = stats_locale.iloc[:-5]
print(f"Shape of stats_locale: {stats_locale.shape}")

Shape of stats_locale: (96, 8)


In [19]:
paris = stats_locale[stats_locale['department'] == 'Paris']
paris

2,department_num,department,non_schooled_persons_15_and_over,average_net_hourly_wage(€),municipal_population,population_density(inhabitants/sq_km),poverty_rate(%),average_annual_unemployment_rate(%)
75,75,Paris,1552859.0,27.88,2113705.0,20238.2,15.6,5.7


We need to extrapolate `area(sq_km)` from population density and population

In [20]:
stats_locale['area(sq_km)'] = round(stats_locale['municipal_population'] /
                                                           stats_locale['population_density(inhabitants/sq_km)'], 2)

In [21]:
stats_locale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   department_num                         96 non-null     object 
 1   department                             96 non-null     object 
 2   non_schooled_persons_15_and_over       96 non-null     float64
 3   average_net_hourly_wage(€)             96 non-null     float64
 4   municipal_population                   96 non-null     float64
 5   population_density(inhabitants/sq_km)  96 non-null     float64
 6   poverty_rate(%)                        96 non-null     object 
 7   average_annual_unemployment_rate(%)    96 non-null     float64
 8   area(sq_km)                            96 non-null     float64
dtypes: float64(6), object(3)
memory usage: 6.9+ KB


----
&nbsp;
## GDP by Department. [OECD](https://stats.oecd.org) 2001 - 2020

This data is not as clean as the other two sources

In [22]:
gdp = pd.read_csv("Demographics/GDP_departmental.csv")
gdp.head(5)

Unnamed: 0,Year,Unnamed: 1,Unnamed: 2,2001,2002,2003,2004,2005,2006,2007,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,FR101: Paris,"Euro, Millions",,160 406,161 712,162 368,165 582,172 171,175 198,186 771,...,199 526,204 857,207 534,210 141,214 434,219 964,227 457,237 283,246 937,226 007
1,FR102: Seine-et-Marne,"Euro, Millions",,24 721.5,26 879.9,27 856.6,29 283.9,29 903.1,31 814.7,33 695.6,...,37 628.4,39 139.3,38 546.3,39 244.3,39 934.5,40 635.8,41 513.1,41 968.7,44 271.8,39 478.1
2,FR103: Yvelines,"Euro, Millions",,41 113.9,42 180.8,42 832.7,43 728.5,45 595.8,47 101.2,50 043.1,...,53 645.1,53 823.9,55 041.9,54 497.6,56 771.4,57 769.3,58 659.2,59 728.3,61 249,56 649.9
3,FR104: Essonne,"Euro, Millions",,31 959,33 611.8,35 251,36 274.2,36 045.8,38 293.5,40 579.4,...,42 610.7,43 857.9,47 425.1,47 993.2,49 173.1,50 072.2,52 010.1,53 586.7,55 491.7,53 821.9
4,FR105: Hauts-de-Seine,"Euro, Millions",,99 819.9,105 194,106 868,111 508,118 961,123 114,131 837,...,147 676,151 710,156 217,157 950,159 861,164 572,168 693,177 044,188 096,176 675


Values are in millions of Euros. We lose the two 'Unnamed' columns

In [23]:
# We loose the 'Unnamed' columns
gdp = gdp[[col for col in gdp.columns if 'Unnamed' not in col]]

In [24]:
gdp.head(3)

Unnamed: 0,Year,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,FR101: Paris,160 406,161 712,162 368,165 582,172 171,175 198,186 771,189 261,182 178,...,199 526,204 857,207 534,210 141,214 434,219 964,227 457,237 283,246 937,226 007
1,FR102: Seine-et-Marne,24 721.5,26 879.9,27 856.6,29 283.9,29 903.1,31 814.7,33 695.6,35 794.1,35 378.1,...,37 628.4,39 139.3,38 546.3,39 244.3,39 934.5,40 635.8,41 513.1,41 968.7,44 271.8,39 478.1
2,FR103: Yvelines,41 113.9,42 180.8,42 832.7,43 728.5,45 595.8,47 101.2,50 043.1,52 799.2,50 204.9,...,53 645.1,53 823.9,55 041.9,54 497.6,56 771.4,57 769.3,58 659.2,59 728.3,61 249,56 649.9


We need to strip the non-breaking space in the numerical data.

In [25]:
def strip_nbsp(cell):
    if isinstance(cell, str):
        return cell.replace('\xa0', '')
    return cell

In [26]:
gdp = gdp.applymap(strip_nbsp)

If a column represent a year then it contains numeric data

In [27]:
for col in gdp.columns:
    if col.isdigit():  # Check if the column name is a year (4-digit number)
        gdp[col] = gdp[col].astype(float)  # Convert to float

In [28]:
# Only the most recent year is required
gdp = gdp[['Year', '2020']]
gdp = gdp.rename(columns={'Year': 'department', '2020': 'GDP_millions(€)'})

In [29]:
gdp.tail(10)

Unnamed: 0,department,GDP_millions(€)
91,FRL04: Bouches-du-Rhône,73878.7
92,FRL05: Var,27056.5
93,FRL06: Vaucluse,17880.2
94,FRM01: Corse-du-Sud,4783.9
95,FRM02: Haute-Corse,4300.08
96,FRY10: Guadeloupe,9219.3
97,FRY20: Martinique,8788.07
98,FRY30: French Guiana,4424.65
99,FRY40: La Réunion,18973.0
100,FRY50: Mayotte,2711.64


Overseas departments occupy the last five rows and can be dropped

In [30]:
gdp = gdp.iloc[:-5]
print(f"Shape of GDP: {gdp.shape}")

Shape of GDP: (96, 2)


In [31]:
# Remove the initial part of the string ending with a colon followed by whitespace
gdp['department'] = gdp['department'].str.replace(r'^[^:]+:\s*', '', regex=True)

In [32]:
gdp.head()

Unnamed: 0,department,GDP_millions(€)
0,Paris,226007.0
1,Seine-et-Marne,39478.1
2,Yvelines,56649.9
3,Essonne,53821.9
4,Hauts-de-Seine,176675.0


----
&nbsp;
### Merging `departments`, `stats_locale` & `gdp`

We aim to sort all DataFrames on `department_num` and therefore merge `gdp` with `departments` first.

In [35]:
departments = pd.read_csv('Demographics/departments.csv')
departments.head(5)

Unnamed: 0,department_num,department,capital,region
0,1,Ain,Bourg-en-Bresse,Auvergne-Rhône-Alpes
1,2,Aisne,Laon,Hauts-de-France
2,3,Allier,Moulins,Auvergne-Rhône-Alpes
3,4,Alpes-de-Haute-Provence,Digne-les-Bains,Provence-Alpes-Côte d'Azur
4,5,Hautes-Alpes,Gap,Provence-Alpes-Côte d'Azur


In [36]:
# We check unique values
set1 = set(departments['department'].unique())
set2 = set(gdp['department'].unique())
print(set1 == set2)  # This should print True if all sets are equal

True


In [37]:
gdp_departments = departments.merge(gdp, on='department', how='inner')

We now merge with `stats_locale`

In [38]:
# We sort all dfs on 'department_num'
gdp_departments = gdp_departments.sort_values('department_num')
stats_locale = stats_locale.sort_values('department_num')

We check if the name `department` is equal in the two dataframes

In [39]:
set1 = set(gdp_departments['department'].unique())
set2 = set(stats_locale['department'].unique())
print(set1 == set2)

True


In [40]:
from functools import reduce

# List of dataframes to merge
dfs = [gdp_departments, stats_locale]

# Use reduce and merge to merge both dataframes
demographics = reduce(lambda left,right: pd.merge(left,right,on=['department', 'department_num']), dfs)

Will drop `non_schooled_persons_15_and_over` as I feel it's the weakest statistic

In [41]:
demographics = demographics.drop(columns="non_schooled_persons_15_and_over")
demographics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 0 to 95
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   department_num                         96 non-null     object 
 1   department                             96 non-null     object 
 2   capital                                96 non-null     object 
 3   region                                 96 non-null     object 
 4   GDP_millions(€)                        96 non-null     float64
 5   average_net_hourly_wage(€)             96 non-null     float64
 6   municipal_population                   96 non-null     float64
 7   population_density(inhabitants/sq_km)  96 non-null     float64
 8   poverty_rate(%)                        96 non-null     object 
 9   average_annual_unemployment_rate(%)    96 non-null     float64
 10  area(sq_km)                            96 non-null     float64
dtypes: float

We will calculate `GDP_per_capita(€)` as it's a more meaningful statistic for comparison

In [42]:
demographics['GDP_per_capita(€)'] = round((demographics['GDP_millions(€)'] * 1e6) / demographics['municipal_population'], 2)

In [43]:
print(demographics.columns.tolist())

['department_num', 'department', 'capital', 'region', 'GDP_millions(€)', 'average_net_hourly_wage(€)', 'municipal_population', 'population_density(inhabitants/sq_km)', 'poverty_rate(%)', 'average_annual_unemployment_rate(%)', 'area(sq_km)', 'GDP_per_capita(€)']


In [44]:
# Reorder the columns
new_order = ['department_num', 'department', 'capital', 'region', 'GDP_millions(€)', 'GDP_per_capita(€)',
             'poverty_rate(%)', 'average_annual_unemployment_rate(%)', 'average_net_hourly_wage(€)',
             'municipal_population', 'population_density(inhabitants/sq_km)', 'area(sq_km)']

In [45]:
demographics = demographics[new_order]

In [46]:
demographics.head()

Unnamed: 0,department_num,department,capital,region,GDP_millions(€),GDP_per_capita(€),poverty_rate(%),average_annual_unemployment_rate(%),average_net_hourly_wage(€),municipal_population,population_density(inhabitants/sq_km),area(sq_km)
0,1,Ain,Bourg-en-Bresse,Auvergne-Rhône-Alpes,16726.4,24916.84,10.8,5.5,15.96,671289.0,115.1,5832.22
1,2,Aisne,Laon,Hauts-de-France,12016.4,22864.08,18.8,10.5,14.46,525558.0,71.7,7329.96
2,3,Allier,Moulins,Auvergne-Rhône-Alpes,8278.98,24734.42,16.2,7.8,14.19,334715.0,45.6,7340.24
3,4,Alpes-de-Haute-Provence,Digne-les-Bains,Provence-Alpes-Côte d'Azur,4154.79,24852.34,17.1,8.1,14.6,167179.0,24.0,6965.79
4,5,Hautes-Alpes,Gap,Provence-Alpes-Côte d'Azur,3756.22,26512.56,14.7,6.6,13.99,141677.0,25.4,5577.83


The `demographics` data can now be exported

In [47]:
# Export the data to a csv file
demographics.to_csv('Demographics/demographics_2023.csv', index=False)