In [1]:
import pandas as pd
import numpy as np
from IPython.core.display import HTML
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
from ef_dicts import station_type_map
from ef_dicts_2 import station_name_map
from ef_dict_rer import rer_name_map
from ef_sncf import sncf_dict
import json
from unidecode import unidecode

<h4>Stylesheet</h4>

In [2]:
# Notebook-wide stylesheet, injected once through an HTML <style> element (last line of the cell).
# Every non-zero length carries an explicit unit: unitless values such as `width: 50` or
# `font-size: 16` are invalid CSS and are silently discarded by browsers.
# NOTE(review): `px` was assumed for the two previously unitless values — confirm the intended unit.
css_style = """
h1 {
    color: black;
    font-family: 'Segoe UI', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif;
    font-size: 35px !important;
    padding-bottom: 10px;
    padding-top: 10px;
    border-bottom: 5px solid navy;
    border-top: 5px solid navy;
    font-variant: small-caps;
    text-align: center;
    margin-bottom: 25px;
}

h3 {
    color : dimgray;
}

.all {
/*This class is the default class for <div> so it does not interact with jupyter notebook structure */
text-align: justify;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin-right: 2px;
}

.titlediv {
    /*This class is the default class for <div> so it does not interact with jupyter notebook structure */
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    border: 5px solid navy;
    padding-top: 30px;
    padding-bottom: 30px;
    padding-left: 5px;
    padding-right: 5px;
    margin-bottom: 10px;
    }

.titlediv_2 {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    text-align: center !important;
    }

.titlep {
    font-size : 50px;
    text-align: center !important;
    font-variant: small-caps;
}

.titlep_2 {
    font-size : 30px;
    text-align: center !important;
    margin-top: 10px;
}

.title_sp {
    text-align: center !important;
    font-size: 20px !important;
    font-weight: bold;
}

.signature {
    margin-top: 60px;
    padding-top: 15px;
    border-top: 2px solid black;
    text-align: right !important;
    font-family:'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.obj {
    text-align: justify;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    margin-right: 100px;
    border: 2px solid lightcoral;
    padding: 15px;
}

p {
    text-align: justify !important;
}

.intro {
    font-style: italic;
}

.conclusion {
    border: 5px solid navy;
    margin-top: 50px;
    padding: 10px;
}

.conclusion h3 {
    color:black;
    font-variant: small-caps;
}

.small-caps {
    font-variant: small-caps;
}

table {
    margin-right: 10px !important;
    font-size: 14px !important;
    text-align: center !important;
}

.table_1 {
    border: 3px solid black !important;
    width: 100% !important;
}

.table_1 td, .table_1 th {
    border: 1px solid black;
}

.head_tr {
    background-color: dimgray !important;
    color: white;
    border: 3px solid black !important;
}

th, td {
    padding: 8px !important;
    text-align: center !important;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    border: 1.5px solid black;
}


.recap_table td, .recap_table th {
    height: 50px;
    width: 180px;
}

.recap_table th {
    font-size: 18px;
}

.col_1 {
    text-align: right !important;
    width: 50px !important;
}

.col_group {
    width: 200px;
    background-color: #d4c9df !important;
    text-align: center;
    height: 5px !important;
    font-size: 16px !important;
    font-weight: bold;
}

.columns {
    display: flex;
    flex-wrap: wrap;
}

.column {
    flex: 1;
    padding: 10px;
    text-align: justify;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    margin-right: 50px;
}
"""

HTML(f'<style>{css_style}</style>')

In [3]:
display(HTML(f"""
<div class = 'all'>
    <h1>Introduction</h1>
    <h2>Origin of Data</h2>
    <p>
        Data on metro and RER A &amp; B were fetched from <a href = 'https://data.ratp.fr/explore/?sort=modified' target = '_blank'>RATP</a>'s website.
    </p>
    <h3>Included Datasets</h3>
    <ul>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2021/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2021</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2020/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2020</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2019/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2019</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2018/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2018</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2017/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2017</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2016/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2016</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2015/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2015</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2014/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2014</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre</a></li>
    </ul>
</div>
"""))

In [140]:
# Importing RATP data: Metro and RER A/B ridership.
# One semicolon-separated export per year, listed oldest first; the 2013 export is the
# only file whose name carries no year suffix.
_ratp_csv_paths = (
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2014.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2015.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2016.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2017.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2018.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2019.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2020.csv',
    'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre-2021.csv',
)

(
    ratp_2013, ratp_2014, ratp_2015, ratp_2016, ratp_2017,
    ratp_2018, ratp_2019, ratp_2020, ratp_2021,
) = (pd.read_csv(csv_path, sep = ';') for csv_path in _ratp_csv_paths)

# Manual clean-up of individual exports. Rows are addressed by their label in the raw
# file, so these fixes are only valid for the exact CSV versions stored in the repository.
ratp_2013.loc[308, 'Station'] = "MAIRIE DE MONTROUGE"
ratp_2020.loc[326, 'Station'] = 'SAINT-OUEN'
ratp_2021.loc[93, 'Station'] = 'SAINT-OUEN'
ratp_2016 = ratp_2016.drop(columns = ['Column 12', 'Column 13', 'Column 14', 'Column 15'])

# Yearly frames in chronological order (position 0 is 2013, position 8 is 2021).
ratp_13_21 = [
    ratp_2013, ratp_2014, ratp_2015,
    ratp_2016, ratp_2017, ratp_2018,
    ratp_2019, ratp_2020, ratp_2021,
]

# Creating a feature that stores the number of lines stopping at each metro station.
# Each yearly frame also gets year-suffixed rank/traffic columns and a short 'Arr' column name.
years = range(2013, 2022)

_correspondance_cols = ['Correspondance_1', 'Correspondance_2', 'Correspondance_3', 'Correspondance_4', 'Correspondance_5']


def _count_lines(row):
    """Return the highest N whose 'Correspondance_N' cell is filled, or 0 when none is."""
    for position in range(len(_correspondance_cols), 0, -1):
        if pd.notna(row[_correspondance_cols[position - 1]]):
            return position
    return 0


for i, year in enumerate(years):
    yearly = ratp_13_21[i].rename(columns={'Rang': f'Rang_{year}', 'Trafic': f'Traffic_{year}', 'Arrondissement pour Paris' : 'Arr'})
    yearly[f'Lignes_{year}'] = yearly.apply(_count_lines, axis = 1)

    # The individual connection columns are no longer needed once the count exists.
    ratp_13_21[i] = yearly.drop(_correspondance_cols, axis = 1)

# Merging RATP dfs as one containing traffic data from 2013 to 2021.
#
# Every yearly frame carries an 'Arr' column. Merging them as-is makes pandas add the
# '_x'/'_y' suffixes again and again, which produces duplicate 'Arr_x'/'Arr_y' labels:
# a (silenced) deprecation warning on older pandas and a MergeError on pandas >= 2.0.
# Only the 'Arr' of the most recent frame survived that process, so 'Arr' is removed
# from every other frame up front and the merges never collide.
_merge_keys = ['Station', 'Réseau', 'Ville']

df = ratp_13_21[0].drop(columns = 'Arr', errors = 'ignore')

for _yearly in ratp_13_21[1:-1]:
    df = pd.merge(df, _yearly.drop(columns = 'Arr', errors = 'ignore'), how = 'outer', on = _merge_keys)

# The last frame (2021) is merged with its 'Arr' column intact.
df = pd.merge(df, ratp_13_21[-1], how = 'outer', on = _merge_keys)

# Manual fix addressed by row label: the arrondissement is missing for this row in the source data.
# NOTE(review): label 65 is only valid for the exact input files used here — confirm after any data refresh.
df.loc[65, 'Arr'] = 5

# Keep only the most recent line count (2021) and drop every yearly rank.
df = df.drop(
    columns = [f'Lignes_{_year}' for _year in range(2013, 2021)]
            + [f'Rang_{_year}' for _year in range(2013, 2022)]
)

# French -> English column names.
col_trad = {
    "Réseau" : "Network",
    "Ville" : "City",
    "Lignes_2021" : "Lines",
}

df = df.rename(columns = col_trad)

# Metro station coordinates. `gps_raw` is a second, untouched read of the same file;
# it is not used in the visible part of this cell.
gps = pd.read_csv('original_datasets/metro-france.csv', sep = ';')
gps_raw = pd.read_csv('original_datasets/metro-france.csv', sep = ';')

gps['dept'] = gps.apply(lambda row: int(str(row['Commune code Insee'])[:2]), axis = 1)     # Extracting the dept number from the first two characters of the Insee code
gps = gps[gps['dept'].isin([75, 91, 92, 93, 94, 95, 77, 78])]                              # Limiting the data to Île-de-France depts
gps = gps[gps['finish'] == 1]                                                              # Dropping data from stations still under construction
gps = gps.drop_duplicates(subset = ['Libelle station'])                                    # In this dataset, there are multiple rows if a station hosts a connection
# Translate the dataset's station labels to the names used in the RATP frames;
# labels absent from `station_name_map` become NaN.
gps['Station'] = gps['Libelle station'].map(station_name_map)

# Postcode reference table: keep the first row found for each postcode.
cp = pd.read_csv('original_datasets/postcodes.csv', sep = ';', encoding='latin1')
cp = cp.drop('Ligne_5', axis = 1)
cp = cp.drop_duplicates(subset = ['Code_postal'])
cp = cp.rename(columns = {"Nom_de_la_commune" : "City", "Code_postal": "Postcode"})

# Attach a postcode to every metro station through the Insee commune code
# (cast to str on the gps side before joining).
# NOTE(review): a left merge repeats a gps row when its Insee code matches several rows of `cp`;
# later steps address rows by hard-coded label, so verify this does not shift them.
gps = gps.rename(columns = {'Commune code Insee' : 'Insee'})
gps['Insee'] = gps.apply(lambda row: str(row['Insee']), axis = 1)
cp = cp.rename(columns = {"#Code_commune_INSEE" : "Insee"})
gps = pd.merge(left = gps, right = cp[["Insee", "Postcode"]], on = "Insee", how = "left")

# Île-de-France station locations. `gps2_raw` is a second, untouched read of the same file;
# it is not used in the visible part of this cell.
gps2 = pd.read_csv('original_datasets/gares-idf.csv', sep = ';')
gps2_raw = pd.read_csv('original_datasets/gares-idf.csv', sep = ';')

# Keep only rows whose commercial network ('res_com') is one of the RER or TRAIN lines listed here.
gps2 = gps2[gps2['res_com'].isin(["RER A", "RER B", "RER C", "RER D", "RER E", "TRAIN H", "TRAIN J", "TRAIN K", "TRAIN L", "TRAIN N",
                      "TRAIN P", "TRAIN R", "TRAIN U"])]
# 'Geo Shape' holds a JSON document; its "coordinates" entry is read as [longitude, latitude].
gps2["Latitude"] = gps2.apply(lambda row: json.loads(row['Geo Shape'])["coordinates"][1], axis = 1)
gps2["Longitude"] = gps2.apply(lambda row: json.loads(row['Geo Shape'])["coordinates"][0], axis = 1)
# One row per station name (the first occurrence wins), then map to the names used in the RATP frames.
gps2 = gps2[["nom_long", "Latitude", "Longitude", "res_com"]].drop_duplicates(subset = "nom_long")
gps2['Station'] = gps2['nom_long'].map(rer_name_map)

# Stack metro and RER/train coordinates into a single lookup table.
coords = pd.concat([gps[['Station', 'Longitude', 'Latitude']], gps2[['Station', 'Longitude', 'Latitude']]])

# Attach coordinates to the ridership table.
# NOTE(review): `coords` is not de-duplicated on 'Station'; a name present several times
# would repeat the matching df row — verify, since rows are dropped by label right below.
df = pd.merge(left = df, right = coords, on = "Station", how = "left")
 
df = df.drop(138)            # Drop Funiculaire station (addressed by row label)
 


# SNCF station ridership. `trains_raw` is a second, untouched read of the same file;
# it is not used in the visible part of this cell.
trains = pd.read_csv('original_datasets/frequentation-gares.csv', sep = ';')
trains_raw = pd.read_csv('original_datasets/frequentation-gares.csv', sep = ';')

# Stations that receive the placeholder department 99 below and are therefore kept by the
# department filter regardless of their postcode.
# NOTE(review): the name suggests stations located outside Île-de-France — confirm.
out_idf = [
    "Boran-sur-Oise", "Précy-sur-Oise", "Saint-Leu-d'Esserent", "Creil", "Chantilly - Gouvieux",
    "Orry-la-Ville - Coye", "La Borne Blanche", "Gisors", "Trie-Château", "Chaumont-en-Vexin",
    "Liancourt-Saint-Pierre", "Lavilletertre", "Vernon - Giverny", "Marchezais - Broué", 
    "Dreux", "Malesherbes", "Montargis", "Ferrières - Fontenay", "Dordives", "Château-Thierry", 
    "Chézy-sur-Marne", "Nogent-l'Artaud - Charly", "La Ferté-Milon", "Mareuil-sur-Ourcq",
    "Crépy-en-Valois", "Ormoy-Villers", "Nanteuil-le-Haudouin", "Le Plessis-Belleville"
]

# Department number: 99 for the stations listed above, otherwise the first two digits of the
# postcode, or only the first digit when the postcode is below 10000
# (presumably a five-digit code whose leading zero was lost on import — TODO confirm).
trains['dept'] = trains.apply(lambda row: 99 if row['Nom de la gare'] in out_idf
                              else int(str(row['Code postal'])[:2]) if row['Code postal'] >= 10000
                              else int(str(row['Code postal'])[:1]), axis = 1)
trains = trains[trains['dept'].isin([75, 77, 78, 91, 92, 93, 94, 95, 99])]
# Exclude stations whose name mentions T13 or T11.
trains = trains[~trains["Nom de la gare"].str.contains(r".*T13.*")]
trains = trains[~trains["Nom de la gare"].str.contains(r".*T11.*")]
# Translate SNCF station names to the 'nom_long' naming used by `gps2`, to serve as merge key.
trains['nom_long'] = trains['Nom de la gare'].map(sncf_dict)

# Only the "Total Voyageurs <year>" counts are kept; the "Voyageurs + Non voyageurs" totals
# and the identifier/segmentation columns are discarded.
trains = trains.drop(['Total Voyageurs + Non voyageurs 2022', "Total Voyageurs + Non voyageurs 2021", 
                      "Total Voyageurs + Non voyageurs 2020", "Total Voyageurs + Non voyageurs 2019",
                      "Total Voyageurs + Non voyageurs 2018", "Total Voyageurs + Non voyageurs 2017",
                      "Total Voyageurs + Non voyageurs 2016", "Total Voyageurs + Non voyageurs 2015",
                      "Code UIC", "Segmentation DRG"
                     ], axis = 1)

# Bring in coordinates (and the other gps2 columns) for every SNCF station.
trains2 = pd.merge(left = trains, right = gps2, on = 'nom_long', how = 'left')
# Rows are removed by hard-coded label; the list is only valid for the exact input files used here.
trains2 = trains2.drop([131, 154, 205, 208, 213, 223, 237, 262, 278, 327, 358, 287, 241, 242, 375, 382])   # Dropping rows related to tram stations

# Hand-entered latitudes, keyed by SNCF station name, applied to the matching rows below.
missing_lat = {
 'Guillerval' : 48.37512,
 'Gazeran' : 48.62589,
 'Longjumeau' : 48.70218,
 'Monnerville' : 48.34855,
 "Paris Bercy Bourgogne - Pays d'Auvergne" : 48.83920,
 'Angerville' : 48.31178,
 'Bréval' : 48.94363,
 'Gravigny Balizy' : 48.68533,
 'Massy TGV' : 48.72739,
 'Chilly-Mazarin' : 48.70067,
 'Petit Vaux' : 48.67651
}

# Hand-entered longitudes for the same stations.
# NOTE(review): 'Guillerval' and 'Bréval' differ noticeably from their neighbours in this
# table — worth double-checking against a map.
missing_long = {
 'Guillerval' : 2.6057,
 'Gazeran' : 1.77177,
 'Longjumeau' : 2.29415,
 'Monnerville' : 2.03201,
 "Paris Bercy Bourgogne - Pays d'Auvergne" : 2.38294,
 'Angerville' : 2.00346,
 'Bréval' : 1.15180,
 'Gravigny Balizy' : 2.31742,
 'Massy TGV' : 2.26338,
 'Chilly-Mazarin' : 2.30818,
 'Petit Vaux' : 2.33272
}

# For stations listed in the dictionaries the manual value replaces whatever the merge
# produced; every other row keeps its merged coordinate.
trains2['Latitude'] = trains2.apply(lambda row: missing_lat[row['Nom de la gare']] if row['Nom de la gare'] in missing_lat.keys()
                                    else row['Latitude'], axis = 1)
trains2['Longitude'] = trains2.apply(lambda row: missing_long[row['Nom de la gare']] if row['Nom de la gare'] in missing_long.keys()
                                     else row['Longitude'], axis = 1)
trains2 = trains2.drop("res_com", axis = 1)
# Align SNCF column names with the RATP-derived table (Traffic_<year>, Postcode, Station_name).
trains2 = trains2.rename(columns = {"Nom de la gare" : "Station_name", "Code postal" : "Postcode",
                                   "Total Voyageurs 2022" : "Traffic_2022", "Total Voyageurs 2021" : "Traffic_2021",
                                   "Total Voyageurs 2020" : "Traffic_2020", "Total Voyageurs 2019" : "Traffic_2019",
                                   "Total Voyageurs 2018" : "Traffic_2018", "Total Voyageurs 2017" : "Traffic_2017",
                                   "Total Voyageurs 2016" : "Traffic_2016", "Total Voyageurs 2015" : "Traffic_2015"
                                  })
# Every SNCF row is labelled "RER" at this stage.
trains2['Network'] = "RER"


# Look up the city name from the postcode, then append the SNCF rows to the RATP table.
trains2 = pd.merge(left = trains2, right = cp[['City', 'Postcode']], on = "Postcode", how = "left")
df = pd.concat([df, trains2])



# Single, normalised station name (lower-case, accents stripped): taken from 'Station_name'
# when present, otherwise from 'Station'.
df['Station_name'] = df.apply(lambda row: unidecode(str(row['Station']).lower()) if pd.isna(row['Station_name']) else unidecode(str(row['Station_name']).lower()), axis = 1)
df = df.drop(['Station', 'nom_long'], axis = 1)

# Keep and order the columns of interest.
df = df[['Station_name', 'Network', 'Longitude', 'Latitude', 'City', 'Arr',
       'Lines', 'Traffic_2013', 'Traffic_2014', 'Traffic_2015', 'Traffic_2016',
       'Traffic_2017', 'Traffic_2018', 'Traffic_2019', 'Traffic_2020',
       'Traffic_2021', 'Postcode',
       'Traffic_2022', 'dept']]

df = df.drop_duplicates()

# Fresh 0..n-1 labels: every hard-coded row label used from here on refers to this numbering.
df = df.reset_index(drop = True)

# Giving a distinct name to train stations that share a name with a station from another dataset.
# NOTE(review): rows are addressed by hard-coded label; any change in the inputs that shifts
# row order silently renames the wrong stations.
df.loc[438, 'Station_name'] = 'porte de clichy rer'
df.loc[446, 'Station_name'] = 'saint-ouen rer'
df.loc[578, 'Station_name'] = 'malesherbes rer'
df.loc[594, 'Station_name'] = 'pont cardinet rer'
df.loc[603, 'Station_name'] = 'saint-fargeau rer'
df.loc[717, 'Station_name'] = 'invalides rer'
df.loc[197, 'Station_name'] = 'la defense rer'
df.loc[494, 'Station_name'] = 'la defense rer sncf'
df.loc[63, 'Station_name'] = 'val de fontenay rer'
df.loc[679, 'Station_name'] = 'val de fontenay rer sncf'

# Merging train stations coming from RATP and SNCF datasets

# Descriptive label -> (row label that is kept, row label whose traffic is added to it and
# which is then dropped). The labels themselves are only documentation; only the tuples are used.
# The same kept row may appear several times (61 and 157 each absorb two rows).
stations_to_merge = {
    "Nanterre Université" : (361, 586),
    "La défense" : (197, 494),
    "Paris Gare de Lyon": (265, 521),
    "Val de Fontenay" : (63, 679),
    "Massy Palaiseau" : (157, 652),
    "Massy Verrieres" : (121, 653),
    "St Michel Notre Dame" : (65, 668),
    "Gare du Nord" : (61, 432),
    "Gare du Nord (Magenta)" : (61, 575),
    "Paris Saint Lazare": (592, 490),
    "Marne la Vallée" : (41, 417),
    "Massy Palaiseau / TGV" : (157, 580)
}

def merge_stations(df, id_a, id_b, years = range(2015, 2023)):
    """Fold the yearly traffic of row ``id_b`` into row ``id_a`` and drop row ``id_b``.

    For every year, a missing (NaN) traffic value on either row counts as 0, so the
    merged total is never NaN. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one ``Traffic_<year>`` column per year in ``years``.
    id_a : index label
        Row that is kept and receives the summed traffic.
    id_b : index label
        Row whose traffic is added to ``id_a``; removed from ``df`` afterwards.
    years : iterable of int, optional
        Years to merge. Defaults to 2015 through 2022.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If either label or one of the ``Traffic_<year>`` columns is absent.
    """
    for year in years:
        column = f"Traffic_{year}"
        kept = df.loc[id_a, column]
        absorbed = df.loc[id_b, column]
        # `value == np.nan` is always False, so missing values must be detected with isna.
        kept = 0 if pd.isna(kept) else kept
        absorbed = 0 if pd.isna(absorbed) else absorbed
        df.loc[id_a, column] = kept + absorbed
    df.drop(id_b, inplace = True)
    
# Apply every (kept, absorbed) pair; the dictionary key is a label only and is not used.
for station, loc in stations_to_merge.items():
    merge_stations(df, loc[0], loc[1])
    
df['City'] = df.apply(lambda row: unidecode(str(row['City'])).lower(), axis = 1)         # Harmonising city names (a missing city becomes the string 'nan')

df = df.drop(["Traffic_2013", "Traffic_2014"], axis = 1)     # Dropping older data (SNCF data starts in 2015)

# Features creation : mean traffic years [2015 - 2019] and SD, % of lost ridership in 2020 and 2021 compared to pre-covid, increase from 2020 to 2021
# - np.mean / np.std on a plain list return NaN as soon as one of the five years is missing.
# - np.std is called with its default settings (population SD).
# - The three ratios are relative changes: (new - reference) / reference.

df["Traffic_pre_cov"] = df.apply(lambda row: np.mean([row['Traffic_2015'], row['Traffic_2016'], row['Traffic_2017'], row['Traffic_2018'], row['Traffic_2019']]), axis = 1)
df["Traffic_pre_cov_sd"] = df.apply(lambda row: np.std([row['Traffic_2015'], row['Traffic_2016'], row['Traffic_2017'], row['Traffic_2018'], row['Traffic_2019']]), axis = 1)
df["covid_drop_2020"] = df.apply(lambda row: (row["Traffic_2020"] - row["Traffic_pre_cov"])/row["Traffic_pre_cov"], axis = 1) 
df["covid_drop_2021"] = df.apply(lambda row: (row["Traffic_2021"] - row["Traffic_pre_cov"])/row["Traffic_pre_cov"], axis = 1)
df["Traffic_2020_2021"] = df.apply(lambda row: (row["Traffic_2021"] - row["Traffic_2020"])/row["Traffic_2020"], axis = 1)


# Selecting train stations using their index
# Short code -> row label of the station in `df`. Only the values are used below.
# NOTE(review): the codes look like station abbreviations — their meaning is not shown here.
gares_dict = {
    "Z" : 592,
    "MM" : 433,
    "N" : 61,
    "E" : 520,
    "B" : 519,
    "L" : 265,
    "A" : 591,
    "MASSU" : 157,
    "FRMLV" : 41,
    "FRCDG" : 371   
}

# Rows whose label is listed above are re-labelled "Train station"; all others keep their network.
df['Network'] = df.apply(lambda row: "Train station" if row.name in list(gares_dict.values()) else row['Network'], axis = 1)
# Preserve the current row label in a column so it survives the re-indexing done during ranking.
df['Origin_id'] = df.apply(lambda row: row.name, axis = 1)

# Ranking stations according to ridership, separately within each network.
def _rank_by_yearly_traffic(stations, label, years = range(2015, 2023)):
    """Return a copy of ``stations`` with one ``Rank_<label>_<year>`` column per year.

    For each year in turn the rows are sorted by ``Traffic_<year>`` (highest first, missing
    values last), re-indexed from 0, and the 1-based position is stored as that year's rank.
    The returned frame is therefore ordered by the traffic of the last year processed.
    The input frame is left untouched.
    """
    ranked = stations
    for year in years:
        ranked = ranked.sort_values(by = f'Traffic_{year}', ascending = False).reset_index(drop = True)
        ranked[f'Rank_{label}_{year}'] = ranked.index.to_numpy() + 1
    return ranked


# Display names for the train stations, keyed by the row label saved in 'Origin_id'.
gares_rename = {
    41: "marne la vallee - chessy",
    61: "paris gare du nord / magenta",
    157: "massy palaiseau / massy TGV",
    265: "paris gare de lyon",
    371: "aeroport charles de gaulle 2 TGV",
    433: "paris montparnasse",
    519: "paris bercy",
    520: "paris gare de l'est",
    592: "paris saint lazare / haussmann",
    591: "paris austerlitz"
}

# Explicit copy: a column is assigned right below, and writing to a filtered slice is
# ambiguous in pandas (SettingWithCopyWarning, hidden here by the global warnings filter).
gares = df[df['Network'] == 'Train station'].copy()
gares['Station_name'] = gares['Origin_id'].map(gares_rename)

gares = _rank_by_yearly_traffic(gares, 'train')
metro = _rank_by_yearly_traffic(df[df['Network'] == 'Métro'], 'metro')
rer = _rank_by_yearly_traffic(df[df['Network'] == 'RER'], 'rer')

# Rows whose network is none of the three values above are not carried over.
df = pd.concat([gares, metro, rer])
df.to_csv('even_flow_etl_database.csv')
display(HTML(f"""
<div class = 'all'>
    <h2></h2>
    <p>
        Ridership data was gathered from 2013 to 2021. During this time period:
    </p>
    <ul>
        <li>March 2013: Metro station Mairie de Montrouge (Line 4) opened</li>
        <li>2020: Covid-19 outbreak, with several lockdown periods in France</li>
        <li>2020-2021: Metro line 14 was extended, connecting to Porte de Clichy and Mairie de Saint-Ouen metro stations,
        as well as Pont Cardinet and Saint-Ouen. The latter two are not present in prior datasets as they were stations for
        Transilien line L and RER line C, not covered by these datasets.</li>
        <li>No station was closed during this time period.</li>
    </ul>
    <h2>Data Processing</h2>
    <ul>
        <li>In the 2013 dataset, Mairie de Montrouge station was labeled <code>"MAIRIE DE MONTROUGE**"</code>
        , probably because the station opened in March 2013. "**" was deleted to match the name of this station 
        in other datasets.</li>
        <li>Four empty columns were found in 2016 dataset, and were dropped.</li>
        <li>In 2020 and 2021 datasets, Saint-Ouen station was labeled <code>"CLICHY SAINT-OUEN"</code> as it was the expected
        commercial name, but was eventually named Saint-Ouen to match the existing station on RER C line. 
        Accordingly, station was renamed <code>"SAINT-OUEN"</code> in the present dataset.</li>
        <li>The arrondissement was missing for the station <code>"SAINT-MICHEL NOTRE-DAME"</code> and was manually added.</li>
    </ul>
    <h2>Feature Engineering</h2>
    <ul>
        <li>Features named <code>Correspondance_X</code> were dropped and replaced by a single feature <code>Lines</code>: 
        the number of lines connecting in the station.</li>
        <li>Creation of <code>Mean_trafic</code> and <code>SD_Trafic</code> to calculate the mean 
        and SD of traffic over the covered period.</li>
        <li>Creation of <code>Trafic_norm_X</code>, X being the related year. This feature is the yearly traffic
        divided by the number of lines in the station.</li>
        <li>Creation of <code>Station_type</code> that separated stations into groups, as detailed in the table below.</li>
        <li>Creation of <code>Rank_mean</code>, the rank based on <code>Mean_trafic</code>.</li>
        <li> Creation of <code>Latitude</code> and <code>Longitude</code> to store gps coordinates of all stations.
            <ul>
                <li>Metro station longitude and latitude were obtained from another dataset named <i>Lignes et stations de métro en France</i>
                    from the <a href = "https://www.data.gouv.fr/fr/datasets/lignes-et-stations-de-metro-en-france/" target = '_blank'>French government website</a>.</li>
                <li>RER station longitude and latitude were obtained from another dataset named <i>emplacement-des-gares-idf</i> from <a href = 'https://data.iledefrance-mobilites.fr/explore/dataset/emplacement-des-gares-idf/information/' target = '_blank'>Île de France Mobilités website</a>
                for lines administered by RATP and from a dataset named <i>frequentation-gares</i> from <a href = 'https://ressources.data.sncf.com/explore/dataset/frequentation-gares/information/?disjunctive.nom_gare&disjunctive.code_postal&sort=nom_gare' target = '_blank'>SNCF website</a>
                for lines administered by SNCF.</li>
            </ul>
        </li>
    </ul>
</div>
<br>
<div class = 'all' style = 'font-style:italic; font-size: 12px'>
    <p>
    <strong>Semantic note</strong> -  "RER" refers to any suburban train line:
    </p>
    <ul>
        <li>RER line A, B, C, D and E</li>            
        <li>Transilien line H, J, K, L, N, P, R, U</li>
    </ul>
    <p>
        Conversely, "train" refers to other national train lines, usually classified as "TGV" (high speed train), 
        "Intercité" (regular speed train) and "TER" (regional trains). Such trains connect to a very limited number
        of stations from this dataset, classified as "Train station".
    </p>
</div>
"""))

# Preview the first ten rows of the final table (train stations come first after the concat).
display(df.head(10))

Unnamed: 0,Station_name,Network,Longitude,Latitude,City,Arr,Lines,Traffic_2015,Traffic_2016,Traffic_2017,Traffic_2018,Traffic_2019,Traffic_2020,Traffic_2021,Postcode,Traffic_2022,dept,Traffic_pre_cov,Traffic_pre_cov_sd,covid_drop_2020,covid_drop_2021,Traffic_2020_2021,Origin_id,Rank_train_2015,Rank_train_2016,Rank_train_2017,Rank_train_2018,Rank_train_2019,Rank_train_2020,Rank_train_2021,Rank_train_2022,Rank_metro_2015,Rank_metro_2016,Rank_metro_2017,Rank_metro_2018,Rank_metro_2019,Rank_metro_2020,Rank_metro_2021,Rank_metro_2022,Rank_rer_2015,Rank_rer_2016,Rank_rer_2017,Rank_rer_2018,Rank_rer_2019,Rank_rer_2020,Rank_rer_2021,Rank_rer_2022
0,paris gare du nord / magenta,Train station,2.356329,48.880922,paris,10.0,1.0,332020946.0,335742120.0,340795086.0,331166163.0,335178997.0,130830320,186202010,,244744330.0,,334980662.4,3398147.0,-0.609439,-0.444141,0.423233,61,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,
1,paris saint lazare / haussmann,Train station,2.324738,48.876837,paris 08,,,153937114.0,156378179.0,158141436.0,155983269.0,155027706.0,124495879,121178259,75008.0,135527602.0,75.0,155893540.8,1405221.0,-0.201405,-0.222686,-0.026648,592,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,
2,paris gare de lyon,Train station,2.373916,48.843773,paris,12.0,1.0,148082749.0,148856941.0,153468536.0,149450759.0,150170752.0,77574420,111625467,,102024783.0,,150005947.4,1862462.0,-0.482858,-0.25586,0.438947,265,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,
3,paris montparnasse,Train station,2.320509,48.841187,paris 15,,,55167664.0,55162747.0,57752742.0,59174533.0,61374056.0,32447396,41039816,75015.0,56862435.0,75.0,57726348.4,2388415.0,-0.43791,-0.289063,0.264811,433,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,,,,,,,,,,,,,,,,
4,paris gare de l'est,Train station,2.359457,48.877376,paris 10,,,34919689.0,35820125.0,38455354.0,39301206.0,41240098.0,21580625,27758686,75010.0,36774394.0,75.0,37947294.4,2307596.0,-0.4313,-0.268494,0.286278,520,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,,,,,,,,,,,,,,,
5,paris austerlitz,Train station,2.365433,48.842528,paris 13,,,23300254.0,23073881.0,23212805.0,21821140.0,21682820.0,13203603,22281301,75013.0,19515861.0,75.0,22618180.0,712269.4,-0.416239,-0.014894,0.687517,591,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,,,,,,,,,,,,,,,,
6,aeroport charles de gaulle 2 TGV,Train station,2.570589,49.004687,roissy en france,,,12976556.0,13534634.0,14649835.0,14907773.0,15227840.0,5011999,6281244,95700.0,12949183.0,95.0,14259327.6,858099.9,-0.648511,-0.559499,0.253241,371,8.0,8.0,8.0,8.0,8.0,8.0,8.0,7.0,,,,,,,,,,,,,,,,
7,massy palaiseau / massy TGV,Train station,2.258323,48.725997,massy,,1.0,14798447.0,15528068.0,16787812.0,16404093.0,15835128.0,7170001,13284465,,6345326.0,,15870709.6,691952.5,-0.548224,-0.162957,0.852784,157,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,,,,,,,,,,,,,,,,
8,paris bercy,Train station,2.38294,48.8392,paris 12,,,3823448.0,3702383.0,4003278.0,3983108.0,4318464.0,2493266,3347427,75012.0,4764857.0,75.0,3966136.2,207715.2,-0.371361,-0.155998,0.342587,519,10.0,10.0,10.0,10.0,10.0,10.0,10.0,9.0,,,,,,,,,,,,,,,,
9,marne la vallee - chessy,Train station,2.782227,48.86995,montevrain,,1.0,10690051.0,10575800.0,11860737.0,12225619.0,11524013.0,4685119,5837863,,4641120.0,,11375244.0,646463.2,-0.58813,-0.486792,0.246044,41,9.0,9.0,9.0,9.0,9.0,9.0,9.0,10.0,,,,,,,,,,,,,,,,
