In [1]:
import pandas as pd
import numpy as np
from IPython.core.display import HTML
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

<h4>Stylesheet</h4>

In [2]:
# Notebook-wide stylesheet, injected once through a <style> tag.
# Fixes compared with the previous version:
#   * `.col_1 { width }` and `.col_group { font-size }` had unit-less values
#     (`50`, `16`); browsers drop such declarations, so `px` was added.
#   * `.table_1 td, .table_1, th` contained a stray comma that turned the last
#     part into a global `th` selector; it now targets `.table_1 th`.
css_style = """
h1 {
    color: black;
    font-family: 'Segoe UI', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif;
    font-size: 35px !important;
    padding-bottom: 10px;
    padding-top: 10px;
    border-bottom: 5px solid navy;
    border-top: 5px solid navy;
    font-variant: small-caps;
    text-align: center;
    margin-bottom: 25px;
}

h3 {
    color : dimgray;
}

.all {
/*This class is the default class for <div> so it does not interact with jupyter notebook structure */
text-align: justify;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin-right: 2px;
}

.titlediv {
    /*This class is the default class for <div> so it does not interact with jupyter notebook structure */
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    border: 5px solid navy;
    padding-top: 30px;
    padding-bottom: 30px;
    padding-left: 5px;
    padding-right: 5px;
    margin-bottom: 10px;
    }

.titlediv_2 {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    text-align: center !important;
    }

.titlep {
    font-size : 50px;
    text-align: center !important;
    font-variant: small-caps;
}

.titlep_2 {
    font-size : 30px;
    text-align: center !important;
    margin-top: 10px;
}

.title_sp {
    text-align: center !important;
    font-size: 20px !important;
    font-weight: bold;
}

.signature {
    margin-top: 60px;
    padding-top: 15px;
    border-top: 2px solid black;
    text-align: right !important;
    font-family:'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.obj {
    text-align: justify;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    margin-right: 100px;
    border: 2px solid lightcoral;
    padding: 15px;
}

p {
    text-align: justify !important;
}

.intro {
    font-style: italic;
}

.conclusion {
    border: 5px solid navy;
    margin-top: 50px;
    padding: 10px;
}

.conclusion h3 {
    color:black;
    font-variant: small-caps;
}

.small-caps {
    font-variant: small-caps;
}

table {
    margin-right: 10px !important;
    font-size: 14px !important;
    text-align: center !important;
}

.table_1 {
    border: 3px solid black !important;
    width: 100% !important;
}

.table_1 td, .table_1 th {
    border: 1px solid black;
}

.head_tr {
    background-color: dimgray !important;
    color: white;
    border: 3px solid black !important;
}

th, td {
    padding: 8px !important;
    text-align: center !important;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    border: 1.5px solid black;
}


.recap_table td, .recap_table th {
    height: 50px;
    width: 180px;
}

.recap_table th {
    font-size: 18px;
}

.col_1 {
    text-align: right !important;
    width: 50px !important;
}

.col_group {
    width: 200px;
    background-color: #d4c9df !important;
    text-align: center;
    height: 5px !important;
    font-size: 16px !important;
    font-weight: bold;
}

.columns {
    display: flex;
    flex-wrap: wrap;
}

.column {
    flex: 1;
    padding: 10px;
    text-align: justify;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    margin-right: 50px;
}
"""

# Last expression of the cell: its rich repr is what injects the stylesheet.
HTML(f'<style>{css_style}</style>')

In [3]:
# Introduction: where the data comes from and which yearly datasets are used.
# The markup is static, so a plain string is used instead of an f-string, the
# list is now properly closed with </ul>, and the bare "&" is escaped.
display(HTML("""
<div class = 'all'>
    <h1>Introduction</h1>
    <h2>Origin of Data</h2>
    <p>
        Data on metro and RER A &amp; B were fetched from <a href = 'https://data.ratp.fr/explore/?sort=modified' target = '_blank'>RATP</a>'s website.
    </p>
    <h3>Included Datasets</h3>
    <ul>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2021/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2021</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2020/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2020</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2019/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2019</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2018/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2018</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2017/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2017</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2016/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2016</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2015/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2015</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre-2014/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre-2014</a></li>
        <li><a href = 'https://data.ratp.fr/explore/dataset/trafic-annuel-entrant-par-station-du-reseau-ferre/information/' target = '_blank'>trafic-annuel-entrant-par-station-du-reseau-ferre</a></li>
    </ul>
</div>
"""))

In [6]:
# ---------------------------------------------------------------------------
# Load, clean, merge and enrich the yearly RATP ridership datasets (2013-2021).
# Result: `df`, one row per (Station, Réseau, Ville) with per-year rank/traffic
# columns plus the engineered features described in the notes cell.
# ---------------------------------------------------------------------------

CORRESPONDANCE_COLS = [f'Correspondance_{k}' for k in range(1, 6)]
MERGE_KEYS = ['Station', 'Réseau', 'Ville']


def _load_ratp_csv(suffix=''):
    """Read one yearly ';'-separated RATP file; `suffix` is '' or '-<year>'."""
    return pd.read_csv(
        f'original_datasets/trafic-annuel-entrant-par-station-du-reseau-ferre{suffix}.csv',
        sep=';',
    )


def _highest_filled_correspondance(frame):
    """Return, per row, the number of the highest non-null Correspondance_k (0 if none)."""
    filled = frame[CORRESPONDANCE_COLS].notna().to_numpy()
    # True * k keeps k for filled slots and 0 otherwise; the row max is the
    # highest-numbered filled slot, exactly like the former if/else cascade.
    return (filled * np.arange(1, 6, dtype='int64')).max(axis=1)


def _station_type(n_lines):
    """Map a line count to 'minor' (1), 'connexion' (2), 'hub' (>2) or None (0 / NaN)."""
    if n_lines == 1:
        return "minor"
    if n_lines == 2:
        return "connexion"
    if n_lines > 2:
        return "hub"
    return None


# --- Load: the 2013 file is the only one without a year suffix --------------
ratp_2013 = _load_ratp_csv()
ratp_2014 = _load_ratp_csv('-2014')
ratp_2015 = _load_ratp_csv('-2015')
ratp_2016 = _load_ratp_csv('-2016')
ratp_2017 = _load_ratp_csv('-2017')
ratp_2018 = _load_ratp_csv('-2018')
ratp_2019 = _load_ratp_csv('-2019')
ratp_2020 = _load_ratp_csv('-2020')
ratp_2021 = _load_ratp_csv('-2021')

# --- Manual fixes (see the "Data Processing" notes cell) --------------------
# NOTE(review): these patches address rows by position in the raw CSV files;
# they silently hit the wrong station if a file is re-exported in another order.
ratp_2013.loc[308, 'Station'] = "MAIRIE DE MONTROUGE"   # strip the trailing "**"
ratp_2016 = ratp_2016.drop(['Column 12', 'Column 13', 'Column 14', 'Column 15'], axis = 1)  # empty columns
ratp_2020.loc[326, 'Station'] = 'SAINT-OUEN'            # was "CLICHY SAINT-OUEN"
ratp_2021.loc[93, 'Station'] = 'SAINT-OUEN'             # was "CLICHY SAINT-OUEN"

ratp_13_21 = [
    ratp_2013,
    ratp_2014,
    ratp_2015,
    ratp_2016,
    ratp_2017,
    ratp_2018,
    ratp_2019,
    ratp_2020,
    ratp_2021,
]

years = range(2013, 2022)

# --- Per-year preparation ---------------------------------------------------
# Suffix the year-specific columns, derive the line count, then drop the raw
# Correspondance_* columns. `rename` returns a copy, so ratp_20xx stay untouched.
for i, year in enumerate(years):
    yearly = ratp_13_21[i].rename(columns={
        'Rang': f'Rang_{year}',
        'Trafic': f'Trafic_{year}',
        'Arrondissement pour Paris': 'Arr',
    })
    yearly[f'Lignes_{year}'] = _highest_filled_correspondance(yearly)
    ratp_13_21[i] = yearly.drop(columns=CORRESPONDANCE_COLS)

# --- Merge all years --------------------------------------------------------
# Every yearly frame carries an 'Arr' column. Merging them as-is creates
# 'Arr_x'/'Arr_y' repeatedly, which pandas >= 2.0 rejects with a MergeError
# (older versions only warned, and warnings are silenced in this notebook).
# The previous code ended up keeping only the 2021 'Arr'; we get the same
# result by dropping 'Arr' from every frame except the last one before merging.
df = ratp_13_21[0].drop(columns='Arr')
for yearly in ratp_13_21[1:-1]:
    df = pd.merge(df, yearly.drop(columns='Arr'), how = 'outer', on = MERGE_KEYS)
df = pd.merge(df, ratp_13_21[-1], how = 'outer', on = MERGE_KEYS)

# Arrondissement manually filled in for one station (per the notes cell:
# SAINT-MICHEL NOTRE-DAME).
# NOTE(review): positional patch; the row order of an outer merge is not
# guaranteed to be stable across pandas versions - confirm row 65 is still
# the intended station.
df.loc[65, 'Arr'] = 5

# --- Traffic normalised by the number of lines, per year --------------------
for year in years:
    df[f'Trafic_{year}_norm'] = (df[f'Trafic_{year}'] / df[f'Lignes_{year}']).astype('float')

# Only the most recent line count is kept (renamed to "Lines" below).
df = df.drop(columns=[f'Lignes_{year}' for year in years[:-1]])

# --- French -> English column names -----------------------------------------
col_trad = {f'Rang_{year}': f'Rank_{year}' for year in years}
col_trad.update({
    "Réseau" : "Network",
    "Ville" : "City",
    "Lignes_2021" : "Lines",
})
df = df.rename(columns = col_trad)

# --- Mean / SD of raw traffic over the period -------------------------------
# np.mean / np.std propagate NaN (a station missing in any year gets NaN) and
# np.std is the population SD (ddof=0), as in the former row-wise version.
trafic_values = df[[f'Trafic_{year}' for year in years]].to_numpy(dtype='float')
df['Mean_trafic'] = np.mean(trafic_values, axis=1)
df['SD_trafic'] = np.std(trafic_values, axis=1)

# --- Station category from the 2021 line count ------------------------------
df['Station_type'] = df['Lines'].map(_station_type)

In [5]:
# Narrative notes: covered period, manual data fixes and engineered features.
# Static markup (no placeholders), so a plain string replaces the f-string.
# The empty <h2> now has a title, and the feature names quoted in the text
# match the columns actually created ("Lines", "Mean_trafic", "SD_trafic").
display(HTML("""
<div class = 'all'>
    <h2>Covered Period</h2>
    <p>
        Ridership data was gathered from 2013 to 2021. During this time period:
    </p>
    <ul>
        <li>March 2013: Metro station Mairie de Montrouge (Line 4) opened</li>
        <li>2020: Covid-19 outbreak, with several lockdown periods in France</li>
        <li>2020-2021: Metro line 14 was extended, connecting to Porte de Clichy and Mairie de Saint-Ouen metro stations,
        as well as Pont Cardinet and Saint-Ouen. The latter two are not present in prior datasets as they were stations for
        Transilien line L and RER line C, not covered by these datasets.</li>
        <li>No station was closed during this time period.</li>
    </ul>
    <h2>Data Processing</h2>
    <ul>
        <li>In the 2013 dataset, MAIRIE DE MONTROUGE station was labeled "MAIRIE DE MONTROUGE**", probably because the station
        opened in March 2013. "**" was deleted to match the name of this station in other datasets.</li>
        <li>Four empty columns were found in the 2016 dataset, and were dropped.</li>
        <li>In the 2020 and 2021 datasets, "SAINT-OUEN" station was labeled "CLICHY SAINT-OUEN" as it was the expected
        commercial name, but was eventually named "SAINT-OUEN" to match the existing station on RER C line.</li>
        <li>The arrondissement was missing for the station "SAINT-MICHEL NOTRE-DAME" and was manually added.</li>
    </ul>
    <h2>Feature Engineering</h2>
    <ul>
        <li>Features named "Correspondance_X" were dropped and replaced by a single feature "Lines": the number
        of lines connecting at the station</li>
        <li>Creation of "Mean_trafic" and "SD_trafic" to calculate the mean and SD of traffic over the covered period</li>
        <li>Creation of "Station_type" that separates stations into 3 groups: "hub" if more than 2 lines connect, "connexion"
        if 2 lines connect or "minor" if no connection is available at this station</li>
    </ul>
</div>
"""))