In [6]:
import os
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import lib_data_load as ld
import lib_data_ref as dr

%matplotlib inline

# Folder holding the data.gouv.fr extracts, relative to the notebook.
# Built with os.path.join so the separator matches the running OS; the empty last
# component keeps the trailing separator the original string had
# (on Windows this evaluates to exactly ".\\data\\data_gouv_fr\\").
dir_data = os.path.join(".", "data", "data_gouv_fr", "")

# Range of years handed to the loaders, and the single year analysed below.
START_YEAR = 2020
END_YEAR = 2021
STUDY_YEAR = 2020

# Each loader's result is indexed by year further down (e.g. dic_usagers[2020]);
# presumably a dict of DataFrames keyed by year — confirm in lib_data_load.

# Road users
dic_usagers = ld.load_usagers(folder_path=dir_data, start_year=START_YEAR, end_year=END_YEAR)

# Registered vehicles
dic_vehic_immat = ld.load_vehic_immat(folder_path=dir_data, start_year=START_YEAR, end_year=END_YEAR)

# Vehicles
dic_vehic = ld.load_vehicules(folder_path=dir_data, start_year=START_YEAR, end_year=END_YEAR)

# Work on a single year for the rest of the notebook
df_usagers = dic_usagers[STUDY_YEAR]
df_vehic_immat = dic_vehic_immat[STUDY_YEAR]
df_vehic = dic_vehic[STUDY_YEAR]

In [17]:
# List the column names of both vehicle tables, one Index repr per line
print(df_vehic_immat.columns, df_vehic.columns, sep="\n")

Index(['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe',
       'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp',
       'etatp'],
      dtype='object')
Index(['Num_Acc', 'id_vehicule', 'num_veh', 'senc', 'catv', 'obs', 'obsm',
       'choc', 'manv', 'motor', 'occutc'],
      dtype='object')


In [16]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() appended a stray "None" after each report.
df_vehic_immat.info()
df_vehic.info()

# Primary-key check: flag the rows whose Num_Acc already appeared on an earlier row
# and print how many there are (0 would mean Num_Acc alone identifies a row).
is_repeated_acc = df_vehic_immat.Num_Acc.duplicated()
print(is_repeated_acc.sum())

# Only the last expression of a cell is rendered automatically, so the preview of
# the repeated rows has to be displayed explicitly (display is provided by IPython).
display(df_vehic_immat[is_repeated_acc].head())

# All rows recorded for one sample accident
df_vehic_immat[df_vehic_immat.Num_Acc == 202000000001]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105295 entries, 0 to 105294
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Num_Acc      105295 non-null  int64 
 1   id_vehicule  105295 non-null  object
 2   num_veh      105295 non-null  object
 3   place        105295 non-null  int64 
 4   catu         105295 non-null  int64 
 5   grav         105295 non-null  int64 
 6   sexe         105295 non-null  int64 
 7   an_nais      105295 non-null  int64 
 8   trajet       105295 non-null  int64 
 9   secu1        105295 non-null  int64 
 10  secu2        105295 non-null  int64 
 11  secu3        105295 non-null  int64 
 12  locp         105295 non-null  int64 
 13  actp         105295 non-null  object
 14  etatp        105295 non-null  int64 
dtypes: int64(12), object(3)
memory usage: 12.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81066 entries, 0 to 81065
Data columns (total 11 columns):
 #   Colu

Unnamed: 0,Num_Acc,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202000000001,154 742 274,B01,1,1,1,1,1983,5,1,0,-1,-1,-1,-1
1,202000000001,154 742 275,A01,1,1,3,1,1982,5,2,6,-1,-1,-1,-1


## Merge data et suppression des colonnes inutiles

In [None]:
# Left-join the accident characteristics onto the users table via Num_Acc, then
# discard the columns that are not used afterwards (identifier, address and
# lat/long columns).
# NOTE(review): df_caract is not created by any cell visible above — confirm it is
# loaded before this cell runs on a fresh kernel.
df = (
    pd.merge(df_usagers, df_caract, how='left', on='Num_Acc')
    .drop(columns=['id_vehicule', 'num_veh', 'adr', 'lat', 'long'])
)

## Encodages

In [None]:
# Four age classes based on the quartiles
# (bounds presumably read off df['age'].describe() — confirm if the data changes)
def get_cl_age(age):
    """Return the age-class label for ``age``.

    Classes are '0-25' (age <= 25), '26-37' (25 < age <= 37),
    '38-53' (37 < age <= 53) and '>53' (age > 53).
    Returns None when no comparison holds (e.g. NaN).
    """
    bounded_classes = ((25, '0-25'), (37, '26-37'), (53, '38-53'))
    for upper_bound, label in bounded_classes:
        # Bounds are scanned in ascending order, so the first match is the class
        if age <= upper_bound:
            return label
    return '>53' if age > 53 else None

# Reference year from which the birth year is subtracted to obtain an age.
# NOTE(review): the accident year is available in the 'an' column — confirm that a
# fixed 2022 reference (rather than the age at the time of the accident) is intended.
AGE_REFERENCE_YEAR = 2022


def label_series(frame, code_col):
    """Return frame[code_col] with each code replaced by dr.get_labels(code_col, code)."""
    return frame[code_col].apply(lambda code: dr.get_labels(code_col, code))


# Columns are added in the same order as before so the resulting layout is unchanged.
df['luminosité'] = label_series(df, 'lum')
df['gravité'] = label_series(df, 'grav')

# The age is only an intermediate for the age class, so it is kept as a local
# Series instead of being added to df and dropped again.
age = AGE_REFERENCE_YEAR - df['an_nais']
df["classe d'age"] = age.apply(get_cl_age)

df["catégorie d'utilisateur"] = label_series(df, 'catu')
df["sexe_lbl"] = label_series(df, 'sexe')

# Assemble the date column-wise (pd.to_datetime expects year/month/day column
# names); .dt.date keeps plain datetime.date objects, as the row-wise version produced.
date_parts = df[['an', 'mois', 'jour']].rename(
    columns={'an': 'year', 'mois': 'month', 'jour': 'day'}
)
df['date'] = pd.to_datetime(date_parts).dt.date

# Remaining coded columns all follow the "<code>_lbl" naming
for code_col in ('agg', 'int', 'atm', 'col'):
    df[f'{code_col}_lbl'] = label_series(df, code_col)

# The birth year is no longer needed once the age class exists.
# Note: this makes the cell non re-runnable without re-running the merge cell first.
df = df.drop(columns=['an_nais'])

# Severity labels in the order used for the crosstab columns
# (presumably the labels dr.get_labels('grav', ...) returns — verify against lib_data_ref)
columns_titles = ['Tué', 'Blessé hospitalisé', 'Blessé léger', 'Indemne']

## Relation Gravité - Luminosité

In [None]:
# Share of each severity level within every lighting condition
# (normalize='index' makes each row sum to 1), columns put in the order of columns_titles
df_cross = (
    pd.crosstab(df['luminosité'], df['gravité'], normalize='index')
    .reindex(columns=columns_titles)
)
print(df_cross)

# Stacked bar chart: one bar per lighting condition, split by severity
sns.set_theme(style="ticks")
f, ax = plt.subplots(figsize=(7, 5))
chart = df_cross.plot(
    ax=ax,
    kind="bar",
    stacked=True,
    rot=0,
    title='Distribution de la gravité en fonction de la luminosité',
)
# Tilt the category labels so they do not overlap
chart.set_xticklabels(chart.get_xticklabels(), rotation=80)
sns.move_legend(ax, "upper right")
