In [None]:
import pandas as pd
import numpy as np
from my_libs import lib_tools as pt # Project Tools

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from datetime import date, datetime

# --- Study configuration -----------------------------------------------------
start_year  = 2005     # first year of the study period (inclusive)
end_year    = 2021     # last year of the study period (inclusive)
chk         = True     # display the log of the checks
sampled     = True     # work on a sampled version of df
random_seed = 42       # seed of NumPy's global random generator

# Seed NumPy's global generator: pandas `DataFrame.sample()` falls back on it
# when no `random_state` is given, so the random samples drawn by the cells
# below are identical on every "Restart Kernel -> Run All".
# NOTE(review): whether the sampling done inside get_work_df honours this seed
# depends on its implementation in lib_tools -- confirm.
np.random.seed(random_seed)

# Load the working frame and the per-table dictionaries returned by lib_tools.
df, dic_usagers, dic_caract, dic_lieux, dic_vehic = pt.get_work_df(start_year, end_year, sampled, chk)

In [None]:
# The load statistics are shown under the same flag as the check log.
display_stat_data_load = chk

# Hand the four dictionaries and the study period to the lib_tools helper
# (presumably prints per-table load statistics -- see lib_tools to confirm).
if display_stat_data_load:
    pt.display_stats_data_load(dic_usagers, dic_caract, dic_lieux, dic_vehic, start_year, end_year)

### Suppression des colonnes jugées non pertinentes

In [None]:
# Columns carrying geographic information that is too fine-grained
cols_too_fine = ['com', 'adr', 'lat', 'long', 'pr', 'pr1']

# Columns describing facts that are not known in practice before the accident
cols_post_accident = ['obs', 'obsm', 'choc', 'manv', 'col']

cols_rmv = cols_too_fine + cols_post_accident

# Drop only the columns that are actually present. Passing the whole list to
# drop() raises KeyError as soon as one of its columns is missing from df,
# which makes the cell fail when only part of the list exists (or on re-run
# after a partial drop).
df = df.drop(columns=[col for col in cols_rmv if col in df.columns])

### Représentation graphique des *Null* - Suppression de variables avec plus de 8 % de Null

In [None]:
def _plot_null_heatmap(frame, axis, title, n_rows=10000):
    """Draw the Null-presence heatmap of a random sample of `frame` on `axis`.

    The sample is sorted chronologically (an, mois, jour) so that the vertical
    axis of the heatmap follows time.
    """
    # Never ask for more rows than available: DataFrame.sample() raises
    # ValueError otherwise, which happens with small sampled datasets.
    sample = frame.sample(min(n_rows, len(frame)))
    sample = sample.sort_values(by=['an', 'mois', 'jour'], ascending=True)
    axis.set_title(title)
    sns.heatmap(sample.isna(), cbar=False, ax=axis)


fig, ax = plt.subplots(nrows=2, ncols=1)
fig.set_size_inches(16, 16)

# Top panel: raw data, before any column is removed.
_plot_null_heatmap(df, ax[0], 'Représentation graphique de la présence de valeurs Null dans les données brutes')

# Remove the columns holding too many Null values (threshold 0.08 passed to
# the lib_tools helper).
df = pt.rmv_col_too_much_null(df, 0.08, chk)

# Bottom panel: same view once those columns are gone.
_plot_null_heatmap(df, ax[1], 'Représentation graphique de la présence de valeurs Null après suppression des colonnes avec au moins 8% de valeurs Null');

### Transformation des *Null* restants en -1 (non renseigné) pour les colonnes qui l'autorisent

In [None]:
# Per the section heading: the remaining Null values become -1 ("not
# specified") for the columns that allow it. The actual rule lives in
# lib_tools and is not visible from this notebook.
df = pt.clean_categ_not_specified(df)

### Nettoyage de l'année de naissance

In [None]:
# Rows whose birth year is missing: overall count, then breakdown by accident
# year. print() is required here: a bare expression that is not the last
# statement of a cell is evaluated and silently discarded.
missing_birth = df['an_nais'].isna()
print(f"Nombre de lignes sans année de naissance : {missing_birth.sum()}")
print(df.loc[missing_birth, 'an'].value_counts())

# Share of missing birth years for every year of the study period.
years = list(range(start_year, end_year + 1))
rate_nan = []
for year in years:
    in_year = df['an'] == year
    n_year = in_year.sum()
    # A year without any row gets NaN (no bar) instead of a ZeroDivisionError.
    rate_nan.append((missing_birth & in_year).sum() / n_year if n_year else np.nan)

plt.title(f"Pourcentage de Null dans la colonne 'Année de naissance' de {start_year} à {end_year}")
ax = sns.barplot(x=years, y=rate_nan)
plt.xticks(rotation=60);

In [None]:
# Drop every row that still contains at least one Null and report the loss.
print("Suppression des lignes avec Null : ")
nb_bef = df.shape[0]
print(f"Nombre de lignes avant : {nb_bef}")

df = df.dropna(axis=0, how='any')
nb_aft = df.shape[0]
print(f"Nombre de lignes après : {nb_aft}")

# The loss rate is relative to the number of rows BEFORE the drop (dividing by
# the remaining rows overstates it and fails when nothing is left).
loss_rate = (nb_bef - nb_aft) / nb_bef * 100 if nb_bef else 0.0
print(f"Taux de perte : {loss_rate:.2f} %")

### Création de *age* et *age_cls*

In [None]:
# Build 'age' first, then 'age_cls' (per the section heading); each lib_tools
# step receives the frame returned by the previous one.
for age_step in (pt.create_col_age, pt.create_col_age_cls):
    df = age_step(df)

### Encodages : *dep*,  *nbv*, *catv*, *senc*

In [None]:
# Encoding steps applied in order; each entry is (lib_tools function, extra
# positional arguments passed after the frame).
encoding_steps = [
    (pt.clean_col_dep, (True,)),
    (pt.clean_nbv, ()),
    (pt.clean_catv, ()),
    (pt.clean_senc, ()),
]
for encode, extra_args in encoding_steps:
    df = encode(df, *extra_args)

### Affichage des modalités des variables catégorielles restantes

In [None]:
# All columns are scanned; restricting to df.select_dtypes('Int64').columns
# was considered as an alternative.
var_categ = df.columns

# Columns skipped in the listing below.
col_excluded = ['Num_Acc', 'num_veh', 'age', 'hrmn', 'an', 'jour', 'an_nais', 'datetime', 'dep']

# Print, for every remaining column, the distinct values it takes.
cols_to_list = [name for name in var_categ if name not in col_excluded]
for col in cols_to_list:
    modes = list(df[col].unique())
    print(f"{col}\t: {modes}")

### Nettoyage de *hrmn* et création de *datetime* - *joursem* - *grav_lbl*

In [None]:
# Clean 'hrmn', then derive 'datetime', 'joursem' and 'grav_lbl' (per the
# section heading). Order is kept: each step works on the previous result.
time_and_label_steps = (
    pt.clean_hrmn,
    pt.create_col_datetime,
    pt.create_col_joursem,
    pt.create_col_grav_lbl,
)
for build_step in time_and_label_steps:
    df = build_step(df)

### Suppression des colonnes inutiles : *an_nais*

In [None]:
# 'an_nais' is no longer needed once the age columns exist.
df = df.drop(columns='an_nais')

### Analyses croisées

In [None]:
# Put 'grav' (the target) in last position so that it ends up on the last
# row / column of the heatmap.
target = df['grav']
data = df.drop(columns=['grav'])
data['grav'] = target

# Work on a random sample, capped at the number of available rows
# (DataFrame.sample() raises ValueError when asked for more).
ds = data.sample(min(10000, len(data)))

# Correlate numeric / boolean columns only: the frame also holds text columns
# ('joursem', 'grav_lbl') and a datetime column, on which pandas >= 2.0 raises
# instead of silently ignoring them.
corr = ds.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(10, 10))
ax = sns.heatmap(corr, annot=False, cmap='coolwarm', cbar=False)
ax.set_title("Matrice de corrélation du jeu de données");

In [None]:
# Disabled cell (kept commented out): would draw one correlation heatmap per
# value of 'grav' (1 to 4) on a 2x2 grid, each one computed from a fresh
# 10000-row sample filtered on that value.
# target = df['grav']
# data = df.drop(columns=['grav'], axis=1)
# # set 'grav' as last column of data
# data['grav'] = target

# ds = data.sample(10000)
# corr1 = ds[ds.grav == 1].corr()

# ds = data.sample(10000)
# corr2 = ds[ds.grav == 2].corr()

# ds = data.sample(10000)
# corr3 = ds[ds.grav == 3].corr()

# ds = data.sample(10000)
# corr4 = ds[ds.grav == 4].corr()

# fig, axs = plt.subplots(nrows=2, ncols=2)
# fig.set_size_inches(15, 15)

# axs[0,0].set_title('grav = Indemne')
# sns.heatmap(corr1, annot=False, cmap='coolwarm', ax=axs[0,0], cbar=False)

# axs[0,1].set_title('grav = Tué')
# sns.heatmap(corr2, annot=False, cmap='coolwarm', ax=axs[0,1], cbar=False)

# axs[1,0].set_title('grav = Blessé hospitalisé')
# sns.heatmap(corr3, annot=False, cmap='coolwarm', ax=axs[1,0], cbar=False)

# axs[1,1].set_title('grav = Blessé léger')
# sns.heatmap(corr4, annot=False, cmap='coolwarm', ax=axs[1,1], cbar=False);

In [None]:
colormap = 'tab20c'
week_order = ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche']

# Row-normalised contingency table: share of each severity label per weekday,
# with the rows put back in calendar order.
df_cross = pd.crosstab(df['joursem'], df['grav_lbl'], normalize='index').reindex(week_order)
print(df_cross)

sns.set_theme(style="ticks")
f, ax = plt.subplots(figsize=(7, 5))
chart_title = f"Distribution de la gravité en fonction du jour de la semaine (données {start_year}-{end_year})"
chart = df_cross.plot(kind="bar", stacked=True, rot=0, ax=ax, title=chart_title, colormap=colormap)
chart.set_xticklabels(chart.get_xticklabels(), rotation=80)
sns.move_legend(ax, "upper right")

In [None]:
colormap = 'tab20c'

# Work on an independent copy: assigning into a column slice of df triggers
# SettingWithCopyWarning and may not behave as intended.
df_tmp = df[['sexe', 'grav_lbl']].copy()
df_tmp['sexe'] = df_tmp['sexe'].astype('str').replace(['1', '2'], ['masculin', 'féminin'])

# Row-normalised tables in both directions: severity shares per sex, and sex
# shares per severity.
df_cross_1 = pd.crosstab(df_tmp['sexe'], df_tmp['grav_lbl'], normalize='index')
df_cross_2 = pd.crosstab(df_tmp['grav_lbl'], df_tmp['sexe'], normalize='index')

print(df_cross_1)
print(df_cross_2)

sns.set_theme(style="ticks")

fig, axs = plt.subplots(2, 1)
panels = [
    (df_cross_1, axs[0], f"Distribution de la gravité en fonction du sexe (données {start_year}-{end_year})"),
    (df_cross_2, axs[1], f"Distribution du sexe de l'usager en fonction de la gravité (données {start_year}-{end_year})"),
]
for table, panel_ax, panel_title in panels:
    chart = table.plot(kind="bar", stacked=True, rot=0, ax=panel_ax, title=panel_title, colormap=colormap)
    chart.set_xticklabels(chart.get_xticklabels(), rotation=80)
    # Move the legend of THIS panel; the global `ax` left by an earlier cell
    # belongs to another figure and must not be used here.
    sns.move_legend(panel_ax, "upper right")

# Keep the title of the lower panel clear of the tick labels of the upper one.
fig.tight_layout()


### Évolution de la gravité *Blessé hospitalisé* en 2019 ?

In [None]:
# 'grav' code -> label, in the column order expected by the colour palette.
severity_labels = {2: 'Tué', 3: 'Blessé hospitalisé', 4: 'Blessé léger', 1: 'Indemne'}

years  = range(start_year, end_year + 1)
months = range(1, 13)

y_m     = []   # first day of each month, used as time index
records = []   # one dict of shares per month

for year in years:
    for month in months:
        df_tmp = df[(df.an == year) & (df.mois == month)]
        # Share of every severity level for the month, computed once.
        shares = df_tmp.grav.value_counts(normalize=True)
        if shares.empty:
            # No row at all for this month: leave a gap, filled below.
            records.append({label: np.nan for label in severity_labels.values()})
        else:
            # A level that does not occur in a month that has data has a
            # share of 0, so the four shares still sum to 1.
            records.append({label: shares.get(code, 0.0) for code, label in severity_labels.items()})
        # pd.Timestamp avoids depending on whether the global name `datetime`
        # is currently the module or the class (both are imported in this
        # notebook).
        y_m.append(pd.Timestamp(year=year, month=month, day=1))

data = pd.DataFrame(records, columns=list(severity_labels.values()),
                    index=pd.DatetimeIndex(y_m, name='temps'))
# Months without data reuse the previous month's shares; leading empty months
# stay NaN (there is nothing to carry forward) instead of raising IndexError.
data = data.ffill()

sns.set_theme(style="white", palette=None)
# Size the figure explicitly: changing rcParams after plotting does not resize
# a figure that already exists.
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=data, palette=['red', 'orange', 'green', 'gray'], linewidth=1, ax=ax)
ax.set_title(f"Evolution des proportions des niveaux de gravité de {start_year} à {end_year}", fontsize=10)
# Vertical markers at January 2018 and April 2020.
# NOTE(review): their meaning is not documented in the notebook -- add a label.
ax.axvline(pd.Timestamp(2018, 1, 1), linewidth=1)
ax.axvline(pd.Timestamp(2020, 4, 1), linestyle='--', color='gray', linewidth=1)
ax.legend(loc='upper left')

# Kept from the original cell: sets the seaborn defaults and the default
# figure size inherited by the figures created in the following cells.
sns.set(rc={'figure.figsize':(15,6)});

In [None]:
# Disabled cell (kept commented out): draft of a 7x4 grid of stacked bar
# charts, one per non-excluded column, showing the row-normalised crosstab of
# that column against 'grav_lbl'.
# NOTE(review): the `k <= 1` guard limits the draft to the first two columns.
# colormap = 'tab20c'
# col = 'place'

# col_excluded = ['Num_Acc', 'num_veh', 'age', 'hrmn', 'dep', 'an', 'jour', 'an_nais']

# sns.set_theme(style="ticks")
# fig, axs = plt.subplots(7,4)
# fig.set_size_inches(20, 15)
# axs = np.reshape(axs, -1)
# plt.show()

# k = 0
# for col in df.columns:
#     if not (col in col_excluded) and (k <= 1):    
# #         if (k%5 == 0) : 
#         print(k)
#         df_cross = pd.crosstab(df[col], df.grav_lbl, normalize='index')
#         chart = df_cross.plot(kind="bar", stacked=True, rot=0, ax=axs[k], title=f"{col}", colormap=colormap)
# #         axs[k].yaxis.set_visible(False)
# #         chart.set_xticklabels(chart.get_xticklabels(), rotation=80)        
#         k += 1

In [None]:
# Summary of the cleaned frame: every column is listed (verbose=True) together
# with its non-null count (show_counts=True).
df.info(verbose=True, show_counts=True)

In [None]:
import datetime

years  = range(start_year, end_year+1)
months = range(1, 13)

# One entry per (year, month) of the study period: first day of the month and
# number of rows of df falling in that month.
y_m = [datetime.datetime(year, month, 1) for year in years for month in months]
acc = [df[(df.an==year) & (df.mois==month)].shape[0] for year in years for month in months]

df_acc = pd.DataFrame({'Accidents':acc, 'temps':y_m}).set_index('temps')

# 12-month rolling mean of the monthly counts, on the same time index.
acc_ma = df_acc.Accidents.rolling(12).mean()
df_acc_ma = acc_ma.to_frame(name='Moy mobile')

fig, ax = plt.subplots(figsize=(12,6))
ax.plot(df_acc, 'b', linewidth=1, label=f"Nombre mensuel d'accidents")
ax.plot(df_acc_ma, 'r', linestyle='dashed', label=f"Moyenne mobile sur 12 mois")
ax.set_title(f"Evolution du nombre d'accidents de {start_year} à {end_year}")
ax.grid()
ax.legend();

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
fig.suptitle("Distributions des modalités de 'grav' en fonction de l'âge des accidentés")

# (grav code, colour, legend label, target panel)
# Left panel: killed and hospitalised; right panel: unharmed and lightly injured.
panels = [
    (2, "red",     "Tué",                axs[0]),
    (3, "skyblue", "Blessé hospitalisé", axs[0]),
    (1, "gray",    "Indemne",            axs[1]),
    (4, "yellow",  "Blessé léger",       axs[1]),
]
for code, color, label, panel_ax in panels:
    sns.histplot(data=df[df.grav == code], x="age", color=color, label=label,
                 kde=True, bins=20, ax=panel_ax)

# plt.legend() only acts on the current axes (the right-hand panel), which
# leaves the left panel without its labels: add a legend to each panel.
for panel_ax in axs:
    panel_ax.legend();

In [None]:
# Overall share of each severity level: rows with a given 'grav' code divided
# by the total number of rows.
n_total       = df.shape[0]
n_killed      = df[df.grav == 2].shape[0]
n_hospital    = df[df.grav == 3].shape[0]
n_light       = df[df.grav == 4].shape[0]
n_unharmed    = df[df.grav == 1].shape[0]

print(f"Proportion de tués : {n_killed / n_total}")
print(f"Proportion de blessés hospitalisés : {n_hospital / n_total}")
print(f"Proportion de blessés légers : {n_light / n_total}")
print(f"Proportion de personnes indemnes : {n_unharmed / n_total}")

In [None]:
sns.set_theme(style="whitegrid", palette="pastel")

# Weekdays in calendar order for the x axis.
week_order = ['lundi', 'mardi', 'mercredi', 'jeudi', 'vendredi', 'samedi', 'dimanche']

# Number of rows per weekday, split by severity label.
df_tmp = df[['grav_lbl', 'joursem']]
ax = sns.countplot(data=df_tmp, x="joursem", hue="grav_lbl", order=week_order)
ax.set_title(f"Distributions des accidents selon les niveaux de gravité et les jours de la semaine");



In [None]:
# Every second year of the study period. np.arange excludes its stop value,
# hence end_year + 1 so that the last year is not silently left out (same
# convention as range(start_year, end_year + 1) used in the other cells).
years_shown = np.arange(start_year, end_year + 1, 2)

# One series of 'senc' values per selected year.
df_lst = [df[df.an == year].senc for year in years_shown]

plt.hist(df_lst, label=years_shown)
plt.title(f"Distribution de 'senc' par année (une année sur deux, {start_year}-{end_year})")
plt.xlabel("senc")
plt.legend();