# <span style="color: orange;">Preprocessing train_test </span>


### <span style="color: green;">Import des librairies</span>

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
import unicodedata

import sys
sys.path.append("../src")
from preprocessing import normalize_region_name

### <span style="color: green;">Chargement des données</span>

In [2]:
# Train et Test datasets
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9196 entries, 0 to 9195
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Id           9196 non-null   int64 
 1   week         9196 non-null   int64 
 2   region_code  9196 non-null   int64 
 3   region_name  9196 non-null   object
 4   TauxGrippe   9196 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 359.3+ KB


In [3]:
# Données démographiques
df_pop_train = pd.read_csv('../data/processed/pop_train.csv')
df_pop_test = pd.read_csv('../data/processed/pop_test.csv')
# Données Google Trends
df_requetes = pd.read_excel('../data/processed/google_trends_requetes.xlsx')

### <span style="color: green;">Preprocessing des données</span>

- Avant de faire les jointures, il faut s'assurer qu'il y a des clés de fusion de même format. 
- On va rajouter des colonnes sur la dimension temporelle pour pouvoir faire la fusion

In [4]:
# Création de la colonne 'year'
df_train['year'] = df_train['week'].astype(str).str[:4].astype(int)
df_test['year'] = df_test['week'].astype(str).str[:4].astype(int)
# Création de la colonne 'month' au format datetime (1er jour du mois)
df_train['month'] = pd.to_datetime(df_train['week'].astype(str) + '1', format='%Y%W%w').dt.to_period('M').dt.to_timestamp()
df_test['month'] = pd.to_datetime(df_test['week'].astype(str) + '1', format='%Y%W%w').dt.to_period('M').dt.to_timestamp()
# Vérification
display(df_train[['week', 'month', 'year']].sample(5))

Unnamed: 0,week,month,year
2684,200935,2009-08-01,2009
8139,200449,2004-12-01,2004
7925,200505,2005-01-01,2005
1198,201050,2010-12-01,2010
7904,200506,2005-02-01,2005


In [5]:
# Normalisation des noms de régions
df_train['region_normalized'] = df_train['region_name'].apply(normalize_region_name)
df_test['region_normalized'] = df_test['region_name'].apply(normalize_region_name)
# Vérification
display(df_train[['region_name', 'region_normalized']].sample(5))

Unnamed: 0,region_name,region_normalized
6958,CENTRE,CENTRE
4732,AUVERGNE,AUVERGNE
7885,FRANCHE-COMTE,FRANCHECOMTE
8439,LORRAINE,LORRAINE
5462,CENTRE,CENTRE


### <span style="color: green;">Fusions finales</span>

- On fusionne `df_train`et `df_train` avec `df_pop_train`et `df_pop_test`

In [6]:
# Fusion left join du df_train et df_pop
df_merged_train = pd.merge(df_train, df_pop_train, on=['year', 'region_normalized'], how='left')
df_merged_test = pd.merge(df_test, df_pop_test, on=['year', 'region_normalized'], how='left')
print("Shape après fusion :", df_merged_train.shape)
display(df_merged_train.sample(5))

Shape après fusion : (9196, 15)


Unnamed: 0,Id,week,region_code,region_name,TauxGrippe,year,month,region_normalized,region,pop_0_19,pop_20_39,pop_40_59,pop_60_74,pop_75_plus,pop_total
1514,7037,201036,54,POITOU-CHARENTES,0,2010,2010-09-01,POITOUCHARENTES,Poitou-Charentes,397865,399574,487864,285689,199371,1770363
6192,11715,200632,23,HAUTE-NORMANDIE,0,2006,2006-08-01,HAUTENORMANDIE,Haute-Normandie,476065,472501,505184,219139,138166,1811055
3353,8876,200905,43,FRANCHE-COMTE,516,2009,2009-02-01,FRANCHECOMTE,Franche-Comté,289478,293468,317840,164746,102676,1168208
8859,14382,200416,31,NORD-PAS-DE-CALAIS,0,2004,2004-04-01,NORDPASDECALAIS,Nord - Pas-de-Calais,1127000,1111444,1052694,448892,270518,4010548
8285,13808,200442,41,LORRAINE,13,2004,2004-10-01,LORRAINE,Lorraine,583501,625948,644420,305339,168928,2328136


- On rajoute `df_requetes`

In [7]:
df_merged_train = pd.merge(df_merged_train, df_requetes, on=['month', 'region_normalized'], how='left')
df_merged_test = pd.merge(df_merged_test, df_requetes, on=['month', 'region_normalized'], how='left')
print("Shape après fusion avec requêtes :", df_merged_train.shape)
display(df_merged_train.sample(5))

Shape après fusion avec requêtes : (9196, 19)


Unnamed: 0,Id,week,region_code,region_name,TauxGrippe,year,month,region_normalized,region,pop_0_19,pop_20_39,pop_40_59,pop_60_74,pop_75_plus,pop_total,Mois,requete_grippe,requete_grippe_aviaire_vaccin,requete_grippe_aviaire_vaccin_porcine_porc_H1N1_AH1N1_A_mexicaine_Mexique_pandemie
299,5822,201139,41,LORRAINE,8,2011,2011-09-01,LORRAINE,Lorraine,555561,594060,658097,336827,206112,2350657,2011-09,2,1,1
4311,9834,200814,82,RHONE-ALPES,135,2008,2008-04-01,RHONEALPES,Rhône-Alpes,1563820,1612445,1648989,797851,494124,6117229,2008-04,2,1,1
7407,12930,200529,31,NORD-PAS-DE-CALAIS,0,2005,2005-07-01,NORDPASDECALAIS,Nord - Pas-de-Calais,1119560,1104070,1067337,444839,279807,4015613,2005-07,3,2,2
8839,14362,200417,22,PICARDIE,0,2004,2004-04-01,PICARDIE,Picardie,510580,504779,518419,219485,130146,1883409,2004-04,9,9,9
6041,11564,200639,41,LORRAINE,26,2006,2006-09-01,LORRAINE,Lorraine,573475,614496,664908,299472,183398,2335749,2006-09,2,1,1


- On rajoute df_meteo

### <span style="color: green;">Nettoyages finaux</span>

In [11]:
df_merged_train.columns

Index(['Id', 'week', 'region_code', 'region_name', 'TauxGrippe', 'year',
       'month', 'region_normalized', 'region', 'pop_0_19', 'pop_20_39',
       'pop_40_59', 'pop_60_74', 'pop_75_plus', 'pop_total', 'Mois',
       'requete_grippe', 'requete_grippe_aviaire_vaccin',
       'requete_grippe_aviaire_vaccin_porcine_porc_H1N1_AH1N1_A_mexicaine_Mexique_pandemie'],
      dtype='object')

- On ne selectionne que les colonnes 'utiles'

In [25]:
# Liste des colonnes à conserver
keep_cols = ['Id', 'week', 'region_name', 'TauxGrippe',   # Les données du dataset central cible
             'pop_0_19', 'pop_20_39', 'pop_40_59', 'pop_60_74', 'pop_75_plus', 'pop_total', # Données démographiques
             'requete_grippe', 'requete_grippe_aviaire_vaccin', 'requete_grippe_aviaire_vaccin_porcine_porc_H1N1_AH1N1_A_mexicaine_Mexique_pandemie' # Données Google Trends
             # Données météo à ajouter plus tard
]
# Sélection
df_train = df_merged_train[keep_cols].copy()
df_test = df_merged_test[[c for c in keep_cols if c != 'TauxGrippe']].copy()
# Affichage pour vérification
print(f"Dimensions : \n Train : {df_train.shape} \n Test : {df_test.shape}")
display(df_train.head(), df_test.head())

Dimensions : 
 Train : (9196, 13) 
 Test : (2288, 12)


Unnamed: 0,Id,week,region_name,TauxGrippe,pop_0_19,pop_20_39,pop_40_59,pop_60_74,pop_75_plus,pop_total,requete_grippe,requete_grippe_aviaire_vaccin,requete_grippe_aviaire_vaccin_porcine_porc_H1N1_AH1N1_A_mexicaine_Mexique_pandemie
0,5523,201152,ALSACE,66,448112,479905,526481,250849,146978,1852325,2,2,2
1,5524,201152,AQUITAINE,24,732120,765143,896030,516065,344875,3254233,2,2,2
2,5525,201152,AUVERGNE,91,294709,305778,375485,223830,150880,1350682,3,3,2
3,5526,201152,BASSE-NORMANDIE,49,354514,336201,401766,228888,154315,1475684,2,2,2
4,5527,201152,BOURGOGNE,33,372023,369737,450664,269109,181201,1642734,2,2,1


Unnamed: 0,Id,week,region_name,pop_0_19,pop_20_39,pop_40_59,pop_60_74,pop_75_plus,pop_total,requete_grippe,requete_grippe_aviaire_vaccin,requete_grippe_aviaire_vaccin_porcine_porc_H1N1_AH1N1_A_mexicaine_Mexique_pandemie
0,3235,201352,ALSACE,446622,471708,529960,266350,153543,1868183,2,2,2
1,3236,201352,AQUITAINE,742922,761895,911307,547389,353376,3316889,3,2,2
2,3237,201352,AUVERGNE,294350,299336,375266,235294,153422,1357668,4,3,3
3,3238,201352,BASSE-NORMANDIE,352006,325231,400270,242425,158780,1478712,2,2,2
4,3239,201352,BOURGOGNE,370747,357632,447520,282940,183848,1642687,3,2,2


### <span style="color: green;">Export CSV</span>

In [28]:
df_train.to_csv('../data/processed/train.csv', index=False)
df_test.to_csv('../data/processed/test.csv', index=False)

### <span style="color: green;">Fin</span>