# Données Météo avec Pandas

3 librairies connexes:
- pandas: données (lecture, écriture et traitement)
- numpy: calcul (vecteurs, matrices, stats, ...)
- matplotlib: graphiques

**NB:** Vocabulaire Python
- module: fichier python (.py) contenant des fonctions, des types, ...
- package: répertoire qui contient
    - module(s)
    - package(s)
    - fichier: \_\_init\_\_.py (indique que le dossier est un package)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1. Import data
Une fonction de lecture par format : `pd.read_*` (`read_json`, `read_csv`, `read_excel`, `read_xml`, `read_sql`, ...)


In [8]:
# Main pd.read_csv parameters:
# - sep / delimiter: field separator, e.g. , ; \t or a space
# - quotechar: character that wraps a field, e.g. "data" or |data|
#      it lets a field contain the delimiter itself, e.g. "bureau 102, 2eme étage, bat A"
dfMeteoBassoCombo = pd.read_csv(
    'meteo/08-station-meteo-toulouse-basso-cambo.csv',
    delimiter=';',  # `delimiter` is the documented alias of `sep`
)
dfMeteoBassoCombo

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
0,10c914701aa00390ecbc4800,8.0,84.0,14.0,0.0,100100.0,114.0,ISS,0.0,315.0,7.0,18.0,14.6,2020-06-09T10:30:00+02:00,2020-06-09T08:30:00+00:00
1,10c815d0168804102c5a1800,8.0,81.0,13.0,0.0,99800.0,130.0,ISS,0.0,292.5,1.0,6.0,14.5,2020-06-08T13:15:00+02:00,2020-06-08T11:15:00+00:00
2,10c814d0862803910c584c00,8.0,69.0,12.0,0.0,99800.0,114.0,ISS,0.0,270.0,8.0,19.0,16.1,2020-06-08T11:15:00+02:00,2020-06-08T09:15:00+00:00
3,10c8145026680390ec584c00,8.0,77.0,12.0,0.0,99800.0,114.0,ISS,0.0,270.0,7.0,19.0,14.9,2020-06-08T10:15:00+02:00,2020-06-08T08:15:00+00:00
4,10c814101a8003e0cc584000,8.0,80.0,12.0,0.0,99800.0,124.0,ISS,0.0,270.0,6.0,16.0,14.6,2020-06-08T09:45:00+02:00,2020-06-08T07:45:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122044,105b4b6ce23800000ce00000,8.0,71.0,0.0,0.0,100300.0,0.0,ISS,0.0,0.0,0.0,0.0,1.8,2023-02-27T23:30:00+01:00,2023-02-27T22:30:00+00:00
122045,105b450c66b000000c400000,8.0,86.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,-1.9,2023-02-27T10:45:00+01:00,2023-02-27T09:45:00+00:00
122046,105b454c829800000c400000,8.0,83.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,0.0,2023-02-27T11:15:00+01:00,2023-02-27T10:15:00+00:00
122047,105b456c8a9800000c400000,8.0,83.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,0.2,2023-02-27T11:30:00+01:00,2023-02-27T10:30:00+00:00


In [9]:
# Preview only the first three rows
dfMeteoBassoCombo.head(n=3)

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
0,10c914701aa00390ecbc4800,8.0,84.0,14.0,0.0,100100.0,114.0,ISS,0.0,315.0,7.0,18.0,14.6,2020-06-09T10:30:00+02:00,2020-06-09T08:30:00+00:00
1,10c815d0168804102c5a1800,8.0,81.0,13.0,0.0,99800.0,130.0,ISS,0.0,292.5,1.0,6.0,14.5,2020-06-08T13:15:00+02:00,2020-06-08T11:15:00+00:00
2,10c814d0862803910c584c00,8.0,69.0,12.0,0.0,99800.0,114.0,ISS,0.0,270.0,8.0,19.0,16.1,2020-06-08T11:15:00+02:00,2020-06-08T09:15:00+00:00


In [10]:
# Preview only the last three rows
dfMeteoBassoCombo.tail(n=3)

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
122046,105b454c829800000c400000,8.0,83.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,0.0,2023-02-27T11:15:00+01:00,2023-02-27T10:15:00+00:00
122047,105b456c8a9800000c400000,8.0,83.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,0.2,2023-02-27T11:30:00+01:00,2023-02-27T10:30:00+00:00
122048,105c442c8e7000000d000000,8.0,78.0,0.0,0.0,100400.0,0.0,ISS,0.0,0.0,0.0,0.0,0.3,2023-02-28T09:00:00+01:00,2023-02-28T08:00:00+00:00


In [11]:
# NB: an integer column with missing values is read as float64 (the gaps become NaN);
# an integer column with every value present is read as int64
dfMeteoBassoCombo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122049 entries, 0 to 122048
Data columns (total 15 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   data                                        122049 non-null  object 
 1   id                                          122044 non-null  float64
 2   humidite                                    122043 non-null  float64
 3   direction_du_vecteur_de_vent_max            122043 non-null  float64
 4   pluie_intensite_max                         122043 non-null  float64
 5   pression                                    122043 non-null  float64
 6   direction_du_vecteur_vent_moyen             122043 non-null  float64
 7   type_de_station                             122043 non-null  object 
 8   pluie                                       122043 non-null  float64
 9   direction_du_vecteur_de_rafale_de_vent_max  122043 non-null  float64
 

In [13]:
# Access a column with brackets (the recipe that works 100% of the time)
dfMeteoBassoCombo['temperature']

0         14.6
1         14.5
2         16.1
3         14.9
4         14.6
          ... 
122044     1.8
122045    -1.9
122046     0.0
122047     0.2
122048     0.3
Name: temperature, Length: 122049, dtype: float64

In [14]:
# Access a column as an attribute; this recipe does NOT work when:
#  - the name contains a space or another character that is not allowed
#  - the name is ambiguous with a method name: sum, min, pop, ...
dfMeteoBassoCombo.temperature

0         14.6
1         14.5
2         16.1
3         14.9
4         14.6
          ... 
122044     1.8
122045    -1.9
122046     0.0
122047     0.2
122048     0.3
Name: temperature, Length: 122049, dtype: float64

In [18]:
# Statistics of the column with missing values ignored (NaN-insensitive)
# NB 1: floats follow the IEEE 754 standard
#       every pandas statistic is NaN-insensitive by default
# NB 2: each statistic is a method called on a pandas Series object
print("Somme:", dfMeteoBassoCombo["temperature"].sum())
print("Moyenne:", dfMeteoBassoCombo["temperature"].mean())
print("Minimum:", dfMeteoBassoCombo["temperature"].min())
print("Maximum:", dfMeteoBassoCombo["temperature"].max())

Somme: 1878882.2000000002
Moyenne: 15.395247576673802
Minimum: -50.1
Maximum: 39.7


In [20]:
# Sum of the temperature column, reached with bracket indexing
dfMeteoBassoCombo["temperature"].sum()

1878882.2000000002

In [23]:
# Raw data behind a column: a NumPy array, i.e. an ndarray.
# Series.to_numpy() is the accessor the pandas documentation recommends over
# the older .values attribute; for this float64 column it yields the same array.
rawData = dfMeteoBassoCombo.temperature.to_numpy()
rawData

array([14.6, 14.5, 16.1, ...,  0. ,  0.2,  0.3])

In [24]:
# Check the container type of the extracted data
type(rawData)

numpy.ndarray

In [32]:
# NumPy statistics (function or method) and Python's built-in sum are NaN-sensitive
(
    np.sum(rawData),  # NumPy function
    rawData.sum(),    # ndarray method
    sum(rawData),     # Python built-in
)

(nan, nan, nan)

In [33]:
# NumPy offers NaN-ignoring alternatives
(
    np.nansum(rawData),
    np.nanmean(rawData),
    np.nanstd(rawData),
)

(1878882.2000000002, 15.395247576673796, 7.943506160111766)

In [36]:
# Subset of the DataFrame whose temperature is missing (NaN, None, NaT, ...)
dfMeteoBassoCombo.loc[dfMeteoBassoCombo["temperature"].isna()]

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
4050,"1,79769313486232E+308",,,,,,,,,,,,,,
13662,0,0.0,,,,,,,,,,,,,
19829,"1,1940250424E+023",,,,,,,,,,,,,,
19830,"1,19402102248E+023",,,,,,,,,,,,,,
26883,"1,19401704238E+023",,,,,,,,,,,,,,
27420,"1,17607300638E+023",,,,,,,,,,,,,,


In [37]:
# Rows whose heure_utc value is missing
dfMeteoBassoCombo.loc[dfMeteoBassoCombo["heure_utc"].isna()]

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
4050,"1,79769313486232E+308",,,,,,,,,,,,,,
13170,3e065000051dffffffffffff,31.0,35.0,15.0,3.0,115500.0,510.0,sous station,19.0,337.5,174.0,174.0,-50.1,,
13662,0,0.0,,,,,,,,,,,,,
19829,"1,1940250424E+023",,,,,,,,,,,,,,
19830,"1,19402102248E+023",,,,,,,,,,,,,,
26883,"1,19401704238E+023",,,,,,,,,,,,,,
27420,"1,17607300638E+023",,,,,,,,,,,,,,


In [39]:
# Missing-value test on every column at once => something to build on...
dfMeteoBassoCombo.isna()

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122044,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
122045,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
122046,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
122047,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [43]:
# Collect the index labels of the rows whose heure_utc is missing
indexRowToDrop = dfMeteoBassoCombo.loc[dfMeteoBassoCombo["heure_utc"].isna()].index
indexRowToDrop

Int64Index([4050, 13170, 13662, 19829, 19830, 26883, 27420], dtype='int64')

In [44]:
# Drop rows by index label: a single label or several at once.
# Single-label form: dfMeteoBassoCombo.drop(index=122048)
# The result is only displayed here; it is not assigned back to dfMeteoBassoCombo.
dfMeteoBassoCombo.drop(index=indexRowToDrop)

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
0,10c914701aa00390ecbc4800,8.0,84.0,14.0,0.0,100100.0,114.0,ISS,0.0,315.0,7.0,18.0,14.6,2020-06-09T10:30:00+02:00,2020-06-09T08:30:00+00:00
1,10c815d0168804102c5a1800,8.0,81.0,13.0,0.0,99800.0,130.0,ISS,0.0,292.5,1.0,6.0,14.5,2020-06-08T13:15:00+02:00,2020-06-08T11:15:00+00:00
2,10c814d0862803910c584c00,8.0,69.0,12.0,0.0,99800.0,114.0,ISS,0.0,270.0,8.0,19.0,16.1,2020-06-08T11:15:00+02:00,2020-06-08T09:15:00+00:00
3,10c8145026680390ec584c00,8.0,77.0,12.0,0.0,99800.0,114.0,ISS,0.0,270.0,7.0,19.0,14.9,2020-06-08T10:15:00+02:00,2020-06-08T08:15:00+00:00
4,10c814101a8003e0cc584000,8.0,80.0,12.0,0.0,99800.0,124.0,ISS,0.0,270.0,6.0,16.0,14.6,2020-06-08T09:45:00+02:00,2020-06-08T07:45:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122044,105b4b6ce23800000ce00000,8.0,71.0,0.0,0.0,100300.0,0.0,ISS,0.0,0.0,0.0,0.0,1.8,2023-02-27T23:30:00+01:00,2023-02-27T22:30:00+00:00
122045,105b450c66b000000c400000,8.0,86.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,-1.9,2023-02-27T10:45:00+01:00,2023-02-27T09:45:00+00:00
122046,105b454c829800000c400000,8.0,83.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,0.0,2023-02-27T11:15:00+01:00,2023-02-27T10:15:00+00:00
122047,105b456c8a9800000c400000,8.0,83.0,0.0,0.0,99800.0,0.0,ISS,0.0,0.0,0.0,0.0,0.2,2023-02-27T11:30:00+01:00,2023-02-27T10:30:00+00:00


In [54]:
# WARNING: mind the `how` parameter ("any" | "all") when some columns are optional.
# how="any" (the dropna default, spelled out here) removes a row as soon as one
# of its values is missing. The cleaned frame is rebound to the same name through
# a method chain rather than mutated with inplace=True, and reset_index(drop=True)
# renumbers the rows from 0 without keeping the old index as a column.
dfMeteoBassoCombo = dfMeteoBassoCombo.dropna(how="any").reset_index(drop=True)

In [55]:
# Re-check the row count and the per-column non-null counts after dropping incomplete rows
dfMeteoBassoCombo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122042 entries, 0 to 122041
Data columns (total 15 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   data                                        122042 non-null  object 
 1   id                                          122042 non-null  float64
 2   humidite                                    122042 non-null  float64
 3   direction_du_vecteur_de_vent_max            122042 non-null  float64
 4   pluie_intensite_max                         122042 non-null  float64
 5   pression                                    122042 non-null  float64
 6   direction_du_vecteur_vent_moyen             122042 non-null  float64
 7   type_de_station                             122042 non-null  object 
 8   pluie                                       122042 non-null  float64
 9   direction_du_vecteur_de_rafale_de_vent_max  122042 non-null  float64
 

In [56]:
# Preview the first three rows of the frame
dfMeteoBassoCombo.head(n=3)

Unnamed: 0,data,id,humidite,direction_du_vecteur_de_vent_max,pluie_intensite_max,pression,direction_du_vecteur_vent_moyen,type_de_station,pluie,direction_du_vecteur_de_rafale_de_vent_max,force_moyenne_du_vecteur_vent,force_rafale_max,temperature,heure_de_paris,heure_utc
0,10c914701aa00390ecbc4800,8.0,84.0,14.0,0.0,100100.0,114.0,ISS,0.0,315.0,7.0,18.0,14.6,2020-06-09T10:30:00+02:00,2020-06-09T08:30:00+00:00
1,10c815d0168804102c5a1800,8.0,81.0,13.0,0.0,99800.0,130.0,ISS,0.0,292.5,1.0,6.0,14.5,2020-06-08T13:15:00+02:00,2020-06-08T11:15:00+00:00
2,10c814d0862803910c584c00,8.0,69.0,12.0,0.0,99800.0,114.0,ISS,0.0,270.0,8.0,19.0,16.1,2020-06-08T11:15:00+02:00,2020-06-08T09:15:00+00:00
