# Veille 3: A la découverte de la librairie Pandas

## - Importation de la librairie Pandas

In [1]:
import pandas as pd

## - Création de données : 
Il y a deux principaux objets en pandas : DataFrame et Series.

#### - DataFrame : 
Un DataFrame est une table. Il contient un tableau d'entrées individuelles, chacune ayant une certaine valeur. Chaque entrée correspond à une ligne (ou enregistrement) et une colonne.

In [16]:
pd.DataFrame({'Yes':[50., 21.], 'No':[131., 2.]})

Unnamed: 0,Yes,No
0,50.0,131.0
1,21.0,2.0


##### Les entrées DataFrame ne sont pas limitées aux entiers. On peut avoir d'autres types de données

In [8]:
pd.DataFrame({'Bod':['Il aime ça.', 'Je code.'], 'Sue':['Bonne journée.', 'Merveilleux']})

Unnamed: 0,Bod,Sue
0,Il aime ça.,Bonne journée.
1,Je code.,Merveilleux


In [157]:
#On peut aussi modifier le nom des index
pd.DataFrame({'Bod':['Il aime ça.', 'Je code.'], 'Sue':['Bonne journée.', 'Merveilleux']}, index = ['Option_A', 'Option'])

Unnamed: 0,Bod,Sue
Option_A,Il aime ça.,Bonne journée.
Option_B,Il aime ça.,Merveilleux


#### - Series :
Une series est une séquence de valeurs de données. Elle est considerée comme une liste, et par essence, elle est représente une seule colonne d'un DataFrame.

In [17]:
pd.Series([1., 2., 3., 4., 5., 6.])

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64

##### On peut attribuer des étiquettes de ligne à une Series de la même manière qu'avec un DataFrame, à l'aide d'un paramètre d'index. Cependant, une Series n'a pas de nom de colonne, elle n'a qu'un seul nom global.

In [19]:
pd.Series([15.75, 2.85, 501.], index = ['A', 'B', 'C'], name = 'Chiffres')

A     15.75
B      2.85
C    501.00
Name: Chiffres, dtype: float64

### - Lecture de fichiers de données (Reading data files) :
Un fichier CSV est un tableau de valeurs séparées par des virgules. D'où le nom : "Comma-Separated Values", ou CSV.

In [164]:
#Utilisation de la fonction pd.read_csv() pour lire les données (du fichier CSV) dans un DataFrame
Lecture = pd.read_csv('C:/Users/Usuario/Documents/Dossier L3/ODC_FORMATION/Veille_3_Pandas/covid.csv')
Lecture

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


In [21]:
#On utilise l'attribut shape pour vérifier la taille du DataFrame
Lecture.shape

(209, 16)

In [22]:
#On utilise la fonction head() pour afficher les 5 premières lignes du DataFrame
Lecture.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [30]:
#On peut utiliser l'attribut index_col pour modifier l'indexation automatique en une colonne du DataFrame en donnant son indexe
Lecture = pd.read_csv('C:/Users/Usuario/Documents/Dossier L3/ODC_FORMATION/Veille_3_Pandas/covid.csv', index_col = 1)
Lecture.head()

Unnamed: 0_level_0,Country/Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,USA,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
South America,Brazil,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
Asia,India,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
Europe,Russia,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
Africa,South Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


## - Indexation, sélection et affectation (Indexing, selecting, assigneting) : 

### - Introduction : 

In [92]:
Lecture = pd.read_csv('C:/Users/Usuario/Documents/Dossier L3/ODC_FORMATION/Veille_3_Pandas/covid.csv', index_col = 1)
pd.set_option("display.max_rows", 5)
Lecture

Unnamed: 0_level_0,Country/Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,USA,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
South America,Brazil,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Europe,Vatican City,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
Africa,Western Sahara,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


### - Accesseurs natifs (Native accessors) :

In [32]:
#Affichage de la colonne Continent
Lecture.Continent

0      North America
1      South America
2               Asia
3             Europe
4             Africa
           ...      
204    North America
205    North America
206    South America
207           Europe
208           Africa
Name: Continent, Length: 209, dtype: object

In [37]:
#Affichage de la colonne Continent en le passant en indexe
Lecture['Continent']

0      North America
1      South America
2               Asia
3             Europe
4             Africa
           ...      
204    North America
205    North America
206    South America
207           Europe
208           Africa
Name: Continent, Length: 209, dtype: object

In [39]:
#Affichage d'un élement de la colonne Continent en passant son num_indexe
Lecture['Continent'][206]

'South America'

### - Sélection basée sur l'index : 
On utilise l'opérateur d'accès : iloc.

In [42]:
#Affichage de la ligne (enregistrement) i de toutes les colonnes avec l'opérateur iloc[i]
Lecture.iloc[0]

Country/Region                USA
Continent           North America
Population            331198130.0
TotalCases                5032179
NewCases                      NaN
TotalDeaths              162804.0
NewDeaths                     NaN
TotalRecovered          2576668.0
NewRecovered                  NaN
ActiveCases             2292707.0
Serious,Critical          18296.0
Tot Cases/1M pop          15194.0
Deaths/1M pop               492.0
TotalTests             63139605.0
Tests/1M pop             190640.0
WHO Region               Americas
Name: 0, dtype: object

In [48]:
#Affichage de toutes les lignes de la première colonne
Lecture.iloc[:, 0]

0                        USA
1                     Brazil
2                      India
3                     Russia
4               South Africa
               ...          
204               Montserrat
205    Caribbean Netherlands
206         Falkland Islands
207             Vatican City
208           Western Sahara
Name: Country/Region, Length: 209, dtype: object

In [49]:
#Affichage de la ligne 200 à la ligne 206 (206 est exclus) de la deuxième colonne
Lecture.iloc[200:206, 1]

200             Asia
201    North America
202    North America
203    North America
204    North America
205    North America
Name: Continent, dtype: object

In [54]:
#Affichage des 5 dernièrres lignes de la deuxième colonne
Lecture.iloc[-5:, 1]

204    North America
205    North America
206    South America
207           Europe
208           Africa
Name: Continent, dtype: object

In [53]:
#Renvoie de la liste correspondante en passant en paramètre num_indexe des lignes à afficher avec iloc[[i, j, k], num_indexe_col]
Lecture.iloc[[200, 203, 208], 1]

200             Asia
203    North America
208           Africa
Name: Continent, dtype: object

In [55]:
#Affichage de la ligne 1 à la ligne 12 par pas de 2(12 est exclus) de la deuxième colonne
Lecture.iloc[1:12:2, 1]

1     South America
3            Europe
5     North America
7     South America
9            Europe
11           Europe
Name: Continent, dtype: object

### - Sélection basée sur l'index : 
On utilise l'opérateur d'accès : loc.

In [103]:
#Affichage de la valeur de la donnée à l'indexe i dans la colonne (etiquette) spécifiée avec loc[i, 'nom_col']
Lecture.loc[3, 'Continent']

'Europe'

In [66]:
#Affichage de toutes les valeurs des données dans les colonne ('Continent', 'Country/Region', 'Population') 
Lecture.loc[:, ['Continent', 'Country/Region', 'Population']]

Unnamed: 0,Continent,Country/Region,Population
0,North America,USA,3.311981e+08
1,South America,Brazil,2.127107e+08
2,Asia,India,1.381345e+09
3,Europe,Russia,1.459409e+08
4,Africa,South Africa,5.938157e+07
...,...,...,...
204,North America,Montserrat,4.992000e+03
205,North America,Caribbean Netherlands,2.624700e+04
206,South America,Falkland Islands,3.489000e+03
207,Europe,Vatican City,8.010000e+02


### - Manipulation de l'indice :

In [96]:
#Affichage du DataFrame avec Continent comme colonne d'indice
Lecture.set_index("Continent")

Unnamed: 0_level_0,Country/Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,USA,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
South America,Brazil,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Europe,Vatican City,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
Africa,Western Sahara,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


### - Sélection conditionnelle : 

In [99]:
#Affiche true lorsque l'enregistrement contient la valeur 'Africa' dans cas contraire false
Lecture.Continent == 'Africa'

0      False
1      False
       ...  
207    False
208     True
Name: Continent, Length: 209, dtype: bool

In [108]:
#Affichage de l'enregistrement contenant la valeur 'Africa' de la colonne Continent
Lecture.loc[Lecture.Continent == 'Africa']

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
4,South Africa,Africa,5.93816e+07,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa
26,Egypt,Africa,1.02517e+08,95006,,4951.0,,48898.0,,41157.0,41.0,927.0,48.0,135000.0,1317.0,EasternMediterranean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,Seychelles,Africa,9.84080e+04,126,,,,124.0,,0.0,,1280.0,,,,Africa
208,Western Sahara,Africa,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


In [115]:
#Affichage de l'enregistrement contenant la valeur 'Africa' de colonne Continent et TotalCases  <= 300
Lecture.loc[(Lecture.Continent == 'Africa') & (Lecture.TotalCases <= 300)]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
169,Eritrea,Africa,3551175.0,282,,,,225.0,,57.0,,79.0,,,,Africa
184,Seychelles,Africa,98408.0,126,,,,124.0,,0.0,,1280.0,,,,Africa
208,Western Sahara,Africa,598682.0,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


In [116]:
#Affichage de l'enregistrement contenant la valeur 'Africa' de colonne Continent ou TotalCases  <= 300
Lecture.loc[(Lecture.Continent == 'Africa') | (Lecture.TotalCases <= 300)]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
4,South Africa,Africa,5.93816e+07,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa
26,Egypt,Africa,1.02517e+08,95006,,4951.0,,48898.0,,41157.0,41.0,927.0,48.0,135000.0,1317.0,EasternMediterranean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


#####  Pandas est livré avec quelques sélecteurs conditionnels intégrés, dont : isin, isnull, notnull.

In [117]:
#Permet de faire la sélection des enregistrements contenant 'Africa' et 'Europe' dans la colonne Continent  
Lecture.loc[Lecture.Continent.isin(['Africa', 'Europe'])]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
3,Russia,Europe,1.45941e+08,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,5.93816e+07,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


In [120]:
#Permet de faire la sélection des enregistrements contenant NaN (vides) dans la colonne NewDeaths  
Lecture.loc[Lecture.NewDeaths.isnull()]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


In [121]:
#Permet de faire la sélection des enregistrements non vides dans la colonne NewDeaths  
Lecture.loc[Lecture.NewDeaths.notnull()]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
5,Mexico,North America,129066000.0,462690,6590.0,50517.0,819.0,308848.0,4140.0,103325.0,3987.0,3585.0,391.0,1056915.0,8189.0,Americas
28,Bolivia,South America,11688500.0,86423,1282.0,3465.0,80.0,27373.0,936.0,55585.0,71.0,7394.0,296.0,183583.0,15706.0,Americas
72,S. Korea,Asia,51273700.0,14519,20.0,303.0,0.0,13543.0,42.0,673.0,18.0,283.0,6.0,1613652.0,31471.0,WesternPacific


### - Assignation de donnée :

In [125]:
#Ajoute une nouvelle colonne Critic et attribut à tous ces enregistrement la valeur 'everyone'
Lecture['Critic'] = 'everyone'
Lecture.Critic

0      everyone
1      everyone
         ...   
207    everyone
208    everyone
Name: Critic, Length: 209, dtype: object

In [128]:
len(Lecture)

209

In [130]:
#Ajoute une nouvelle colonne index_backwards et attribut à tous ces enregistrement une valeur de 209 à 1
Lecture['index_backwards'] = range(len(Lecture), 0, -1)
Lecture['index_backwards']

0      209
1      208
      ... 
207      2
208      1
Name: index_backwards, Length: 209, dtype: int64

## - Fonctions récapitulatives et cartes :

### - Introduction :

In [131]:
import numpy as np
Lecture = pd.read_csv('C:/Users/Usuario/Documents/Dossier L3/ODC_FORMATION/Veille_3_Pandas/covid.csv', index_col = 1)
pd.set_option("display.max_rows", 5)
Lecture

Unnamed: 0_level_0,Country/Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,USA,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
South America,Brazil,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Europe,Vatican City,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe
Africa,Western Sahara,5.98682e+05,10,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


### - Fonctions récapitulatives (Summary founction) :
On a describe() : elle génère un résumé de haut niveau des attributs de la colonne donnée. Il est sensible au type, ce qui signifie que sa sortie change en fonction du type de données de l'entrée. Elle donne une description statistique de la colonne donnée 

In [134]:
#La sortie ci-dessous n'a de sens que pour les données numériques
Lecture.Population.describe()

count    2.08000e+02
mean     3.04155e+07
            ...     
75%      2.57561e+07
max      1.38134e+09
Name: Population, Length: 8, dtype: float64

In [135]:
#La sortie ci-dessous n'a de sens que pour les chaines de caractères (String)
Lecture.Continent.describe()

count        208
unique         6
top       Africa
freq          57
Name: Continent, dtype: object

##### Pour obtenir des statistiques récapitulatives simples et particulières sur une colonne dans un DataFrame ou une Series, il existe généralement quelques fonctions pandas utilent qui permettent de le faire : mean(), unique(), value_counts()

In [141]:
#mean() = Renvoie la moyenne de la colonne specifiée (mean est utilisé uniquement avec les données numériques)
Lecture.Population.mean()

30415486.971153848

In [142]:
#unique() = Renvoie les valeurs uniques contenues dans la colonne specifiée
Lecture.Continent.unique()

array(['North America', 'South America', 'Asia', 'Europe', 'Africa',
       'Australia/Oceania', nan], dtype=object)

In [147]:
#value_counts() = Renvoie une série contenant le nombre de valeurs uniques dans une colonne specifique.
Lecture.NewDeaths.value_counts()

819.0    1
80.0     1
0.0      1
Name: NewDeaths, dtype: int64

### - Maps :
En science des données, nous avons souvent besoin de créer de nouvelles représentations à partir de données existantes ou de transformer des données du format dans lequel elles se trouvent actuellement au format dans lequel nous voulons qu'elles soient plus tard. Les cartes (Maps) sont ce qui gère ce travail. Il existe deux méthodes de mappage que vous utiliserez souvent : map(), apply().

In [153]:
essai_1 = Lecture.Population.mean()
Lecture.Population.map(lambda p : p - essai_1)

0      3.00783e+08
1      1.82295e+08
          ...     
207   -3.04147e+07
208   -2.98168e+07
Name: Population, Length: 209, dtype: float64

In [170]:
essai_2 = Lecture.TotalCases.mean()
def funct_TCases(row):
    row.TotalCases = row.TotalCases - essai_2
    return row
a = Lecture.apply(funct_TCases, axis = 'columns')
a

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,3.31198e+08,4.94046e+06,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,2.12711e+08,2.82584e+06,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,8.01000e+02,-9.17065e+04,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,5.98682e+05,-9.17085e+04,,0.0,,8.0,,0.0,,17.0,0.0,,,Africa


In [171]:
a.head(1)

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198000.0,4940460.0,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas


In [169]:
#Pour redefinir notre colonne TotalCases, on peut faire le mapping
essai_2 = Lecture.TotalCases.mean()
Lecture.TotalCases - essai_2

0      4.94046e+06
1      2.82584e+06
          ...     
207   -9.17065e+04
208   -9.17085e+04
Name: TotalCases, Length: 209, dtype: float64

In [199]:
Lecture.rename(columns = {'Country/Region': 'Country_Region'}, inplace = True)

In [200]:
Lecture

Unnamed: 0_level_0,Country_Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,USA,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
South America,Brazil,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
Asia,India,1.38134e+09,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
Europe,Russia,1.45941e+08,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
Africa,South Africa,5.93816e+07,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
North America,Montserrat,4.99200e+03,13,,0.0,,10.0,,0.0,,2604.0,200.0,61.0,12220.0,
North America,Caribbean Netherlands,2.62470e+04,13,,,,7.0,,6.0,,495.0,,424.0,16154.0,
South America,Falkland Islands,3.48900e+03,13,,,,13.0,,0.0,,3726.0,,1816.0,520493.0,
Europe,Vatican City,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe


In [181]:
#Concatenetion de deux Series de longueur égale
Lecture.Continent +"  ->  "+ Lecture.Country_Region

0          North America  ->  USA
1       South America  ->  Brazil
                  ...            
207      Europe  ->  Vatican City
208    Africa  ->  Western Sahara
Length: 209, dtype: object

## - Regroupement et tri

### - Groupwise analysis (Analyse par groupe) :

In [201]:
import numpy as np
Lecture = pd.read_csv('C:/Users/Usuario/Documents/Dossier L3/ODC_FORMATION/Veille_3_Pandas/covid.csv', index_col = 1)
pd.set_option("display.max_rows", 20)
Lecture.rename(columns = {'Country/Region': 'Country_Region'}, inplace = True)
Lecture

Unnamed: 0_level_0,Country_Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
North America,USA,3.31198e+08,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
South America,Brazil,2.12711e+08,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
Asia,India,1.38134e+09,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
Europe,Russia,1.45941e+08,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
Africa,South Africa,5.93816e+07,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
North America,Montserrat,4.99200e+03,13,,0.0,,10.0,,0.0,,2604.0,200.0,61.0,12220.0,
North America,Caribbean Netherlands,2.62470e+04,13,,,,7.0,,6.0,,495.0,,424.0,16154.0,
South America,Falkland Islands,3.48900e+03,13,,,,13.0,,0.0,,3726.0,,1816.0,520493.0,
Europe,Vatican City,8.01000e+02,12,,,,12.0,,0.0,,14981.0,,,,Europe


In [202]:
#Grouper le cas TotalDeaths en fonction de du nombre de compte
Lecture.groupby('TotalDeaths').TotalDeaths.count()

TotalDeaths
0.0         7
0.0         3
0.0         5
0.0         1
0.0         3
           ..
41638.0     1
46413.0     1
50517.0     1
98644.0     1
162804.0    1
Name: TotalDeaths, Length: 150, dtype: int64

In [203]:
#Groupement de cas total en foction du minimum des cas total de mort
Lecture.groupby('TotalCases').TotalDeaths.min()

TotalCases
10              0.0
12              NaN
13              0.0
14              NaN
17              NaN
             ...   
538184       9604.0
871894      14606.0
2025409     41638.0
2917562     98644.0
5032179    162804.0
Name: TotalDeaths, Length: 206, dtype: float64

In [206]:
#Renvoie le nombre de population correspondant à la première occurence de chaque continent dans la colonne Continent
Lecture.groupby('Continent').apply(lambda df : df.Population.iloc[0]) 

Continent
Africa               5.93816e+07
Asia                 1.38134e+09
Australia/Oceania    2.55289e+07
Europe               1.45941e+08
North America        3.31198e+08
South America        2.12711e+08
dtype: float64

In [212]:
# idxmax() = renvoie l'index de retour de la première occurrence du maximum sur l'axe demandé.
#Permet de grouper les colonnes Continent et TotalCases en fonction de la première occurrence du maximum sur l'axe Population
Lecture.groupby(['Continent', 'TotalCases']).apply(lambda df : df.loc[df.Population.idxmax()]).tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,Country_Region,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
Continent,TotalCases,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
South America,455409,Tot Cases/1M pop,13793.0,,,,,,,,,,,,,,,
South America,455409,Deaths/1M pop,619.0,,,,,,,,,,,,,,,
South America,455409,TotalTests,2493429.0,,,,,,,,,,,,,,,
South America,455409,Tests/1M pop,75521.0,,,,,,,,,,,,,,,
South America,455409,WHO Region,Americas,,,,,,,,,,,,,,,
South America,2917562,Country_Region,Brazil,,,,,,,,,,,,,,,
South America,2917562,Population,212710692.0,,,,,,,,,,,,,,,
South America,2917562,TotalCases,2917562,,,,,,,,,,,,,,,
South America,2917562,NewCases,,,,,,,,,,,,,,,,
South America,2917562,TotalDeaths,98644.0,,,,,,,,,,,,,,,


In [214]:
#Fait un groupement de continent en fonction du nombre de cas, du minimum, et du maximum des cas totaux
Lecture.groupby(['Continent']).TotalCases.agg([len, min, max])

Unnamed: 0_level_0,len,min,max
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,57,10,538184
Asia,48,20,2025409
Australia/Oceania,6,22,19890
Europe,48,12,871894
North America,35,13,5032179
South America,14,13,2917562


### - Multi-indexes

In [231]:
essai_3 = Lecture.groupby(['Continent', 'Country_Region']).TotalCases.agg([len])
essai_3

Unnamed: 0_level_0,Unnamed: 1_level_0,len
Continent,Country_Region,Unnamed: 2_level_1
Africa,Algeria,1
Africa,Angola,1
Africa,Benin,1
Africa,Botswana,1
Africa,Burkina Faso,1
...,...,...
South America,Paraguay,1
South America,Peru,1
South America,Suriname,1
South America,Uruguay,1


In [225]:
#mi est multi-index
mi = essai_3.index
type(mi)

pandas.core.indexes.multi.MultiIndex

In [226]:
#reconversion en index normal
essai_3.reset_index()

Unnamed: 0,Continent,Country_Region,len
0,Africa,Algeria,1
1,Africa,Angola,1
2,Africa,Benin,1
3,Africa,Botswana,1
4,Africa,Burkina Faso,1
...,...,...,...
203,South America,Paraguay,1
204,South America,Peru,1
205,South America,Suriname,1
206,South America,Uruguay,1


### - Sorting (Tri) : 
Pour obtenir les données dans l'ordre souhaité, on utilise la méthode sort_values pour les trier.

In [227]:
#Groupements dans les colonnes Continent et Country_Region triées en fonction de leurs nombres de manière croissante.
essai_3 = essai_3.reset_index()
essai_3.sort_values(by = 'len').tail(20)

Unnamed: 0,Continent,Country_Region,len
58,Asia,Armenia,1
59,Asia,Azerbaijan,1
60,Asia,Bahrain,1
61,Asia,Bangladesh,1
62,Asia,Bhutan,1
63,Asia,Brunei,1
64,Asia,Cambodia,1
65,Asia,Cyprus,1
66,Asia,Georgia,1
67,Asia,Hong Kong,1


In [232]:
#Groupements dans les colonnes Continent et Country_Region triées en fonction de leurs nombres de manière decroissante.
essai_3 = essai_3.reset_index()
essai_3.sort_values(by = 'len', ascending = False).tail(20)

Unnamed: 0,Continent,Country_Region,len
57,Asia,Afghanistan,1
58,Asia,Armenia,1
59,Asia,Azerbaijan,1
60,Asia,Bahrain,1
61,Asia,Bangladesh,1
62,Asia,Bhutan,1
63,Asia,Brunei,1
65,Asia,Cyprus,1
76,Asia,Kuwait,1
66,Asia,Georgia,1


In [236]:
#tri par valeurs d'index de manière decroissante
essai_3.sort_index(ascending = False)

Unnamed: 0,Continent,Country_Region,len
207,South America,Venezuela,1
206,South America,Uruguay,1
205,South America,Suriname,1
204,South America,Peru,1
203,South America,Paraguay,1
...,...,...,...
4,Africa,Burkina Faso,1
3,Africa,Botswana,1
2,Africa,Benin,1
1,Africa,Angola,1


In [235]:
#Tri par valeur de pays et nombre d'occurence dans les colonnes Continent et Country_Region
essai_3.sort_values(['Country_Region', 'len'])

Unnamed: 0,Continent,Country_Region,len
57,Asia,Afghanistan,1
111,Europe,Albania,1
0,Africa,Algeria,1
112,Europe,Andorra,1
1,Africa,Angola,1
...,...,...,...
103,Asia,Vietnam,1
54,Africa,Western Sahara,1
104,Asia,Yemen,1
55,Africa,Zambia,1
