# Intro à l'analyse de données avec Numpy

## Introduction à Numpy

### Les listes

In [1]:
# A nested list: each inner list pairs a city name (str) with a number (int).
list_of_list = [
    ["Bayonne", 64],
    ["Toulouse", 31],
    ["Bordeaux", 33],
]
list_of_list

[['Bayonne', 64], ['Toulouse', 31], ['Bordeaux', 33]]

In [3]:
# The first index picks the inner list at position 1, the second picks the element inside it.
list_of_list[1][0]

'Toulouse'

In [2]:
# A Python list can mix element types (int and str here)...
liste = [1,"plages",8]
# ...and can grow in place after it has been created.
liste.append("montagne")
liste

[1, 'plages', 8, 'montagne']

Avantages :
 - différents types de données
 - modification dynamique
 
Inconvénients :
 - beaucoup de mémoire et d'énergie consommées
 - problème quand la taille des datasets augmente

### Les tableaux

#### Type ndarray = tableaux à n dimensions

In [4]:
import numpy as np

In [17]:
# Build a 1-D array from a Python list, then display it together with its type.
vector = np.array([2,4,9,17])
vector, type(vector)

(array([ 2,  4,  9, 17]), numpy.ndarray)

In [13]:
# Integers and strings are mixed on purpose: an array holds a single element
# type, and the displayed result shows every value stored as a string.
matrix = np.array([
    [1, 1, 18],
    [23, 4, '90'],
    [17, '11', 87],
    [18, 12, 88],
])
matrix

array([['1', '1', '18'],
       ['23', '4', '90'],
       ['17', '11', '87'],
       ['18', '12', '88']], dtype='<U21')

In [55]:
# Same values as integers only, so the array gets an integer dtype.
matrix = np.array([
    [1, 1, 18],
    [23, 4, 90],
    [17, 11, 87],
    [18, 12, 88],
])
matrix

array([[ 1,  1, 18],
       [23,  4, 90],
       [17, 11, 87],
       [18, 12, 88]])

In [9]:
# dtype reports the element type shared by every value of the array.
matrix.dtype

dtype('int64')

#### Taille d'un tableau

In [18]:
# shape attribute: a tuple giving the length of each dimension
print(vector.shape, matrix.shape)

(4,) (4, 3)


In [22]:
# shape[0] is the number of rows, shape[1] the number of columns.
print(matrix.shape[0])
print(matrix.shape[1])

4
3


In [20]:
# len() only gives the length of the first dimension (the number of rows for matrix).
len(vector), len(matrix)

(4, 4)

In [19]:
# size attribute (not a method, so no parentheses): total number of elements
print(vector.size,matrix.size)

4 12


#### Lecture d'un fichier de données

In [27]:
# Load the CSV file with numpy.genfromtxt() (numpy.loadtxt() is the alternative).
# No dtype is requested here; the printed dtype shows the result is float64.
data0 = np.genfromtxt("Data/world-alcohol.csv", delimiter = ",")

# Common data types: bool, int (16, 32, 64), float (16, 32, 64) and string
print(type(data0),data0.dtype)

<class 'numpy.ndarray'> float64


In [28]:
# Fields that could not be read as floats (header line, text columns) show up as nan.
data0

array([[      nan,       nan,       nan,       nan,       nan],
       [1.986e+03,       nan,       nan,       nan, 0.000e+00],
       [1.986e+03,       nan,       nan,       nan, 5.000e-01],
       ...,
       [1.986e+03,       nan,       nan,       nan, 2.540e+00],
       [1.987e+03,       nan,       nan,       nan, 0.000e+00],
       [1.986e+03,       nan,       nan,       nan, 5.150e+00]])

In [29]:
# dtype="U75" reads every field as a Unicode string of at most 75 characters,
# which keeps the text columns readable.
data1 = np.genfromtxt("Data/world-alcohol.csv", delimiter = ",", dtype="U75")
# Display data0, data1 and data to see the effect of the dtype and skip_header parameters.

In [30]:
# The header line is still present as the first row of the array.
data1

array([['Year', 'WHO region', 'Country', 'Beverage Types',
        'Display Value'],
       ['1986', 'Western Pacific', 'Viet Nam', 'Wine', '0'],
       ['1986', 'Americas', 'Uruguay', 'Other', '0.5'],
       ...,
       ['1986', 'Europe', 'Switzerland', 'Spirits', '2.54'],
       ['1987', 'Western Pacific', 'Papua New Guinea', 'Other', '0'],
       ['1986', 'Africa', 'Swaziland', 'Other', '5.15']], dtype='<U75')

In [31]:
# skip_header=1 skips the first line of the file, so the header is no longer part of the array.
data = np.genfromtxt("Data/world-alcohol.csv", delimiter = ",", dtype="U75", skip_header =1)
print(data)

[['1986' 'Western Pacific' 'Viet Nam' 'Wine' '0']
 ['1986' 'Americas' 'Uruguay' 'Other' '0.5']
 ['1985' 'Africa' "Cte d'Ivoire" 'Wine' '1.62']
 ...
 ['1986' 'Europe' 'Switzerland' 'Spirits' '2.54']
 ['1987' 'Western Pacific' 'Papua New Guinea' 'Other' '0']
 ['1986' 'Africa' 'Swaziland' 'Other' '5.15']]


#### Extraction de valeurs et sous-ensembles

Exécuter les cellules suivantes pour voir et comprendre ce qui se passe. C'est ce qu'on appelle le *slicing*.

In [32]:
# Single element: row 0, column 3.
data[0,3]

'Wine'

In [35]:
# Whole first row: row 0, every column.
data[0,:]

array(['1986', 'Western Pacific', 'Viet Nam', 'Wine', '0'], dtype='<U75')

In [36]:
# Whole column 3: every row.
data[:,3]

array(['Wine', 'Other', 'Wine', ..., 'Spirits', 'Other', 'Other'],
      dtype='<U75')

In [37]:
# .T is the transpose: the two dimensions are swapped.
print(data.shape,data.T.shape)

(3257, 5) (5, 3257)


In [38]:
# Row 3 of the transpose holds the same values as column 3 of the original array.
data.T[3,:]

array(['Wine', 'Other', 'Wine', ..., 'Spirits', 'Other', 'Other'],
      dtype='<U75')

In [39]:
# Rows 0 to 2 (the stop index 3 is excluded), every column.
data[0:3,:]

array([['1986', 'Western Pacific', 'Viet Nam', 'Wine', '0'],
       ['1986', 'Americas', 'Uruguay', 'Other', '0.5'],
       ['1985', 'Africa', "Cte d'Ivoire", 'Wine', '1.62']], dtype='<U75')

In [40]:
# First three rows, first two columns (an omitted start means "from the beginning").
data[:3,:2]

array([['1986', 'Western Pacific'],
       ['1986', 'Americas'],
       ['1985', 'Africa']], dtype='<U75')

In [41]:
# Every row, columns 2 to 4.
data[:,2:5]

array([['Viet Nam', 'Wine', '0'],
       ['Uruguay', 'Other', '0.5'],
       ["Cte d'Ivoire", 'Wine', '1.62'],
       ...,
       ['Switzerland', 'Spirits', '2.54'],
       ['Papua New Guinea', 'Other', '0'],
       ['Swaziland', 'Other', '5.15']], dtype='<U75')

In [43]:
# Every row, last two columns (negative indices count from the end).
data[:,-2:]

array([['Wine', '0'],
       ['Other', '0.5'],
       ['Wine', '1.62'],
       ...,
       ['Spirits', '2.54'],
       ['Other', '0'],
       ['Other', '5.15']], dtype='<U75')

## Analyse de données avec Numpy

In [44]:
# Work on the same data set as before (read again from the CSV file).
data = np.genfromtxt("Data/world-alcohol.csv", delimiter = ",", dtype="U75", skip_header =1)

### Comparer

La possibilité d'effectuer des comparaisons élément par élément est ce qui rend la bibliothèque Numpy très intéressante !

In [45]:
vector

array([ 2,  4,  9, 17])

In [52]:
# Element-wise comparison: the result holds one boolean per element.
vector==9

array([False, False,  True, False])

In [58]:
# Indexing with a boolean mask keeps only the elements where the mask is True.
vector[vector==9]

array([9])

In [59]:
matrix

array([[ 1,  1, 18],
       [23,  4, 90],
       [17, 11, 87],
       [18, 12, 88]])

In [64]:
matrix==1

array([[ True,  True, False],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

In [65]:
# On a 2-D array, a full boolean mask returns the matching elements as a 1-D array.
matrix[matrix==1]

array([1, 1])

In [66]:
# One boolean per row: True where column 2 (the country) equals 'France'.
data[:,2]=='France'

array([False, False, False, ..., False, False, False])

In [67]:
# A 1-D row mask selects whole rows: keep the rows whose country is 'France'.
data[data[:,2]=='France']

array([['1985', 'Europe', 'France', 'Spirits', '2.7'],
       ['1989', 'Europe', 'France', 'Wine', '10.24'],
       ['1986', 'Europe', 'France', 'Other', '0.25'],
       ['1989', 'Europe', 'France', 'Other', '0.27'],
       ['1984', 'Europe', 'France', 'Spirits', '2.58'],
       ['1987', 'Europe', 'France', 'Beer', '2.45'],
       ['1986', 'Europe', 'France', 'Spirits', '2.71'],
       ['1989', 'Europe', 'France', 'Beer', '2.56'],
       ['1986', 'Europe', 'France', 'Beer', '2.55'],
       ['1984', 'Europe', 'France', 'Beer', '2.62'],
       ['1984', 'Europe', 'France', 'Other', '0.24'],
       ['1985', 'Europe', 'France', 'Wine', '11.1'],
       ['1985', 'Europe', 'France', 'Other', '0.25'],
       ['1986', 'Europe', 'France', 'Wine', '10.62'],
       ['1989', 'Europe', 'France', 'Spirits', '2.98'],
       ['1985', 'Europe', 'France', 'Beer', '2.54'],
       ['1987', 'Europe', 'France', 'Wine', '10.42'],
       ['1984', 'Europe', 'France', 'Wine', '11.45'],
       ['1987', 'Europe', '

In [68]:
# | is the element-wise OR. Each comparison needs its own parentheses because
# | binds more tightly than ==.
data[(data[:,2]=='Canada')|(data[:,2]=='France')]

array([['1985', 'Europe', 'France', 'Spirits', '2.7'],
       ['1984', 'Americas', 'Canada', 'Spirits', '3.35'],
       ['1989', 'Europe', 'France', 'Wine', '10.24'],
       ['1989', 'Americas', 'Canada', 'Wine', '1.27'],
       ['1984', 'Americas', 'Canada', 'Beer', '5'],
       ['1985', 'Americas', 'Canada', 'Beer', '4.94'],
       ['1986', 'Europe', 'France', 'Other', '0.25'],
       ['1987', 'Americas', 'Canada', 'Wine', '1.3'],
       ['1989', 'Europe', 'France', 'Other', '0.27'],
       ['1987', 'Americas', 'Canada', 'Beer', '4.83'],
       ['1984', 'Europe', 'France', 'Spirits', '2.58'],
       ['1987', 'Europe', 'France', 'Beer', '2.45'],
       ['1986', 'Americas', 'Canada', 'Other', ''],
       ['1986', 'Europe', 'France', 'Spirits', '2.71'],
       ['1989', 'Europe', 'France', 'Beer', '2.56'],
       ['1986', 'Europe', 'France', 'Beer', '2.55'],
       ['1986', 'Americas', 'Canada', 'Spirits', '3.11'],
       ['1985', 'Americas', 'Canada', 'Spirits', '3.21'],
       ['1984',

In [69]:
# Pitfall: & binds more tightly than |, so this is read as
#   Canada | (France & 1986)
# and returns every Canada row (all years) plus only the 1986 rows for France.
data[(data[:,2]=='Canada') | (data[:,2]=='France') & (data[:,0] == '1986')]

array([['1984', 'Americas', 'Canada', 'Spirits', '3.35'],
       ['1989', 'Americas', 'Canada', 'Wine', '1.27'],
       ['1984', 'Americas', 'Canada', 'Beer', '5'],
       ['1985', 'Americas', 'Canada', 'Beer', '4.94'],
       ['1986', 'Europe', 'France', 'Other', '0.25'],
       ['1987', 'Americas', 'Canada', 'Wine', '1.3'],
       ['1987', 'Americas', 'Canada', 'Beer', '4.83'],
       ['1986', 'Americas', 'Canada', 'Other', ''],
       ['1986', 'Europe', 'France', 'Spirits', '2.71'],
       ['1986', 'Europe', 'France', 'Beer', '2.55'],
       ['1986', 'Americas', 'Canada', 'Spirits', '3.11'],
       ['1985', 'Americas', 'Canada', 'Spirits', '3.21'],
       ['1985', 'Americas', 'Canada', 'Other', ''],
       ['1986', 'Americas', 'Canada', 'Beer', '4.87'],
       ['1984', 'Americas', 'Canada', 'Wine', '1.24'],
       ['1986', 'Europe', 'France', 'Wine', '10.62'],
       ['1989', 'Americas', 'Canada', 'Spirits', '2.91'],
       ['1984', 'Americas', 'Canada', 'Other', ''],
       ['1985'

In [71]:
# Two-step alternative: restrict to the 1986 rows first, then select the two countries.
data86 = data[data[:,0] == '1986']
data86[(data86[:,2]=='Canada') | (data86[:,2]=='France')]

array([['1986', 'Europe', 'France', 'Other', '0.25'],
       ['1986', 'Americas', 'Canada', 'Other', ''],
       ['1986', 'Europe', 'France', 'Spirits', '2.71'],
       ['1986', 'Europe', 'France', 'Beer', '2.55'],
       ['1986', 'Americas', 'Canada', 'Spirits', '3.11'],
       ['1986', 'Americas', 'Canada', 'Beer', '4.87'],
       ['1986', 'Europe', 'France', 'Wine', '10.62'],
       ['1986', 'Americas', 'Canada', 'Wine', '1.33']], dtype='<U75')

In [70]:
# The extra parentheses around the OR make it evaluate before the &:
#   (Canada | France) & 1986
data[((data[:,2]=='Canada') | (data[:,2]=='France')) & (data[:,0] == '1986')]

array([['1986', 'Europe', 'France', 'Other', '0.25'],
       ['1986', 'Americas', 'Canada', 'Other', ''],
       ['1986', 'Europe', 'France', 'Spirits', '2.71'],
       ['1986', 'Europe', 'France', 'Beer', '2.55'],
       ['1986', 'Americas', 'Canada', 'Spirits', '3.11'],
       ['1986', 'Americas', 'Canada', 'Beer', '4.87'],
       ['1986', 'Europe', 'France', 'Wine', '10.62'],
       ['1986', 'Americas', 'Canada', 'Wine', '1.33']], dtype='<U75')

### Remplacer des valeurs

In [72]:
# Work on a copy so that the replacement does not modify `data`.
data2=data.copy()
# Row mask on column 0 combined with column index 0: overwrite the year '1986' with '2018'.
data2[data2[:,0]=='1986',0]='2018'
data2

array([['2018', 'Western Pacific', 'Viet Nam', 'Wine', '0'],
       ['2018', 'Americas', 'Uruguay', 'Other', '0.5'],
       ['1985', 'Africa', "Cte d'Ivoire", 'Wine', '1.62'],
       ...,
       ['2018', 'Europe', 'Switzerland', 'Spirits', '2.54'],
       ['1987', 'Western Pacific', 'Papua New Guinea', 'Other', '0'],
       ['2018', 'Africa', 'Swaziland', 'Other', '5.15']], dtype='<U75')

### Convertir des types de données

In [73]:
# To compute statistics on the consumption column it must be numeric; for now
# it holds strings (a consequence of dtype="U75").

# Empty fields cannot be converted to float, so they are replaced by '0' first.
# NOTE(review): missing values are therefore counted as 0 in any later mean.
mon_filtre_valeurs_vide = (data[:,4] == '')
# Select rows and column in a single indexing step. The chained form
# data[:,4][mask] = ... only writes through because the first index happens to
# return a view; it would silently do nothing if that index produced a copy.
data[mon_filtre_valeurs_vide, 4] = '0'
# .astype(float) returns a new float array built from the string column.
consommation = data[:,4].astype(float)
consommation

array([0.  , 0.5 , 1.62, ..., 2.54, 0.  , 5.15])

### Calculs avec Numpy

In [74]:
### sum(), mean(), max(), ...
vector

array([ 2,  4,  9, 17])

In [76]:
# Python's built-in sum() iterates over the array; on this 1-D array it gives
# the same total as the array's own .sum() method.
sum(vector)

32

In [75]:
vector.sum()

32

In [77]:
vector.mean()

8.0

In [78]:
vector.max()

17

In [79]:
vector.min()

2

In [80]:
matrix

array([[ 1,  1, 18],
       [23,  4, 90],
       [17, 11, 87],
       [18, 12, 88]])

In [81]:
# Sum of a single column: slice column 2, then sum it.
matrix[:,2].sum()

283

In [82]:
# axis=0 sums down the rows: one total per column.
matrix.sum(axis=0)

array([ 59,  28, 283])

In [83]:
# axis=1 sums across the columns: one total per row.
matrix.sum(axis=1)

array([ 20, 117, 115, 118])

In [84]:
# Compute every column total, then pick the one for column 2.
matrix.sum(axis=0)[2]

283

In [85]:
# Per-row totals (axis=1) followed by per-column totals (axis=0).
print(matrix.sum(axis=1),' et ', matrix.sum(axis=0))

[ 20 117 115 118]  et  [ 59  28 283]


In [86]:
# Per-row maxima (axis=1) followed by per-column maxima (axis=0).
print(matrix.max(axis=1),' et ', matrix.max(axis=0))

[18 90 87 88]  et  [23 12 90]


In [87]:
# Per-row means (axis=1) followed by per-column means (axis=0).
print(matrix.mean(axis=1),' et ', matrix.mean(axis=0))

[ 6.66666667 39.         38.33333333 39.33333333]  et  [14.75  7.   70.75]


## Exo : comparer les consommations annuelles d'alcool par pays et déterminer le pays qui a la plus grande consommation d'alcool