# <font color=red> Importation des outils et des données

In [1]:
# Importation des librairies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# Données se trouvant dans Seaborn
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [3]:
# Importation du jeu de données 'tips'
df = sns.load_dataset('tips')
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Structure des données: tip signifie pourboir
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


# <font color=red> Paramètres statistiques usuels

In [5]:
# Pourboire moyen 
df['tip'].mean()


2.9982786885245902

In [6]:
# Pourboire médian
df['tip'].median()


2.9

In [7]:
# Pourboire minimum
df['tip'].min()


1.0

In [8]:
# Pourboire maximum
df['tip'].max()


10.0

In [9]:
# Distribution de fréquence de la variable 'tip'
df['tip'].value_counts()


2.00    33
3.00    23
4.00    12
5.00    10
2.50    10
        ..
2.83     1
1.58     1
3.71     1
3.35     1
2.18     1
Name: tip, Length: 123, dtype: int64

In [10]:
# Distribution de fréquence de la variable 'time'
df['time'].value_counts()


Dinner    176
Lunch      68
Name: time, dtype: int64

In [11]:
# Distribution de fréquence de la variable 'time' (en %)
df['time'].value_counts(normalize=True)*100


Dinner    72.131148
Lunch     27.868852
Name: time, dtype: float64

In [12]:
# Etendue de la variable 'tip'
df['tip'].max()-df['tip'].min()


9.0

In [13]:
# Variance de la variable 'tip'
df['tip'].var()

1.9144546380624725

In [14]:
# Ecart-type de la variable 'tip'
df['tip'].std()


1.3836381890011826

In [15]:
# Vérifions que l'écart-type est bel et bien la racine caréée de la variance
df['tip'].std()==(df['tip'].var())**0.5

True

In [16]:
# Table de statistiques descriptives

df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [17]:
# Quartiles de la variable 'total_bill' : méthode .quantile()
print("1er quartile :", df['total_bill'].quantile(0.25))
print("2eme quartile :", df['total_bill'].quantile(0.5))
print("3eme quartile :", df['total_bill'].quantile(0.75))

1er quartile : 13.3475
2eme quartile : 17.795
3eme quartile : 24.127499999999998


In [18]:
# Quartiles de la variable 'total_bill' : fonction quantile() de numpy
print("1er quartile :", np.quantile(df['total_bill'], 0.25))
print("2eme quartile :", np.quantile(df['total_bill'], 0.5))
print("3eme quartile :", np.quantile(df['total_bill'], 0.75))

1er quartile : 13.3475
2eme quartile : 17.795
3eme quartile : 24.127499999999998


In [19]:
# Quartile de la variable 'total_bill' : fonction percentile() de numpy
print("1er quartile :", np.percentile(df['total_bill'], 25))
print("2eme quartile :", np.percentile(df['total_bill'], 50))
print("3eme quartile :", np.percentile(df['total_bill'], 75))


1er quartile : 13.3475
2eme quartile : 17.795
3eme quartile : 24.127499999999998


In [20]:
# Tous les quartiles en une seule ligne de code
np.percentile(df['total_bill'], [25, 50, 75])


array([13.3475, 17.795 , 24.1275])

In [21]:
np.quantile(df['total_bill'], [0.25, 0.5, 0.75])

array([13.3475, 17.795 , 24.1275])

In [22]:
df['total_bill'].quantile([0.25, 0.5, 0.75])

0.25    13.3475
0.50    17.7950
0.75    24.1275
Name: total_bill, dtype: float64

In [23]:
# On peut aussi calculer des quintiles

df['total_bill'].quantile([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

0.2    12.636
0.3    14.249
0.4    16.222
0.5    17.795
0.6    19.818
0.7    22.508
0.8    26.098
0.9    32.235
Name: total_bill, dtype: float64

In [24]:
# IQR de la variable 'total_bill'
np.quantile(df['total_bill'], 0.75)-np.quantile(df['total_bill'], 0.25)


10.779999999999998

In [25]:
# Importation du module stats
from scipy import stats

# IQR de la variable 'total_bill'

stats.iqr(df['total_bill'])

10.779999999999998

In [26]:
# Table de statistiques descriptives incluant toutes les variables
df.describe(include='all').transpose()


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
total_bill,244,,,,19.7859,8.90241,3.07,13.3475,17.795,24.1275,50.81
tip,244,,,,2.99828,1.38364,1.0,2.0,2.9,3.5625,10.0
sex,244,2.0,Male,157.0,,,,,,,
smoker,244,2.0,No,151.0,,,,,,,
day,244,4.0,Sat,87.0,,,,,,,
time,244,2.0,Dinner,176.0,,,,,,,
size,244,,,,2.56967,0.9511,1.0,2.0,2.0,3.0,6.0


In [29]:
# Dataframe des clients du samedi et du dimanche (filtration)
clients = df[(df.day=='Sun')|(df.day=='Sat')]
clients.head(5)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [31]:
# Consommation moyenne et médiane des clients du samedi et du dimanche
clients.groupby('day')['total_bill'].agg(['mean', 'median'])


Unnamed: 0_level_0,mean,median
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,,
Fri,,
Sat,20.441379,18.24
Sun,21.41,19.63


# <center>**GRAPHIQUES BASIQUES POUR DECRIRE UNE VARIABLE**