# Descriptive Statistics

In [1]:
import pandas as pd

# Load the wine dataset (relative path) into the DataFrame used by every cell below
df = pd.read_csv(filepath_or_buffer='../data/wine.csv')

In [2]:
# Preview the first five records of the loaded data
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,5.9,0.4451,0.1813,2.049401,0.070574,16.593818,42.27,0.9982,3.27,0.71,8.64,7
1,8.4,0.5768,0.2099,3.10959,0.101681,22.555519,16.01,0.996,3.35,0.57,10.03,8
2,7.54,0.5918,0.3248,3.673744,0.072416,9.316866,35.52,0.999,3.31,0.64,9.23,8
3,5.39,0.4201,0.3131,3.371815,0.072755,18.2123,41.97,0.9945,3.34,0.55,14.07,9
4,6.51,0.5675,0.194,4.404723,0.066379,9.360591,46.27,0.9925,3.27,0.45,11.49,8


In [3]:
# Summarise the frame's structure: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1000 non-null   float64
 1   volatile acidity      1000 non-null   float64
 2   citric acid           1000 non-null   float64
 3   residual sugar        1000 non-null   float64
 4   chlorides             1000 non-null   float64
 5   free sulfur dioxide   1000 non-null   float64
 6   total sulfur dioxide  1000 non-null   float64
 7   density               1000 non-null   float64
 8   pH                    1000 non-null   float64
 9   sulphates             1000 non-null   float64
 10  alcohol               1000 non-null   float64
 11  quality               1000 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 93.9 KB


In [4]:
# Description of the data: mean, std, min, max, quartiles, etc
desc = df.describe()

# Drop the redundant 'count' row (describe() reports the statistics as rows)
desc = desc.drop(labels='count')

# Create a dataframe containing the modes of each column.
# Each column lists its mode(s) in sorted order, padded with NaN up to the
# length of the column that has the most modes.
mode = df.mode()

# Add a 'mode' row. A column has no mode when all of its distinct values appear
# the same number of times (e.g. every value is unique); it then stays NaN.
#
# The value frequencies are checked per column instead of looking for NaN
# padding in `mode`: the padding only exists when some *other* column has more
# modes, so that test records no mode at all when every column has the same
# number of modes, and reports a bogus mode for a column without one whenever
# another column has more tied values.
desc.loc['mode'] = float('nan')  # always create the row, even if no column has a mode
for col_name, col_data in df[desc.columns].items():
    counts = col_data.value_counts()  # NaN excluded; most frequent value first
    if counts.empty:  # nothing but missing values: no mode to report
        continue
    # A single distinct value is trivially the mode; otherwise require that the
    # most frequent value occurs strictly more often than the least frequent one.
    if len(counts) == 1 or counts.iloc[0] > counts.iloc[-1]:
        desc.loc['mode', col_name] = mode[col_name].iloc[0]  # smallest mode on ties

# Add more descriptive statistics as rows
desc.loc['range'] = desc.loc['max'] - desc.loc['min']
desc.loc['var'] = df.var()
desc.loc['iqr'] = desc.loc['75%'] - desc.loc['25%']
desc.loc['skew'] = df.skew()
desc.loc['kurt'] = df.kurtosis()

# Show the data frame
desc

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
mean,7.15253,0.520839,0.270517,2.567104,0.081195,14.907679,40.29015,0.995925,3.30361,0.59839,10.59228,7.958
std,1.201598,0.095848,0.049098,0.987915,0.020111,4.8881,9.965767,0.00202,0.104875,0.100819,1.510706,0.902802
min,3.32,0.1399,0.1167,0.032555,0.015122,0.194679,3.15,0.9888,2.97,0.29,6.03,5.0
25%,6.3775,0.4561,0.2378,1.89633,0.066574,11.426717,33.785,0.9946,3.23,0.53,9.56,7.0
50%,7.15,0.52485,0.2722,2.51943,0.082167,14.860346,40.19,0.996,3.3,0.595,10.61,8.0
75%,8.0,0.585375,0.302325,3.220873,0.095312,18.313098,47.0225,0.9972,3.37,0.67,11.6225,9.0
max,11.49,0.8051,0.4096,5.550755,0.140758,27.462525,69.96,1.0026,3.71,0.96,15.02,10.0
mode,6.54,0.5546,0.3019,,,,35.2,0.9959,3.34,0.59,9.86,8.0
range,8.17,0.6652,0.2929,5.5182,0.125635,27.267847,66.81,0.0138,0.74,0.67,8.99,5.0
var,1.443837,0.009187,0.002411,0.975977,0.000404,23.893519,99.316519,4e-06,0.010999,0.010164,2.282233,0.815051


`mean`: mean\
`std`: standard deviation\
`min`: minimum value\
`25%`: first quartile\
`50%`: second quartile (median)\
`75%`: third quartile\
`max`: maximum value\
`mode`: mode (`NaN` if the column has no mode, i.e. all of its values appear the same number of times).
If a column has multiple modes, only the smallest one is shown\
`range`: range\
`var`: variance\
`iqr`: interquartile range (IQR)\
`skew`: skewness\
`kurt`: excess kurtosis (Fisher's definition, under which a normal distribution has kurtosis 0)