# BASIC EDA - "RED WINE QUALITY" DATASET

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
# NOTE(review): with no category or module filter, this suppresses every warning
# raised after this point (deprecation notices included) — consider narrowing it.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Pandas display options: rows shown in a truncated repr, the row count above
# which a frame is truncated, and the number of decimals printed for floats.
pd.options.display.min_rows = 5
pd.options.display.max_rows = 25
pd.options.display.precision = 4

## 2. Import Dataset

In [3]:
# Read the CSV into `data` and leave that frame untouched; the cells below
# operate on `df`, a deep copy that can be modified without affecting `data`.
data = pd.read_csv(filepath_or_buffer='winequality_red.csv')
df = data.copy(deep=True)

## 3. Basic EDA

### 3.1 Sample of the Dataset

In [4]:
# Preview five randomly chosen rows. The fixed seed makes the preview identical
# on every Restart & Run All instead of changing with each execution.
df.sample(5, random_state=42)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
884,8.8,0.61,0.19,4.0,0.094,30.0,69.0,0.9979,3.22,0.5,10.0,6
1508,7.1,0.27,0.6,2.1,0.074,17.0,25.0,0.9981,3.38,0.72,10.6,6
451,8.4,0.37,0.53,1.8,0.413,9.0,26.0,0.9979,3.06,1.06,9.1,6
1134,8.5,0.28,0.35,1.7,0.061,6.0,15.0,0.9952,3.3,0.74,11.8,7
360,8.2,0.7,0.23,2.0,0.099,14.0,81.0,0.9973,3.19,0.7,9.4,5


### 3.2 Shape of the Dataset

In [5]:
# Unpack the (rows, columns) shape tuple once and report both dimensions.
n_rows, n_cols = df.shape
print(f'No. of Observations (rows) : {n_rows}')
print(f'No. of Features (columns)  : {n_cols}')

No. of Observations (rows) : 1599
No. of Features (columns)  : 12


### 3.3 Dataset Information

In [6]:
# Print the index range, each column's non-null count and dtype, and the
# frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


### 3.4 Checking for Missing Values in the Dataset

In [7]:
# Per-feature missing-value table: absolute count and share of all rows (in %).
# Naming the index and the value column explicitly avoids depending on the
# auto-generated 'index' / 0 labels that a bare reset_index() produces.
miss_cnt = (
    df.isna()
    .sum()
    .rename_axis('Features')
    .reset_index(name='Missing Count')
)
# Vectorised column arithmetic replaces the per-element apply/lambda; the
# resulting percentages are the same.
miss_cnt['Missing PCT'] = miss_cnt['Missing Count'] / len(df) * 100
miss_cnt

Unnamed: 0,Features,Missing Count,Missing PCT
0,fixed acidity,0,0.0
1,volatile acidity,0,0.0
2,citric acid,0,0.0
3,residual sugar,0,0.0
4,chlorides,0,0.0
5,free sulfur dioxide,0,0.0
6,total sulfur dioxide,0,0.0
7,density,0,0.0
8,pH,0,0.0
9,sulphates,0,0.0


### 3.5 Checking Unique Values for Individual Features

In [8]:
# Per-feature cardinality table: number of distinct values and that number as
# a percentage of the row count.
# Naming the index and the value column explicitly avoids depending on the
# auto-generated 'index' / 0 labels that a bare reset_index() produces.
unq_cnt = (
    df.nunique()
    .rename_axis('Features')
    .reset_index(name='Unique Values')
)
# Vectorised column arithmetic replaces the per-element apply/lambda; the
# resulting percentages are the same.
unq_cnt['Unique Values PCT'] = unq_cnt['Unique Values'] / len(df) * 100
unq_cnt

Unnamed: 0,Features,Unique Values,Unique Values PCT
0,fixed acidity,96,6.0038
1,volatile acidity,143,8.9431
2,citric acid,80,5.0031
3,residual sugar,91,5.6911
4,chlorides,153,9.5685
5,free sulfur dioxide,60,3.7523
6,total sulfur dioxide,144,9.0056
7,density,436,27.267
8,pH,89,5.566
9,sulphates,96,6.0038


### 3.6 Checking for Duplicate Records

In [9]:
# Count the rows that repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate).
dup_total = df.duplicated().sum()
print(f'Total No. of Duplicated Observations in the Dataset : {dup_total}')

Total No. of Duplicated Observations in the Dataset : 240


### 3.7 Generating Statistical Summary on Numerical Columns

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.3196,0.5278,0.271,2.5388,0.0875,15.8749,46.4678,0.9967,3.3111,0.6581,10.423,5.636
std,1.7411,0.1791,0.1948,1.4099,0.0471,10.4602,32.8953,0.0019,0.1544,0.1695,1.0657,0.8076
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.9901,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.9968,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.9978,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.0037,4.01,2.0,14.9,8.0


### 3.8 Features Specific to a Datatype

In [11]:
# Fetch the per-column dtypes once, then report how many distinct dtypes occur
# and which ones they are.
col_dtypes = df.dtypes
print(f'Number of Unique Data Types : {col_dtypes.nunique()}')
print(f'Unique Data Types : {col_dtypes.unique().tolist()}')

Number of Unique Data Types : 2
Unique Data Types : [dtype('float64'), dtype('int64')]


In [12]:
# Split the column names by dtype and print each group together with its size.
fea_flo = df.select_dtypes('float64').columns.tolist()
fea_int = df.select_dtypes('int64').columns.tolist()

print(f'Float Type Features    : {len(fea_flo)} \n{fea_flo} \n\n')
print(f'Integer Type Features  : {len(fea_int)} \n{fea_int}')

Float Type Features    : 11 
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'] 


Integer Type Features  : 1 
['quality']
