In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact,Dropdown, FloatSlider
from traitlets import directional_link
from IPython.display import display
import time
import warnings
# Silence every warning so library chatter does not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')
# Apply seaborn's default theme to all figures drawn below.
# `set_theme` is the preferred spelling; `sns.set` is only an alias for it.
sns.set_theme()

In [None]:
# Table of Contents
- [Section 1](#id-section1)
- [Section 2](#id-section2)

<a id='id-section1'></a>

## Section 1

<a id='id-section2'></a>

## Section 2

In [2]:
# Load the two semicolon-delimited wine-quality files: red first, then white.
df_red, df_white = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('winequality-red.csv', 'winequality-white.csv')
)

In [3]:
# Tag every row with the colour of the file it was loaded from, so the two
# frames remain distinguishable once they are combined.
df_red = df_red.assign(type='red')
df_white = df_white.assign(type='white')

In [4]:
# Stack the red and white samples row-wise. `ignore_index=True` gives the
# combined frame a fresh 0..n-1 index; without it both sources keep their own
# labels and the result carries duplicated index values.
df = pd.concat([df_red, df_white], axis=0, ignore_index=True)

In [5]:
# Renaming columns: capitalise every header except 'pH', which must keep its
# mixed-case spelling. A single rename with a callable mapper replaces the
# previous one-in-place-rename-per-column loop and leaves the cell idempotent.
df = df.rename(columns=lambda name: name if name == 'pH' else name.capitalize())

In [6]:
# Replace the index with a fresh 0..n-1 RangeIndex and discard the old labels
# (drop=True keeps them from being inserted as a column).
df = df.reset_index(drop=True)

In [7]:
# Searching for null values if present:
# print, for every column, the percentage of rows whose value is missing.
total_rows = df.shape[0]
missing_counts = df.isna().sum()
for column, n_missing in missing_counts.items():
    missing = n_missing / total_rows
    print(f"{column:{21}}: ==============> {missing * 100:.2f}%")



In [8]:
# Detailed analysis of correlations between data
# Columns stored as floats are the continuous measurements; collect their names.
numerical_columns = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == "float"
]
numerical_columns

['Fixed acidity',
 'Volatile acidity',
 'Citric acid',
 'Residual sugar',
 'Chlorides',
 'Free sulfur dioxide',
 'Total sulfur dioxide',
 'Density',
 'pH',
 'Sulphates',
 'Alcohol']

In [9]:
# Columns stored as objects or 64-bit integers are treated as categorical.
categorical_columns = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == "object" or dtype == "int64"
]
categorical_columns

['Quality', 'Type']

In [121]:
# Print the frame's summary: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Fixed acidity         6497 non-null   float64
 1   Volatile acidity      6497 non-null   float64
 2   Citric acid           6497 non-null   float64
 3   Residual sugar        6497 non-null   float64
 4   Chlorides             6497 non-null   float64
 5   Free sulfur dioxide   6497 non-null   float64
 6   Total sulfur dioxide  6497 non-null   float64
 7   Density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   Sulphates             6497 non-null   float64
 10  Alcohol               6497 non-null   float64
 11  Quality               6497 non-null   int64  
 12  Type                  6497 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


In [161]:
def cat_data_analysis(df):
    """Display an interactive explorer for the categorical columns of ``df``.

    The dashboard has five dropdowns (category column, a value filter for it,
    hue column, a value filter for it, and a numerical property) and two
    output panes:

    * left pane:  a count plot of the category column, split by the hue column
      when the hue filter is ``'all'``, otherwise restricted to the selected
      hue value;
    * right pane: a histogram of the selected numerical property over the same
      rows.

    The dropdown choices come from the module-level ``categorical_columns``
    and ``numerical_columns`` lists, which must be defined before the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``categorical_columns`` and
        ``numerical_columns``.

    Returns
    -------
    None
        The widgets are rendered with ``display`` as a side effect.
    """
    no_data_message = "No samples meeting the selected parameters!"

    def _options_for(column):
        """Return the distinct values of ``column`` as sorted strings plus 'all'."""
        return sorted(str(x) for x in df[column].unique()) + ['all']

    categorical_menu = widgets.Dropdown(options=categorical_columns,
                                        value=categorical_columns[0],
                                        description="Column: ")
    opt1 = widgets.Dropdown(options=_options_for(categorical_menu.value),
                            value='all',
                            description="Options: ")

    categorical_menu_hue = widgets.Dropdown(options=categorical_columns,
                                            value=categorical_columns[1],
                                            description="Hue: ")
    opt2 = widgets.Dropdown(options=_options_for(categorical_menu_hue.value),
                            value='all',
                            description="Options: ")

    property_distribution = widgets.Dropdown(options=numerical_columns,
                                             value=numerical_columns[0],
                                             description="Property: ")

    box1 = widgets.VBox([categorical_menu, opt1])
    box2 = widgets.VBox([categorical_menu_hue, opt2])
    category_box = widgets.HBox([box1, box2, property_distribution])

    out = widgets.Output()
    out2 = widgets.Output()
    cat_box_out = widgets.HBox([out, out2])

    # Set while a dependent dropdown is being repopulated. Assigning `.options`
    # and `.value` each fire that dropdown's own observer, which would redraw
    # the plots several times on a half-updated selection.
    refreshing = {'active': False}

    def _reset_options(option_widget, column):
        """Repopulate ``option_widget`` with the values of ``column``; select 'all'."""
        refreshing['active'] = True
        try:
            option_widget.options = _options_for(column)
            option_widget.value = 'all'
        finally:
            refreshing['active'] = False

    def common_filtering(change=None):
        """Redraw both panes from the current widget state.

        ``change`` is accepted so the function can be registered directly as
        an observer; the widget values are always read fresh.
        """
        if refreshing['active']:
            return

        column = categorical_menu.value
        hue = categorical_menu_hue.value
        option1 = opt1.value
        option2 = opt2.value
        selected_property = property_distribution.value

        # The options are stringified values, so compare against the string
        # form of the column.
        if option1 != 'all':
            filtered_data = df[df[column].astype('string') == option1]
        else:
            filtered_data = df

        if option2 != 'all':
            selection = filtered_data[filtered_data[hue].astype('string') == option2]
        else:
            selection = filtered_data

        with out:
            out.clear_output(wait=True)
            if selection.empty:
                # Checked explicitly rather than relying on seaborn to raise.
                print(no_data_message)
            elif option2 == 'all':
                p = sns.countplot(data=selection, x=column, hue=hue)
                # Many categories on the x axis: rotate the labels so they fit.
                if len(selection[column].unique()) > 4:
                    p.tick_params(axis='x', rotation=90)
                plt.show()
            else:
                try:
                    sns.countplot(data=selection, x=column)
                    plt.show()
                except ValueError:
                    print(no_data_message)

        with out2:
            out2.clear_output(wait=True)
            if selection.empty:
                print(no_data_message)
            elif option2 == 'all':
                sns.histplot(data=selection[selected_property])
                plt.show()
            else:
                try:
                    sns.histplot(data=selection[selected_property])
                    plt.show()
                except ValueError:
                    print(no_data_message)

    def categorical_menu_eventhandler(change):
        """Category column changed: refresh its value filter, then redraw once."""
        _reset_options(opt1, change.new)
        common_filtering()

    def categorical_menu_hue_eventhandler(change):
        """Hue column changed: refresh its value filter, then redraw once."""
        _reset_options(opt2, change.new)
        common_filtering()

    categorical_menu.observe(categorical_menu_eventhandler, names='value')
    categorical_menu_hue.observe(categorical_menu_hue_eventhandler, names='value')
    opt1.observe(common_filtering, names='value')
    opt2.observe(common_filtering, names='value')
    property_distribution.observe(common_filtering, names='value')

    display(category_box, cat_box_out)
    # Draw the initial selection so the panes are not blank until first use.
    common_filtering()

In [162]:
# Render the categorical explorer for the combined wine frame.
cat_data_analysis(df)

HBox(children=(VBox(children=(Dropdown(description='Column: ', options=('Quality', 'Type'), value='Quality'), …

HBox(children=(Output(), Output()))

In [163]:
def num_data_analysis(df):
    """Display an interactive explorer relating two numerical columns of ``df``.

    Controls: two numerical-column dropdowns, each with a range slider that
    limits the rows by value, plus a hue dropdown with a value filter.

    * left pane:  summary statistics and a joint plot of column 1 vs column 2;
    * right pane: min/max statistics and one box plot per selected column.

    When the hue filter is ``'all'`` the plots are grouped by the hue column.
    When one hue value is selected, the rows are restricted to it and the
    plots are grouped by the *other* categorical column instead.

    The dropdown choices come from the module-level ``numerical_columns`` and
    ``categorical_columns`` lists. The grouping logic hard-codes the column
    names ``'Type'`` and ``'Quality'``, and a selected ``'Quality'`` value is
    converted with ``int`` before comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``numerical_columns`` and
        ``categorical_columns``.

    Returns
    -------
    None
        The widgets are rendered with ``display`` as a side effect.
    """
    # Slack added to the slider's upper bound so rows equal to the selected
    # maximum are kept by the strict `<` comparison.
    range_tolerance = 0.01
    box_figsize = (10, 5.75)
    mean_marker = {"markeredgecolor": "yellow"}
    no_data_message = "No samples meeting the selected parameters!"

    def _options_for(column):
        """Return the distinct values of ``column`` as sorted strings plus 'all'."""
        return sorted(str(x) for x in df[column].unique()) + ['all']

    def _bounds(column):
        """Return ``(min, max)`` of ``column`` over the whole frame."""
        return df[column].min(), df[column].max()

    n1 = widgets.Dropdown(options=numerical_columns,
                          value=numerical_columns[0],
                          description="Column 1: ")
    n2 = widgets.Dropdown(options=numerical_columns,
                          value=numerical_columns[1],
                          description="Column 2: ")
    n3 = widgets.Dropdown(options=categorical_columns,
                          description="Hue: ")
    d3 = widgets.Dropdown(options=_options_for(n3.value),
                          value="all",
                          description="Options: ")

    low1, high1 = _bounds(n1.value)
    low2, high2 = _bounds(n2.value)
    s1 = widgets.FloatRangeSlider(min=low1, max=high1, value=(low1, high1), description='Value range: ')
    s2 = widgets.FloatRangeSlider(min=low2, max=high2, value=(low2, high2), description='Value range: ')

    n1_box = widgets.VBox([n1, s1])
    n2_box = widgets.VBox([n2, s2])
    n3_box = widgets.VBox([n3, d3])
    numeric_box = widgets.HBox([n1_box, n2_box, n3_box])

    out = widgets.Output()
    out2 = widgets.Output()
    num_box_out = widgets.HBox(children=[out, out2])

    # Set while a dependent widget is being reconfigured. Every assignment to
    # a slider's min/max/value or a dropdown's options/value fires that
    # widget's own observer, which would redraw on a half-updated state.
    refreshing = {'active': False}

    def draw_plot2(column1, column2, hue, dropdown3, filtered_data):
        """Print min/max of both columns and draw one box plot per column."""
        print(f"""
        {column1} min value: {filtered_data[column1].min():.2f}
        {column1} max value: {filtered_data[column1].max():.2f}
        {column2} min value: {filtered_data[column2].min():.2f}
        {column2} max value: {filtered_data[column2].max():.2f}""")

        if dropdown3 == 'all':
            group_by = hue
            title = f'Data distribution for wines of different {hue.lower()}'
        elif hue == 'Type':
            # One wine type selected: break it down by quality instead.
            group_by = 'Quality'
            title = f'Data distribution for {dropdown3} wines'
        else:
            # One quality grade selected: break it down by type instead.
            group_by = 'Type'
            title = f'Data distribution for "{dropdown3}" quality wines'

        fig, axes = plt.subplots(1, 2, figsize=box_figsize)
        fig.suptitle(title)
        for ax, column in zip(axes, (column1, column2)):
            sns.boxplot(data=filtered_data, x=group_by, y=column,
                        showmeans=True, meanprops=mean_marker, ax=ax)
        plt.show()

    def draw_plot1(column1, column2, hue, dropdown3, filtered_data):
        """Print correlation/mean statistics and draw the joint plot."""
        print(f"""
        Correlation coefficient: {filtered_data[column1].corr(filtered_data[column2]):.3f}
        Number of samples: {filtered_data[column1].count()}
        {column1} mean value: {filtered_data[column1].mean():.2f}
        {column2} mean value: {filtered_data[column2].mean():.2f}""")

        if dropdown3 == 'all':
            plot_hue, group_label = hue, hue.lower()
        elif hue == 'Quality':
            plot_hue, group_label = 'Type', 'types'
        else:
            plot_hue, group_label = 'Quality', 'quality'

        p = sns.jointplot(data=filtered_data, x=column1, y=column2, hue=plot_hue)
        p.fig.suptitle(f"Relationship between the '{column1.lower()}' and '{column2.lower()}'\n variables for wines of different {group_label}")
        p.fig.tight_layout()
        # No extra plt.figure() here: it would open a second, empty figure
        # that plt.show() renders next to the joint plot.
        plt.show()

    def common_data_filter(change=None):
        """Filter ``df`` by the current widget state and redraw both panes.

        ``change`` is accepted so the function can be registered directly as
        an observer; the widget values are always read fresh.
        """
        if refreshing['active']:
            return

        column1 = n1.value
        column2 = n2.value
        hue = n3.value
        slider1 = s1.value
        slider2 = s2.value
        dropdown3 = d3.value

        in_range = (
            (df[column1] >= slider1[0]) & (df[column1] < slider1[1] + range_tolerance)
            & (df[column2] >= slider2[0]) & (df[column2] < slider2[1] + range_tolerance)
        )
        filtered_data = df[in_range]

        if dropdown3 != 'all':
            # Dropdown options are strings; 'Quality' holds integers.
            wanted = dropdown3 if hue == 'Type' else int(dropdown3)
            filtered_data = filtered_data[filtered_data[hue] == wanted]

        for pane, draw in ((out, draw_plot1), (out2, draw_plot2)):
            with pane:
                pane.clear_output(wait=True)
                if filtered_data.empty:
                    print(no_data_message)
                else:
                    draw(column1, column2, hue, dropdown3, filtered_data)

    def _reset_slider(slider, column):
        """Re-range ``slider`` to span ``column`` and select the full range."""
        low, high = _bounds(column)
        refreshing['active'] = True
        try:
            # The slider rejects min > max at every assignment, so update the
            # bound that cannot collide with the current range first.
            if low > slider.max:
                slider.max = high
                slider.min = low
            else:
                slider.min = low
                slider.max = high
            slider.value = (low, high)
        finally:
            refreshing['active'] = False

    def _reset_options(option_widget, column):
        """Repopulate ``option_widget`` with the values of ``column``; select 'all'."""
        refreshing['active'] = True
        try:
            option_widget.options = _options_for(column)
            option_widget.value = 'all'
        finally:
            refreshing['active'] = False

    def n1_menu_eventhandler(change):
        """Column 1 changed: re-range its slider, then redraw once."""
        _reset_slider(s1, change.new)
        common_data_filter()

    def n2_menu_eventhandler(change):
        """Column 2 changed: re-range its slider, then redraw once."""
        _reset_slider(s2, change.new)
        common_data_filter()

    def n3_menu_eventhandler(change):
        """Hue column changed: refresh its value filter, then redraw once."""
        # The sliders are left untouched. The previous handler tried to reset
        # slider 1 here but raised AttributeError first, so that never ran.
        _reset_options(d3, change.new)
        common_data_filter()

    n1.observe(n1_menu_eventhandler, names='value')
    n2.observe(n2_menu_eventhandler, names='value')
    n3.observe(n3_menu_eventhandler, names='value')
    s1.observe(common_data_filter, names='value')
    s2.observe(common_data_filter, names='value')
    d3.observe(common_data_filter, names='value')

    display(numeric_box, num_box_out)
    # Draw the initial selection so the panes are not blank until first use.
    common_data_filter()

In [165]:
# Render the numerical explorer for the combined wine frame.
num_data_analysis(df)

HBox(children=(VBox(children=(Dropdown(description='Column 1: ', options=('Fixed acidity', 'Volatile acidity',…

HBox(children=(Output(), Output()))