### Creating a Machine Learning model to determine the quality of the wine

#### Importing libraries

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

#### Reading the dataset

In [None]:
df = pd.read_csv('winequality-white.csv', sep=';')

In [None]:
df.head(10)

In [None]:
df['color'] = 'white'

In [None]:
df.head(10)

#### Description of the dataset

In [None]:
df.describe().transpose()

#### Retrieving information about the dataset

In [None]:
df.info()

#### Unique colors in the dataset

In [None]:
df['color'].unique()

In [None]:
df.count()

In [None]:
df['quality'].unique()

#### Checking if there is any missing value in the dataset

In [None]:
df.isnull().sum()

*We can see that there are no missing values in the dataset.*

#### Defining function to create boxplots and distplots

In [None]:
from scipy.stats import skew

In [None]:

def create_boxplots_distplots(dataset):
    """Draw a 2x2 diagnostic figure for every column of ``dataset`` except 'color'.

    Panels per column: overall boxplot, boxplot grouped by the 'color' column,
    histogram, and a density histogram with a KDE overlay.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a 'color' column, which is used as the grouping variable
        in the second panel. Every other column is plotted.
    """
    features = [feature for feature in dataset.columns if feature != 'color']
    for feature in features:
        fig, axes = plt.subplots(2, 2, figsize=(10, 7))

        dataset.boxplot(column=feature, ax=axes[0, 0])
        axes[0, 0].set_ylabel(feature)
        axes[0, 0].set_title(feature)

        sns.boxplot(x=dataset['color'], y=dataset[feature], ax=axes[0, 1])

        axes[1, 0].hist(dataset[feature])
        axes[1, 0].set_xlabel(feature)

        # sns.distplot is deprecated in seaborn; histplot with a KDE overlay on
        # a density scale is the documented replacement.
        sns.histplot(dataset[feature].dropna(), kde=True, stat='density', ax=axes[1, 1])

        plt.show()
        # Release the figure so looping over many columns does not pile up open figures.
        plt.close(fig)


create_boxplots_distplots(df)


We can see from the boxplots that there are outliers, so we need to handle them.

#### Defining function to handle outliers

*Using 3xIQR for features that have extreme outliers and high skewness*

In [None]:
def remove_extreme_outliers(dataset, list_of_features):
    """Cap values lying beyond the 3xIQR fences of each listed column, in place.

    Despite the name, no rows are removed: values at or above the upper fence
    (Q3 + 3*IQR) are set to that fence, and values at or below the lower fence
    (Q1 - 3*IQR) are set to the lower fence. The lower cap is applied only when
    the lower fence is non-negative. A column named 'color' is skipped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that is modified in place.
    list_of_features : iterable of str
        Column names to process.

    Returns
    -------
    None
    """
    for column in list_of_features:
        if column == 'color':
            continue

        first_quartile = dataset[column].quantile(0.25)
        third_quartile = dataset[column].quantile(0.75)
        spread = third_quartile - first_quartile
        lower_fence = first_quartile - spread * 3
        upper_fence = third_quartile + spread * 3

        too_high = dataset[column] >= upper_fence
        dataset.loc[too_high, column] = upper_fence

        # A negative lower fence is never applied.
        if lower_fence >= 0:
            too_low = dataset[column] <= lower_fence
            dataset.loc[too_low, column] = lower_fence


            
# Columns treated with the wide 3xIQR fences.
features = [
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'density',
    'sulphates',
    'alcohol',
]
remove_extreme_outliers(df, features)

*Using 1.5xIQR for features that have less extreme outliers and low skewness*

In [None]:
def remove_outliers(dataset, list_of_features):
    """Cap values lying beyond the 1.5xIQR fences of each listed column, in place.

    Despite the name, no rows are removed: values at or above the upper fence
    (Q3 + 1.5*IQR) are set to that fence, and values at or below the lower
    fence (Q1 - 1.5*IQR) are set to the lower fence. The lower cap is applied
    only when the lower fence is non-negative. A column named 'color' is skipped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that is modified in place.
    list_of_features : iterable of str
        Column names to process.

    Returns
    -------
    None
    """
    for feature in list_of_features:
        if feature == 'color':
            continue

        # Both quartiles must come from `dataset`. Reading Q1 from the global
        # `df` mixed two different frames whenever another frame was passed in.
        q1 = dataset[feature].quantile(0.25)
        q3 = dataset[feature].quantile(0.75)
        IQR = q3 - q1
        lower_bridge = q1 - IQR * 1.5
        upper_bridge = q3 + IQR * 1.5

        dataset.loc[dataset[feature] >= upper_bridge, feature] = upper_bridge
        # A negative lower fence is never applied.
        if lower_bridge >= 0:
            dataset.loc[dataset[feature] <= lower_bridge, feature] = lower_bridge


# Columns treated with the narrower 1.5xIQR fences.
features = [
    'fixed acidity',
    'volatile acidity',
    'total sulfur dioxide',
    'pH',
]
remove_outliers(df, features)

In [None]:
create_boxplots_distplots(df)

In [None]:
df.head(15)

#### Plotting a count of number of samples for respective qualities

In [None]:
# Pass the column through the `x` keyword: recent seaborn releases no longer
# accept the plotted variable as the first positional argument.
sns.countplot(x='quality', data=df)

In [None]:
df['quality'].value_counts()

#### Pie chart showing distribution of qualities

In [None]:
# Take wedge sizes and labels from the same Series so they always line up.
# unique() returns values in order of appearance, whereas value_counts() sorts
# by frequency, so pairing the two mislabels the wedges.
quality_counts = df['quality'].value_counts()

# Pull the rarest class out of the pie (value_counts sorts descending, so it is
# last). Sized from the data instead of assuming exactly seven classes.
explode = [0.0] * len(quality_counts)
if explode:
    explode[-1] = 0.7

plt.figure(figsize=(15,10))
plt.pie(quality_counts, labels=quality_counts.index, autopct='%0.2f%%', explode=explode)
plt.show()

#### Determining correlation between the features of the dataset

In [None]:
# `df` still contains the string column 'color'; restrict to numeric columns so
# corr() does not fail on pandas versions that no longer drop them silently.
df.select_dtypes(include='number').corr()

#### Determining correlation between the features of the dataset using a heatmap

In [None]:
plt.figure(figsize=(20,20))
# Correlate numeric columns only: the string column 'color' makes corr() raise
# on pandas versions that no longer drop non-numeric columns silently.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap="RdYlGn")

**In the above heatmap, if any two independent features are highly correlated (a Pearson correlation coefficient of 0.8 or more), we can drop one of them because both carry largely the same information. Here, 'density' and 'residual sugar' have a Pearson correlation coefficient of 0.83, so we can drop one of these two features.**

In [None]:
df = df.drop(columns=['residual sugar'])

In [None]:
df.head()

#### Determining the data types for each column of the dataset

In [None]:
df.dtypes

In [None]:
#b = sns.PairGrid(df)
#b.map(plt.scatter)

In [None]:
df.head(20)

In [None]:
df.shape

#### Identifying and removing duplicate rows in the dataset

In [None]:
df.duplicated(['fixed acidity'])

In [None]:
# Show rows that repeat an earlier row on all ten listed measurement columns
# ('quality' and 'color' are not part of the comparison).
df[df.duplicated(subset=[
    'fixed acidity', 'volatile acidity', 'citric acid', 'chlorides',
    'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
    'sulphates', 'alcohol',
])]

In [None]:
# Keep only the first occurrence of each combination of the ten listed
# measurement columns ('quality' and 'color' are not part of the comparison).
df = df.drop_duplicates(
    subset=[
        'fixed acidity', 'volatile acidity', 'citric acid', 'chlorides',
        'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
        'sulphates', 'alcohol',
    ],
    keep='first',
)


In [None]:
df.count()

In [None]:
df.shape

#### Determining the count of unique values in each column of the dataset

In [None]:
df.nunique()

#### Plotting a scatterplot matrix of the independent features

In [None]:
#pd.plotting.scatter_matrix(df, alpha=0.1, figsize=(20,20), diagonal='hist', grid=True, )

In [None]:
'''
def create_relplot(dataset):
    features = [feature for feature in dataset.columns if feature != 'color']
    for feature in features:
        for bivariate_feature in features:
            if bivariate_feature != feature:
                plt.figure(figsize=(10,10))
                sns.relplot(x=feature, y=bivariate_feature, data=dataset, hue='quality')
                plt.show()


create_relplot(df)
'''

In [None]:
#sns.pairplot(df, hue='quality')

#### Removing multivariate outliers

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['fixed acidity'] > 8.5)]

In [None]:
# Discard rows that are jointly extreme: free sulfur dioxide above 100 together
# with fixed acidity above 8.5.
df = df.drop(
    index=df.loc[(df['free sulfur dioxide'] > 100) & (df['fixed acidity'] > 8.5)].index
)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['fixed acidity'] > 8.5)]

In [None]:
df.shape

In [None]:
df[(df['chlorides'] > 0.08) & (df['fixed acidity'] > 8.5)]

In [None]:
# Discard rows that are jointly extreme: chlorides above 0.08 together with
# fixed acidity above 8.5.
df = df.drop(
    index=df.loc[(df['chlorides'] > 0.08) & (df['fixed acidity'] > 8.5)].index
)

In [None]:
df[(df['chlorides'] > 0.08) & (df['fixed acidity'] > 8.5)]

In [None]:
df.shape

In [None]:
df[(df['density'] > 1.005)]

In [None]:
# Discard rows whose density exceeds 1.005.
df = df.drop(index=df.loc[df['density'] > 1.005].index)

In [None]:
df[(df['density'] > 1.005)]

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['volatile acidity'] > 0.45)]

In [None]:
# Discard rows that are jointly extreme: free sulfur dioxide above 100 together
# with volatile acidity above 0.45.
df = df.drop(
    index=df.loc[(df['free sulfur dioxide'] > 100) & (df['volatile acidity'] > 0.45)].index
)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['volatile acidity'] > 0.45)]

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['total sulfur dioxide'] > 225)]

In [None]:
# Discard rows that are jointly extreme: free sulfur dioxide above 100 together
# with total sulfur dioxide above 225.
df = df.drop(
    index=df.loc[(df['free sulfur dioxide'] > 100) & (df['total sulfur dioxide'] > 225)].index
)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['total sulfur dioxide'] > 225)]

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['pH'] > 3.4)]

In [None]:
# Discard rows that are jointly extreme: free sulfur dioxide above 100 together
# with pH above 3.4.
df = df.drop(
    index=df.loc[(df['free sulfur dioxide'] > 100) & (df['pH'] > 3.4)].index
)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['pH'] > 3.4)]

In [None]:
df.shape

In [None]:
df[(df['alcohol'] > 13) & (df['total sulfur dioxide'] > 250)]

In [None]:
# Discard rows that are jointly extreme: alcohol above 13 together with total
# sulfur dioxide above 250.
df = df.drop(
    index=df.loc[(df['alcohol'] > 13) & (df['total sulfur dioxide'] > 250)].index
)

In [None]:
df[(df['alcohol'] > 13) & (df['total sulfur dioxide'] > 250)]

In [None]:
df.shape

In [None]:
#sns.pairplot(df, hue='quality')

#### Checking for skewness

In [None]:
#create_boxplots_distplots(df)

**Skewness scores to determine skewness in the column**

In [None]:
#for feature in df:
#    print(feature + ": \t")
#    print(skew(df[feature]))
#    print("\n")

From the distplots and skewness scores, it can be observed that the feature columns **free sulfur dioxide**, **sulphates**, **chlorides**, **citric acid** and **alcohol** are considerably positively skewed. From the heatmap, it can be seen that **free sulfur dioxide**, **sulphates**, **chlorides** and **citric acid** have a weak correlation with **quality**, whereas **alcohol** has a moderate correlation with **quality**. So we will remove skewness from **free sulfur dioxide**, **sulphates**, **chlorides** and **citric acid** by applying a square-root transformation.

In [None]:
#df['free sulfur dioxide'] = np.sqrt(df['free sulfur dioxide'])
#print(skew(df['free sulfur dioxide']))
#sns.distplot(df['free sulfur dioxide'].dropna())

## Reading the dataset of red wines

In [None]:
df1 = pd.read_csv('winequality-red.csv', sep=';')

In [None]:
df1.head(10)

In [None]:
df1['color'] = 'red'

In [None]:
df1.head(10)

In [None]:
df1.describe().transpose()

In [None]:
df1.info()

In [None]:
df1['color'].unique()

In [None]:
df1.count()

In [None]:
df1['quality'].unique()

In [None]:
df1.isnull().sum()

In [None]:
create_boxplots_distplots(df1)

In [None]:
# Red-wine columns treated with the wide 3xIQR fences.
# NOTE(review): 'quality' (used later as the target) is in this list — confirm
# that capping it is intended.
features = [
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]
remove_extreme_outliers(df1, features)

In [None]:
# Red-wine columns treated with the narrower 1.5xIQR fences.
features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'density',
]
remove_outliers(df1, features)

In [None]:
create_boxplots_distplots(df1)

In [None]:
df1.head(10)

In [None]:
# Pass the column through the `x` keyword: recent seaborn releases no longer
# accept the plotted variable as the first positional argument.
sns.countplot(x='quality', data=df1)

In [None]:
df1['quality'].value_counts()

In [None]:
# Stack the white-wine frame (`df`) on top of the red-wine frame (`df1`).
# NOTE(review): 'residual sugar' was dropped from `df` earlier but is still
# present in `df1`, so the white-wine rows get NaN in that column here. Both
# frames also keep their own row labels, so labels may repeat in `df2` —
# confirm both effects are intended.
df2 = pd.concat([df, df1])

In [None]:
df2

#### Dropping columns from the dataframe

In [None]:
# Set the target ('quality') and the 'color' label aside, leaving only the
# remaining feature columns in `df`.
q, c = df['quality'], df['color']
df = df.drop(columns=['quality', 'color'])

In [None]:
q

In [None]:
c

#### Scaling the dataframe

In [None]:
# Min-max scale every column that is left in `df`.
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler()
# The result is indexed positionally below (df_scaled[:, i]), i.e. it is used as
# a plain 2-D array: column names and the row index of `df` are not carried over.
df_scaled = scaling.fit_transform(df)
df_scaled

In [None]:
# Rebuild a labelled frame from the scaled array. Column names are taken from
# `df` itself rather than hard-coded: 'residual sugar' was dropped earlier, so
# the scaled array has one column fewer than the original eleven, and fixed
# positional indices would both mislabel columns and run past the last one.
# Reusing `df.index` keeps the rows aligned with the saved `c` and `q` Series.
df_white = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
df_white['color'] = c
df_white['quality'] = q

In [None]:
df_white.head(20)

In [None]:
# 'quality' and 'color' were already split off from `df` in an earlier cell, so
# repeating the split unconditionally raises KeyError. Only redo it when the
# columns are still present, which keeps this cell safe under Restart & Run All
# and when it is re-run.
if {'quality', 'color'}.issubset(df.columns):
    q = df['quality']
    c = df['color']
    df = df.drop(columns=['quality', 'color'])

In [None]:
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): `X` and `y` are not assigned in any cell above, so this line
# raises NameError on a fresh kernel. Define them before splitting — presumably
# the feature columns (including 'color', which is one-hot encoded further down)
# and the 'quality' target; confirm which frame they should come from.
# Hold out 30% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
X_train.describe()

In [None]:
X_test.describe()

In [None]:
X_train.corr()

In [None]:
pd.plotting.scatter_matrix(X_train, alpha=0.1, figsize=(20,20), diagonal='hist')

In [None]:
X_train.nunique()

In [None]:
X_train.head(10)

In [None]:
X_train = pd.get_dummies(X_train, drop_first=True)