### Creating a Machine Learning model to determine the quality of the wine

#### Importing libraries

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

#### Reading the dataset

In [None]:
df = pd.read_csv('winequality-white.csv', sep=';')

In [None]:
df.head(10)

In [None]:
df['color'] = 'white'

In [None]:
df.head(10)

#### Description of the dataset

In [None]:
df.describe().transpose()

#### Retrieving information about the dataset

In [None]:
df.info()

#### Unique colors in the dataset

In [None]:
df['color'].unique()

In [None]:
df.count()

In [None]:
df['quality'].unique()

#### Checking if there is any missing value in the dataset

In [None]:
df.isnull().sum()

*We can see that there are no missing values in the dataset.*

#### Defining function to create boxplots and distplots

In [None]:

def create_boxplots_distplots(dataset):
    """Show a 2x2 panel of diagnostic plots for every column except 'color'.

    Each figure holds, for one column: a pandas boxplot (top-left), a seaborn
    boxplot grouped by the 'color' column (top-right), a matplotlib histogram
    (bottom-left) and a seaborn distribution plot of the non-null values
    (bottom-right). One figure is displayed per column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to plot; it must contain a 'color' column, which is used for
        grouping and is itself skipped.
    """
    for column in dataset.columns:
        # 'color' is only the grouping key, never a plotted feature.
        if column == 'color':
            continue

        values = dataset[column]
        plt.figure(figsize=(10, 7))

        # Top-left: overall boxplot drawn through pandas.
        plt.subplot(2, 2, 1)
        dataset.boxplot(column=column)
        plt.ylabel(column)
        plt.title(column)

        # Top-right: the same column split by wine color.
        plt.subplot(2, 2, 2)
        sns.boxplot(x=dataset['color'], y=values)

        # Bottom-left: raw histogram.
        plt.subplot(2, 2, 3)
        plt.hist(values)
        plt.xlabel(column)

        # Bottom-right: histogram + density estimate.
        # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
        # it is kept here so the rendered output does not change.
        plt.subplot(2, 2, 4)
        sns.distplot(values.dropna())

        plt.show()
    
create_boxplots_distplots(df)

We can see from the boxplots that there are outliers, so we need to handle them.

#### Defining function to handle outliers

*Using 3xIQR for features having extreme outliers and high skewness*

In [None]:
def remove_extreme_outliers(dataset, list_of_features, multiplier=3):
    """Cap extreme values of the given columns, modifying ``dataset`` in place.

    Despite the name, no rows are removed: values at or beyond the IQR fences
    are replaced by the fence value (winsorizing).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are capped in place.
    list_of_features : iterable of str
        Column names to process; an entry named 'color' is skipped.
    multiplier : float, default 3
        Number of IQRs beyond the first/third quartile at which the fences
        are placed.

    Returns
    -------
    None
    """
    for feature in list_of_features:
        if feature == 'color':
            continue

        # Compute each quartile once instead of re-evaluating it per fence.
        q1 = dataset[feature].quantile(0.25)
        q3 = dataset[feature].quantile(0.75)
        iqr = q3 - q1
        lower_bridge = q1 - iqr * multiplier
        upper_bridge = q3 + iqr * multiplier

        dataset.loc[dataset[feature] >= upper_bridge, feature] = upper_bridge
        # The lower cap is applied only when the fence is non-negative, so
        # that no negative value is ever written into the column.
        if lower_bridge >= 0:
            dataset.loc[dataset[feature] <= lower_bridge, feature] = lower_bridge

            
features = ['citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'density', 'sulphates', 'alcohol']
remove_extreme_outliers(df, features)

*Using 1.5xIQR for features having less extreme outliers and low skewness*

In [None]:
def remove_outliers(dataset, list_of_features, multiplier=1.5):
    """Cap outlying values of the given columns, modifying ``dataset`` in place.

    Despite the name, no rows are removed: values at or beyond the IQR fences
    are replaced by the fence value (winsorizing).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are capped in place.
    list_of_features : iterable of str
        Column names to process; an entry named 'color' is skipped.
    multiplier : float, default 1.5
        Number of IQRs beyond the first/third quartile at which the fences
        are placed.

    Returns
    -------
    None
    """
    for feature in list_of_features:
        if feature == 'color':
            continue

        # Both quartiles must come from the frame being processed. Reading Q1
        # from the global white-wine frame would give the wrong IQR whenever
        # another frame (e.g. the red-wine one) is passed in.
        q1 = dataset[feature].quantile(0.25)
        q3 = dataset[feature].quantile(0.75)
        iqr = q3 - q1
        lower_bridge = q1 - iqr * multiplier
        upper_bridge = q3 + iqr * multiplier

        dataset.loc[dataset[feature] >= upper_bridge, feature] = upper_bridge
        # The lower cap is applied only when the fence is non-negative, so
        # that no negative value is ever written into the column.
        if lower_bridge >= 0:
            dataset.loc[dataset[feature] <= lower_bridge, feature] = lower_bridge


features = ['fixed acidity', 'volatile acidity', 'total sulfur dioxide', 'pH']    
remove_outliers(df, features)

In [None]:
create_boxplots_distplots(df)

In [None]:
df.head(15)

#### Plotting a count of number of samples for respective qualities

In [None]:
# Pass the column by keyword: newer seaborn releases no longer accept the
# plotted variable positionally, while the keyword form works on all versions.
sns.countplot(x='quality', data=df)

In [None]:
df['quality'].value_counts()

#### Pie chart showing distribution of qualities

In [None]:
# value_counts() is ordered by frequency, whereas unique() is ordered by first
# appearance; labels must come from the same Series as the wedge sizes or the
# wedges end up mislabelled.
quality_counts = df['quality'].value_counts()

# Pull out only the last (rarest) wedge, whatever the number of classes is.
explode = [0] * len(quality_counts)
if explode:
    explode[-1] = 0.7

plt.figure(figsize=(15,10))
plt.pie(quality_counts, labels=quality_counts.index, autopct='%0.2f%%', explode=explode)
plt.show()

#### Determining correlation between the features of the dataset

In [None]:
# Restrict to numeric columns: the string-valued 'color' column cannot be
# correlated and makes DataFrame.corr() raise on recent pandas versions.
df.select_dtypes(include='number').corr()

#### Determining correlation between the features of the dataset using a heatmap

In [None]:
plt.figure(figsize=(20,20))
# Restrict to numeric columns: the string-valued 'color' column cannot be
# correlated and makes DataFrame.corr() raise on recent pandas versions.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap="RdYlGn")

**In the above heatmap, if any two independent features are highly correlated (a Pearson correlation coefficient of 0.8 or more), we can drop one of them because both features serve the same purpose. We can see that the `density` and `residual sugar` features have a Pearson correlation coefficient of 0.83, so we could drop one of these two features. However, in the red wine dataset these two features are not strongly correlated, so we cannot drop either of them there. We will therefore retain both features in the white wine dataset as well.**

#### Determining the data types for each column of the dataset

In [None]:
df.dtypes

In [None]:
#b = sns.PairGrid(df)
#b.map(plt.scatter)

#### Identifying and removing duplicate rows in the dataset

In [None]:
df.duplicated(['fixed acidity'])

In [None]:
# Rows that repeat an earlier row on every column listed below.
duplicate_key_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
df[df.duplicated(subset=duplicate_key_columns)]

In [None]:
# Keep only the first occurrence of each combination of the columns below.
# NOTE(review): 'residual sugar' is not part of the key -- confirm that this
# omission is intentional.
duplicate_key_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
df = df[~df.duplicated(subset=duplicate_key_columns)]

In [None]:
df.count()

In [None]:
df.shape

#### Determining the count of unique values in each column of the dataset

In [None]:
df.nunique()

#### Plotting a scatterplot matrix of the independent features

In [None]:
#pd.plotting.scatter_matrix(df, alpha=0.1, figsize=(20,20), diagonal='hist', grid=True, )

In [None]:
'''
def create_relplot(dataset):
    features = [feature for feature in dataset.columns if feature != 'color']
    for feature in features:
        for bivariate_feature in features:
            if bivariate_feature != feature:
                plt.figure(figsize=(10,10))
                sns.relplot(x=feature, y=bivariate_feature, data=dataset, hue='quality')
                plt.show()


create_relplot(df)
'''

In [None]:
#sns.pairplot(df, hue='quality')

#### Removing multivariate outliers

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['fixed acidity'] > 8.5)]

In [None]:
df = df.drop(df[(df['free sulfur dioxide'] > 100) & (df['fixed acidity'] > 8.5)].index)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['fixed acidity'] > 8.5)]

In [None]:
df.shape

In [None]:
df[(df['chlorides'] > 0.08) & (df['fixed acidity'] > 8.5)]

In [None]:
df = df.drop(df[(df['chlorides'] > 0.08) & (df['fixed acidity'] > 8.5)].index)

In [None]:
df[(df['chlorides'] > 0.08) & (df['fixed acidity'] > 8.5)]

In [None]:
df.shape

In [None]:
df[(df['density'] > 1.005)]

In [None]:
df = df.drop(df[(df['density'] > 1.005)].index)

In [None]:
df[(df['density'] > 1.005)]

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['volatile acidity'] > 0.45)]

In [None]:
df = df.drop(df[(df['free sulfur dioxide'] > 100) & (df['volatile acidity'] > 0.45)].index)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['volatile acidity'] > 0.45)]

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['total sulfur dioxide'] > 225)]

In [None]:
df = df.drop(df[(df['free sulfur dioxide'] > 100) & (df['total sulfur dioxide'] > 225)].index)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['total sulfur dioxide'] > 225)]

In [None]:
df.shape

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['pH'] > 3.4)]

In [None]:
df = df.drop(df[(df['free sulfur dioxide'] > 100) & (df['pH'] > 3.4)].index)

In [None]:
df[(df['free sulfur dioxide'] > 100) & (df['pH'] > 3.4)]

In [None]:
df.shape

In [None]:
df[(df['alcohol'] > 13) & (df['total sulfur dioxide'] > 250)]

In [None]:
df = df.drop(df[(df['alcohol'] > 13) & (df['total sulfur dioxide'] > 250)].index)

In [None]:
df[(df['alcohol'] > 13) & (df['total sulfur dioxide'] > 250)]

In [None]:
df.shape

In [None]:
#sns.pairplot(df, hue='quality')

In [None]:
df.head(20)

#### Checking for skewness

In [None]:
create_boxplots_distplots(df)

**Skewness scores to determine skewness in the column**

From the distplots, it can be observed that the feature columns **free sulfur dioxide**, **sulphates**, **chlorides**, **citric acid** and **alcohol** are considerably positively skewed. From the heatmap, it can be seen that  **free sulfur dioxide**, **sulphates**, **chlorides** and **citric acid** have weak correlation with **quality** whereas **alcohol** has moderate correlation with **quality**. So we will remove skewness from **free sulfur dioxide**, **sulphates**, **chlorides** and **citric acid** by performing square root transformation

In [None]:
from scipy.stats import skew
print(skew(df['fixed acidity']))
df['fixed acidity log'] = np.log(df['fixed acidity'])
sns.distplot(df['fixed acidity log'].dropna())

In [None]:
print(skew(df['free sulfur dioxide']))
df['free sulfur dioxide sqrt'] = np.sqrt(df['free sulfur dioxide'])
sns.distplot(df['free sulfur dioxide sqrt'].dropna())

In [None]:
print(skew(df['residual sugar']))
df['residual sugar sqrt'] = np.sqrt(df['residual sugar'])
sns.distplot(df['residual sugar sqrt'].dropna())

In [None]:
print(skew(df['sulphates']))
df['sulphates log'] = np.log(df['sulphates'])
sns.distplot(df['sulphates log'].dropna())

In [None]:
print(skew(df['alcohol']))
df['alcohol sqrt'] = np.sqrt(df['alcohol'])
sns.distplot(df['alcohol sqrt'].dropna())

In [None]:
print(skew(df['total sulfur dioxide']))
df['total sulfur dioxide sqrt'] = np.sqrt(df['total sulfur dioxide'])
sns.distplot(df['total sulfur dioxide sqrt'].dropna())

In [None]:
df.head(20)

In [None]:
df.isnull().sum()

## Reading the dataset of red wines

In [None]:
df1 = pd.read_csv('winequality-red.csv', sep=';')

In [None]:
df1.head(10)

In [None]:
df1['color'] = 'red'

In [None]:
df1.head(10)

In [None]:
df1.describe().transpose()

In [None]:
df1.info()

In [None]:
df1['color'].unique()

In [None]:
df1.count()

In [None]:
df1['quality'].unique()

In [None]:
df1.isnull().sum()

In [None]:
create_boxplots_distplots(df1)

In [None]:
# 'quality' is the prediction target, not a feature, so it is left out of the
# capping list: labels must never be altered by outlier handling.
features = ['residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates', 'alcohol']
remove_extreme_outliers(df1, features)

In [None]:
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'density']
remove_outliers(df1, features)

In [None]:
create_boxplots_distplots(df1)

In [None]:
df1.head(10)

In [None]:
# Pass the column by keyword: newer seaborn releases no longer accept the
# plotted variable positionally, while the keyword form works on all versions.
sns.countplot(x='quality', data=df1)

In [None]:
df1['quality'].value_counts()

In [None]:
# value_counts() is ordered by frequency, whereas unique() is ordered by first
# appearance; labels must come from the same Series as the wedge sizes or the
# wedges end up mislabelled.
quality_counts = df1['quality'].value_counts()

# Pull out only the last (rarest) wedge, whatever the number of classes is.
explode = [0] * len(quality_counts)
if explode:
    explode[-1] = 0.7

plt.figure(figsize=(15,10))
plt.pie(quality_counts, labels=quality_counts.index, autopct='%0.2f%%', explode=explode)
plt.show()

In [None]:
# Restrict to numeric columns: the string-valued 'color' column cannot be
# correlated and makes DataFrame.corr() raise on recent pandas versions.
df1.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(20,20))
# Restrict to numeric columns: the string-valued 'color' column cannot be
# correlated and makes DataFrame.corr() raise on recent pandas versions.
sns.heatmap(df1.select_dtypes(include='number').corr(), annot=True, cmap="RdYlGn")

In [None]:
df1.dtypes

In [None]:
#b = sns.PairGrid(df1)
#b.map(plt.scatter)

In [None]:
df1.duplicated(['fixed acidity'])

In [None]:
# Rows that repeat an earlier row on every column listed below.
duplicate_key_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
df1[df1.duplicated(subset=duplicate_key_columns)]

In [None]:
# Keep only the first occurrence of each combination of the columns below.
# NOTE(review): 'residual sugar' is not part of the key -- confirm that this
# omission is intentional.
duplicate_key_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
df1 = df1[~df1.duplicated(subset=duplicate_key_columns)]

In [None]:
df1.count()

In [None]:
df1.shape

In [None]:
df1.nunique()

In [None]:
#pd.plotting.scatter_matrix(df1, alpha=0.1, figsize=(20,20), diagonal='hist', grid=True, )

In [None]:
'''
def create_relplot(dataset):
    features = [feature for feature in dataset.columns if feature != 'color']
    for feature in features:
        for bivariate_feature in features:
            if bivariate_feature != feature:
                plt.figure(figsize=(10,10))
                sns.relplot(x=feature, y=bivariate_feature, data=dataset, hue='quality')
                plt.show()


create_relplot(df1)
'''

In [None]:
#sns.pairplot(df1, hue='quality')

In [None]:
df1.shape

In [None]:
df1[(df1['free sulfur dioxide'] > 50) & (df1['fixed acidity'] > 11)]

In [None]:
df1 = df1.drop(df1[(df1['free sulfur dioxide'] > 50) & (df1['fixed acidity'] > 11)].index)

In [None]:
df1[(df1['free sulfur dioxide'] > 50) & (df1['fixed acidity'] > 11)]

In [None]:
df1.shape

In [None]:
df1[(df1['total sulfur dioxide'] > 175)]

In [None]:
df1 = df1.drop(df1[(df1['total sulfur dioxide'] > 175)].index)

In [None]:
df1[(df1['total sulfur dioxide'] > 175)]

In [None]:
df1.shape

In [None]:
df1[(df1['sulphates'] > 1.2) & (df1['fixed acidity'] > 12)]

In [None]:
df1 = df1.drop(df1[(df1['sulphates'] > 1.2) & (df1['fixed acidity'] > 12)].index)

In [None]:
df1[(df1['sulphates'] > 1.2) & (df1['fixed acidity'] > 12)]

In [None]:
df1.shape

In [None]:
df1[(df1['alcohol'] > 14) & (df1['fixed acidity'] > 12)]

In [None]:
df1 = df1.drop(df1[(df1['alcohol'] > 14) & (df1['fixed acidity'] > 12)].index)

In [None]:
df1[(df1['alcohol'] > 14) & (df1['fixed acidity'] > 12)]

In [None]:
df1.shape

In [None]:
df1[(df1['citric acid'] > 0.3) & (df1['volatile acidity'] > 1.2)]

In [None]:
df1 = df1.drop(df1[(df1['citric acid'] > 0.3) & (df1['volatile acidity'] > 1.2)].index)

In [None]:
df1[(df1['citric acid'] > 0.3) & (df1['volatile acidity'] > 1.2)]

In [None]:
df1.shape

In [None]:
df1[(df1['residual sugar'] > 4) & (df1['volatile acidity'] > 1.1)]

In [None]:
df1 = df1.drop(df1[(df1['residual sugar'] > 4) & (df1['volatile acidity'] > 1.1)].index)

In [None]:
df1[(df1['residual sugar'] > 4) & (df1['volatile acidity'] > 1.1)]

In [None]:
df1.shape

In [None]:
df1[(df1['chlorides'] > 0.12) & (df1['volatile acidity'] > 1.2)]

In [None]:
df1 = df1.drop(df1[(df1['chlorides'] > 0.12) & (df1['volatile acidity'] > 1.2)].index)

In [None]:
df1[(df1['chlorides'] > 0.12) & (df1['volatile acidity'] > 1.2)]

In [None]:
df1.shape

In [None]:
df1[(df1['free sulfur dioxide'] > 60)]

In [None]:
df1 = df1.drop(df1[(df1['free sulfur dioxide'] > 60)].index)

In [None]:
df1[(df1['free sulfur dioxide'] > 60)]

In [None]:
df1.shape

In [None]:
df1[(df1['volatile acidity'] > 1.2)]

In [None]:
df1 = df1.drop(df1[(df1['volatile acidity'] > 1.2)].index)

In [None]:
df1[(df1['volatile acidity'] > 1.2)]

In [None]:
df1.shape

In [None]:
df1[(df1['chlorides'] > 0.14) & (df1['residual sugar'] > 4.5)]

In [None]:
df1 = df1.drop(df1[(df1['chlorides'] > 0.14) & (df1['residual sugar'] > 4.5)].index)

In [None]:
df1[(df1['chlorides'] > 0.14) & (df1['residual sugar'] > 4.5)]

In [None]:
df1.shape

In [None]:
df1.head(20)

#### Handle skewness

In [None]:
create_boxplots_distplots(df1)

In [None]:
print(skew(df1['fixed acidity']))
df1['fixed acidity log'] = np.log(df1['fixed acidity'])
sns.distplot(df1['fixed acidity log'].dropna())

In [None]:
print(skew(df1['free sulfur dioxide']))
df1['free sulfur dioxide sqrt'] = np.sqrt(df1['free sulfur dioxide'])
sns.distplot(df1['free sulfur dioxide sqrt'].dropna())

In [None]:
print(skew(df1['residual sugar']))
df1['residual sugar sqrt'] = np.sqrt(df1['residual sugar'])
sns.distplot(df1['residual sugar sqrt'].dropna())

In [None]:
print(skew(df1['free sulfur dioxide']))
df1['free sulfur dioxide sqrt'] = np.sqrt(df1['free sulfur dioxide'])
sns.distplot(df1['free sulfur dioxide sqrt'].dropna())

In [None]:
print(skew(df1['total sulfur dioxide']))
df1['total sulfur dioxide sqrt'] = np.sqrt(df1['total sulfur dioxide'])
sns.distplot(df1['total sulfur dioxide sqrt'].dropna())

In [None]:
print(skew(df1['sulphates']))
df1['sulphates log'] = np.log(df1['sulphates'])
sns.distplot(df1['sulphates log'].dropna())

In [None]:
print(skew(df1['alcohol']))
df1['alcohol sqrt'] = np.sqrt(df1['alcohol'])
sns.distplot(df1['alcohol sqrt'].dropna())

In [None]:
print(skew(df1['total sulfur dioxide']))
df1['total sulfur dioxide sqrt'] = np.sqrt(df1['total sulfur dioxide'])
sns.distplot(df1['total sulfur dioxide sqrt'].dropna())

In [None]:
df1.head(20)

In [None]:
df1.isnull().sum()

In [None]:
# Both frames carry their own integer row labels, so a plain concat yields
# duplicate index values; rebuild a unique index for the combined frame.
df2 = pd.concat([df, df1], ignore_index=True)

In [None]:
df2

In [None]:
sns.countplot(y='quality', data=df2, hue='color')

In [None]:
# value_counts() is ordered by frequency, whereas unique() is ordered by first
# appearance; labels must come from the same Series as the wedge sizes or the
# wedges end up mislabelled.
quality_counts = df2['quality'].value_counts()

# Pull out the two rarest wedges (the last two), whatever the class count is.
explode = [0] * len(quality_counts)
for position, distance in ((-1, 0.7), (-2, 1.5)):
    if len(explode) >= -position:
        explode[position] = distance

plt.figure(figsize=(15,10))
plt.pie(quality_counts, labels=quality_counts.index, autopct='%0.2f%%', explode=explode)
plt.show()

#### One hot encoding of color feature

In [None]:
df3 = pd.get_dummies(df2, drop_first=True)

In [None]:
df3

In [None]:
df3.isnull().sum()

In [None]:
# Columns excluded from the model inputs: the raw versions of the transformed
# features, plus the target itself.
excluded_columns = [
    "fixed acidity",
    "free sulfur dioxide",
    "residual sugar",
    "sulphates",
    "alcohol",
    "total sulfur dioxide",
    "quality",
]
X = df3.drop(columns=excluded_columns)
y = df3["quality"]

In [None]:
X

In [None]:
y

#### Splitting the dataset into training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# 70/30 shuffled hold-out split with a fixed seed so the partition is
# reproducible. The abandoned stratified-split snippet that used to follow as
# a bare string literal was removed: being the last expression of the cell, it
# was echoed back as the cell's output.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, shuffle=True)

In [None]:
X_train.shape

In [None]:
X_train

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
y_train.unique()

In [None]:
y_train.value_counts()

In [None]:
from collections import Counter
Counter(y_train)

In [None]:
# Take labels from the value_counts() index so each wedge is paired with its
# own class; unique() follows order of appearance and can swap the labels.
train_color_counts = X_train['color_white'].value_counts()
plt.figure(figsize=(15,10))
plt.pie(train_color_counts, labels=train_color_counts.index, autopct='%0.2f%%')
plt.show()

In [None]:
# Take labels from the value_counts() index so each wedge is paired with its
# own class; unique() follows order of appearance and can swap the labels.
test_color_counts = X_test['color_white'].value_counts()
plt.figure(figsize=(15,10))
plt.pie(test_color_counts, labels=test_color_counts.index, autopct='%0.2f%%')
plt.show()

#### Creating a RandomForestClassifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the baseline score is identical on every Restart & Run All.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
y_pred = classifier.predict(X_test)
#print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

*The accuracy score is very low and needs to be improved. To improve it, we can perform hyperparameter tuning.*

#### Hyperparameter tuning

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
print(n_estimators)

In [None]:
#Number of features to consider at every split
#('auto' was only an alias of 'sqrt' for classifiers and is rejected by recent
# scikit-learn releases, so 'log2' is searched alongside 'sqrt' instead)
max_features = ['sqrt', 'log2']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 5, stop = 30, num = 6)]
#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
random_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

random_grid

In [None]:
# Seed the estimator as well as the search: without it, each candidate forest
# is trained with fresh randomness and the selected parameters can change
# between runs.
rfc = RandomForestClassifier(random_state=42)
rsc = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, scoring = 'accuracy', n_iter = 10, cv = 5, random_state = 42, verbose = 2)

In [None]:
rsc.fit(X_train, y_train)

In [None]:
prediction = rsc.predict(X_test)
prediction

In [None]:
y_test

In [None]:
rsc.best_params_

In [None]:
plt.figure(figsize = (8,8))
sns.distplot(y_test - prediction)
plt.show()

In [None]:
# The name `metrics` was never imported, so the original call raised a
# NameError; import the scoring functions explicitly in this cell.
from sklearn.metrics import accuracy_score, r2_score

# Accuracy is the metric the search optimised, so report it for the tuned model.
print(accuracy_score(y_test, prediction))
# R^2 is kept as the cell's displayed value, as in the original cell.
r2_score(y_test, prediction)