In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # dirname is the directory currently being visited; join it with the bare file name
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Read Input

In [None]:
# Load the red-wine CSV; the path is relative to the notebook's working directory
raw_data = pd.read_csv(r"../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

# Preview the first 10 rows
raw_data.head(n=10)

## Preprocessing

### Binning Target Feature

In [None]:
# Collapse the raw quality scores into 4 ordered classes (1 = lowest, 4 = highest):
#   3, 4 -> 1      5 -> 2      6 -> 3      7, 8 -> 4
df = raw_data.copy()

quality_bins = {3: 1, 4: 1, 5: 2, 6: 3, 7: 4, 8: 4}
for raw_score, binned_class in quality_bins.items():
    # Each mask is built from the untouched raw_data, so a class written by one
    # pass can never be picked up and remapped again by a later pass.
    df.loc[raw_data['quality'] == raw_score, 'quality'] = binned_class

### Handling Outliers

In [None]:
# Create a function to return index of outliers
def indicies_of_outliers(x, tolerance = 1.5):
    """Locate the values of ``x`` that fall outside its IQR fences.

    The fences sit ``tolerance`` interquartile ranges below the 25th
    percentile and above the 75th percentile of ``x``.

    Parameters
    ----------
    x : array-like of numbers
        Values to screen.
    tolerance : float, default 1.5
        Multiplier applied to the interquartile range.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` on the outlier mask, i.e. the positions
        (not index labels) of the outlying values.
    """
    first_quartile, third_quartile = np.percentile(x, [25, 75])
    margin = (third_quartile - first_quartile) * tolerance
    too_low = x < first_quartile - margin
    too_high = x > third_quartile + margin
    return np.where(too_high | too_low)

# Screen every feature column (all columns except the last, which is the target)
# with a wide 3 x IQR fence and pool the flagged row positions.
flagged_positions = set()
feature_count = df.shape[1] - 1
for column_pos in range(feature_count):
    (positions,) = indicies_of_outliers(df.iloc[:, column_pos], 3)
    flagged_positions.update(positions)

# A row flagged by several columns is kept only once
outliers = list(flagged_positions)

In [None]:
# Count the wines with raw scores 3 and 8 (the lowest and highest scores handled by the binning step)
n1 = len(raw_data[raw_data.quality == 3])
n2 = len(raw_data[raw_data.quality == 8])
# Display both counts as a tuple (n1 and n2 are reassigned in the next cell)
n1, n2

In [None]:
# Size of each binned quality class; used as the denominators below
n1 = len(df[df.quality == 1])
n2 = len(df[df.quality == 2])
n3 = len(df[df.quality == 3])
n4 = len(df[df.quality == 4])

# Select the flagged rows once. `outliers` holds row positions, hence iloc;
# set() guards against a position being counted twice.
outlier_rows = df.iloc[list(set(outliers))]

print('All Outliers:', len(outliers))
for quality_class, class_size in zip((1, 2, 3, 4), (n1, n2, n3, n4)):
    # Build the mask from outlier_rows itself so it is aligned with the rows
    # being filtered, instead of relying on pandas to reindex a mask that
    # was computed on the full frame.
    in_class = outlier_rows[outlier_rows.quality == quality_class]
    # The printed value is a fraction of the class (0-1), not a percentage
    print(f'Outliers % with quality == {quality_class}:', in_class.shape[0]/class_size)

In [None]:
# Remove the flagged rows. `outliers` holds row positions; they coincide with the
# index labels because df still carries the default 0..n-1 index from read_csv.
df = df.drop(index=outliers)

### Separating Target Feature

In [None]:
# Feature matrix: every column except the target
X = df.drop(columns="quality")
# Target, selected with a list so it stays a one-column DataFrame rather than a Series
Y = df.loc[:, ["quality"]]

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns. The fitted scaler is kept so that new
# samples can be transformed the same way later.
# NOTE(review): the statistics are learned from every row of X, including
# rows that later land in the test split.
X_scaler = StandardScaler().fit(X)
X_scaled = X_scaler.transform(X)

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split


test_size = 0.3

# Split the *unscaled* features so the held-out rows play no part in fitting
# the scaler. The row count and random_state are unchanged, so the same rows
# are assigned to each split as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=1999)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting on all rows would leak test-set means/variances into the training
# features and make the evaluation optimistic. X_scaler is rebound here so
# that later transforms of new samples use the same statistics the models
# were trained with.
X_scaler = StandardScaler().fit(X_train_raw)
X_train = X_scaler.transform(X_train_raw)
X_test = X_scaler.transform(X_test_raw)

## Model Creation

### Grid Searching for Best Parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

def report(results, n_top=3):
    """Print the best-ranked candidates of a grid search.

    Parameters
    ----------
    results : mapping
        Must provide ``rank_test_score``, ``mean_test_score``,
        ``std_test_score`` (array-like) and ``params`` (sequence), as found in
        ``GridSearchCV.cv_results_``.
    n_top : int, default 3
        Highest rank to print; candidates tied on a rank are all printed.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    for rank in range(1, n_top + 1):
        tied = np.flatnonzero(results["rank_test_score"] == rank)
        for idx in tied:
            print(f"Model with rank: {rank}")
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

In [None]:
# SVM search space: a linear kernel over four C values, and an RBF kernel
# over the same C values crossed with two gamma values.
param_grid = [
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
    dict(kernel=['rbf'], C=[1, 10, 100, 1000], gamma=[0.001, 0.0001]),
]
model = GridSearchCV(SVC(random_state=0), param_grid)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

In [None]:
# k-NN search space: neighbourhood size x vote weighting x Minkowski power p
param_grid = [
    dict(n_neighbors=[1, 3, 5, 9], weights=['uniform', 'distance'], p=[1, 2]),
]
model = GridSearchCV(KNeighborsClassifier(), param_grid)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

In [None]:
# Gaussian naive Bayes search space: 100 log-spaced var_smoothing values from 1 down to 1e-9
param_grid = [
    dict(var_smoothing=np.logspace(0, -9, num=100)),
]
model = GridSearchCV(GaussianNB(), param_grid)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

In [None]:
# Logistic-regression search space: 20 log-spaced C values from 1e-4 to 1e4.
# The l1 penalty is paired with liblinear only; l2 is tried with three solvers.
param_grid = [
    dict(penalty=['l1'], C=np.logspace(-4, 4, 20), solver=['liblinear']),
    dict(penalty=['l2'], C=np.logspace(-4, 4, 20), solver=['liblinear', 'lbfgs', 'newton-cg']),
]
model = GridSearchCV(LogisticRegression(), param_grid)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

In [None]:
# Decision-tree search space: split criterion x depth limit x split/leaf size limits
param_grid = [
  { 'criterion':['gini', 'entropy'],
    'max_depth':range(1,10),
    # An integer min_samples_split must be at least 2 (a node needs two samples
    # to be split); starting at 1 makes every such candidate fail to fit.
    'min_samples_split':range(2,10),
    'min_samples_leaf':range(1,5)
  }
]
model = DecisionTreeClassifier(random_state=0)
model = GridSearchCV(model, param_grid)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

In [None]:
# Maximum number of levels in each tree: three evenly spaced limits, plus None for no limit
max_depth = [10, 60, 110, None]

# Random-forest search space: forest size x depth limit x split/leaf size limits
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=max_depth,
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

model = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, verbose=2)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

In [None]:
# XGBoost search space: child weight, split-loss threshold (gamma), row and
# column subsampling ratios, and tree depth.
param_grid = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

model = GridSearchCV(XGBClassifier(random_state=0), param_grid, verbose=1)

# The search itself is disabled; un-comment to run it and print the top-ranked settings.
# model.fit(X_train, y_train)
# report(model.cv_results_)

### Creating All Models

In [None]:
import tensorflow as tf
tf.random.set_seed(42)

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

class ann():
    """Fully connected network that predicts the binned quality class by regression.

    The network is built and trained inside ``__init__`` on the notebook-level
    ``X_train`` / ``y_train``. ``predict`` converts its continuous output back
    into class labels, so the object can be scored like the other classifiers.
    """

    def __init__(self):
        # Six ReLU hidden layers (widths 64, 64, 128, 128, 256, 256) feeding a
        # single linear output unit that regresses the class number.
        network = keras.Sequential([
            layers.Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(1),
        ])

        network.compile(loss='mse',
                        optimizer=keras.optimizers.Adam(0.002))

        network.fit(
            X_train,
            y_train,
            verbose=0, epochs=100)

        # Remember the label range seen in training so predict() can keep its
        # rounded output inside the set of real classes.
        labels = np.asarray(y_train).ravel()
        self._label_min = int(labels.min())
        self._label_max = int(labels.max())

        self.ann = network

    def predict(self, X_test):
        """Return one integer class label per row of ``X_test`` as a 1-D array.

        The regression output is rounded to the nearest integer and clipped to
        the label range seen during training; without clipping an over- or
        under-shooting prediction would become a class that does not exist.
        """
        raw = np.asarray(self.ann.predict(X_test), dtype=float).ravel()
        rounded = np.rint(raw)
        return np.clip(rounded, self._label_min, self._label_max).astype(int)

In [None]:
def create_models():
    """Fit every candidate model on the notebook-level training split.

    The hyper-parameters are hard-coded; several of them (e.g. the KNN, NB and
    LR settings) are values that appear in the grid-search spaces defined in
    the cells above.

    Returns
    -------
    dict
        Maps a short model name ("KNN", "NB", "LR", "SVM", "DT", "RF", "XGB",
        "ANN") to the fitted model.
    """
    # y_train is a one-column DataFrame; the estimators below expect a 1-D
    # target and otherwise warn and flatten it themselves.
    y = np.ravel(y_train)

    models = {}
    models["KNN"] = KNeighborsClassifier(n_neighbors=9, p=1, weights='distance').fit(X_train, y)
    models["NB"] = GaussianNB(var_smoothing=1.873817422860384e-01).fit(X_train, y)
    models["LR"] = LogisticRegression(random_state=0, C=545.5, penalty='l1', solver='liblinear').fit(X_train, y)
    models["SVM"] = SVC(random_state=0, C=1000, gamma=0.001, kernel='rbf').fit(X_train, y)
    models["DT"] = DecisionTreeClassifier(random_state=0, criterion='gini', max_depth=4, min_samples_leaf=4, min_samples_split=2).fit(X_train, y)
    # 'sqrt' is what 'auto' meant for a forest classifier; 'auto' itself is
    # rejected by newer scikit-learn releases.
    models["RF"] = RandomForestClassifier(random_state=0, max_depth=60, max_features='sqrt', min_samples_leaf=1, min_samples_split=5, n_estimators=1000).fit(X_train, y)
    # NOTE(review): recent xgboost releases expect class labels to start at 0,
    # while the labels here are 1..4 - confirm against the installed version.
    models["XGB"] = XGBClassifier(random_state=0, colsample_bytree=0.8, gamma=0.5, max_depth=4, min_child_weight=1, subsample=1.0).fit(X_train, y)
    # The network trains itself inside its constructor
    models["ANN"] = ann()

    return models

## Model Evaluation

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

def evaluate(models):
    """Score each fitted model on the notebook-level test split.

    Prints one accuracy line per model and collects the per-class reports.

    Parameters
    ----------
    models : dict
        Maps a model name to an object with a ``predict`` method.

    Returns
    -------
    dict
        Maps each model name to its ``classification_report`` text.
    """
    reports = {}
    for name, fitted in models.items():
        predictions = fitted.predict(X_test)
        score = accuracy_score(y_test, predictions)
        print(f"{name} accuracy: {score}")
        reports[name] = classification_report(y_test, predictions)
    return reports

# Train every model, then score each of them on the held-out test split
models = create_models()
reports = evaluate(models)

In [None]:
# Print each model's stored per-class report under its own heading
for model_name, report_text in reports.items():
    print(f"{model_name} Model Classification Report:")
    print(report_text)

## Testing Best Model

In [None]:
# Random Forest Model: rebind the notebook-level `model` to the fitted forest stored in `models`
model = models['RF']

In [None]:
# Hand-written samples, one wine per row with 11 feature values each, scaled
# with the fitted X_scaler before being passed to the model.
# NOTE(review): values are presumably listed in the same column order as X,
# and the trailing "Quality -> k" comments look like the expected binned
# class for each row - confirm against the dataset.
test_example = X_scaler.transform([
    [8.3, 0.675, 0.26, 2.1, 0.084, 11, 43, 0.9976, 3.31, 0.53, 9.2], # Quality -> 1
    [7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.9978, 3.51, 0.56, 9.4], # Quality -> 2
    [8.9, 0.22, 0.48, 1.8, 0.077, 29, 60, 0.9968, 3.39, 0.53, 9.4], # Quality -> 3
    [8, 0.59, 0.16, 1.8, 0.065, 3, 16, 0.9962, 3.42, 0.92, 10.5], # Quality -> 4
])
# Last expression of the cell: displays the predicted class for each sample
model.predict(test_example)

In [None]:
from xgboost import plot_importance
# Rebind the notebook-level `model` to the fitted XGBoost classifier
model = models["XGB"]
# The model was trained on a bare array, so the booster has no column names.
# Take them from the feature frame X instead of a hand-copied list, so the
# plot labels cannot drift from the columns the model was trained on.
model.get_booster().feature_names = list(X.columns)
plot_importance(model)
plt.show()

In [None]:
# Display the column labels of the cleaned frame
df.columns