In [1]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
import wineClassification

# Import the Dataset

In [3]:
wineQuality = wineClassification.read_csv()
wineQuality

FileNotFoundError: [Errno 2] File C:\Users\lmurphy4\.conda\envs\myEnv\forkproj\PRESC\dev\datasets\winequality.csv does not exist: 'C:\\Users\\lmurphy4\\.conda\\envs\\myEnv\\forkproj\\PRESC\\dev\\datasets\\winequality.csv'

# Exploratory Data Analysis and Data Pre-processing

In [None]:
wineClassification.is_null_values(wineQuality)

### Eliminate the feature 'quality' from the dataset. This is because we are creating models to predict wine recommendations (binary classification), and recommend is a direct function of wine quality.

In [None]:
wineQuality = wineClassification.drop_quality_column(wineQuality, "quality")
wineQuality

### Extract Wine Recommendation Features from the dataset into separate dataframe

In [None]:
recommend_features = wineClassification.extract_features(wineQuality)
recommend_features

### Encode Target Labels with 0(False) and 1(True)

In [None]:
recommend_labels = wineClassification.label_encoding(wineQuality["recommend"])
recommend_labels

### Imbalanced Classification: The distribution of classes (True/False) in our Recommend target variable is scewed more towards negative ratings (~78% of the dataset). 

In [None]:
wineClassification.class_distribution(wineQuality)

## Univariate Plots: Data Distribution Across Wine Quality features

### Features generally have a Gaussian distribution, with most features seeming to skew towards the left, such as free sulfur dioixde, chlorides, residual sugar, and density. pH is more normally distributed between 2.8 and 3.6

In [None]:
wineClassification.data_distribution(recommend_features)

## Bivariate Plotting: Explore Relationships Among Features

### Correlation Matrix of Wine Quality Features

In [None]:
wineClassification.feature_correlations(recommend_features)

### There appears to be moderate positive correlations between total sulfur dioxide and free sulfur dioxide, and density and total sulfur dioxide. There is also a strong negative correlation between density and alcohol. 

### Density & Residual Sugar

#### The greater the sugar content, seems the more dense the liquid.

In [None]:
wineClassification.bivar_plot(wineQuality, 'density', 'residual sugar')

### Density & Alcohol

#### Appears the less alcohol content there is, the greater density the liquid has.

In [None]:
wineClassification.bivar_plot(wineQuality, 'density', 'alcohol')

In [None]:
## 

### Total Sulfur Dioxide & Free Sulfur Dioxide

In [None]:
wineClassification.bivar_plot(wineQuality, 'total sulfur dioxide', 'free sulfur dioxide')

## Feature Scaling

### The data appears to generally have a Gaussian distribution around each feature, so Standard Scaler is used

In [None]:
recommend_features = wineClassification.feature_scaling(recommend_features)
recommend_features

## Train Test Split

### Using an 80/20 split as a starting point. Would like to explore utilizing other split ratios as well

In [None]:
X_train, X_test, y_train, y_test=train_test_split(recommend_features, recommend_labels, test_size=0.2, random_state=42)

# Generate Models

In [None]:
ml_models = []
wineClassification.create_ml_models(ml_models)
ml_models

### For each model, we split the wine features and the target variable recommend into k-folds. In this case, we use a Stratified K-Fold method to ensure each class is represented proportionally for each fold, since our classes are imbalanced. Below, the average of the cross validation scores is reported for a range of number of folds(2 to 10). Here, kNN is estimated to have the highest accuracy at 83.0% for 10 folds, and Decision Trees are estimated to have the highest accuracy at 83.1% for 7 folds. 

In [None]:
wineClassification.traversal_space_cross_val(ml_models, X_train, y_train)

# Hyperparameter Tuning Using Grid Search

## The default parameters were used in the classifiers above. However, we can improve model accuracy by tuning the hyperparameters. To do this, we create a grid of parameters to test on the model, and use GridSearchCV to determine the best performing model based on the set of parameters we put in the grid.

### Hyperparameter tuning with K-Nearest Neighbors

In [None]:
wineClassification.grid_search_knn(X_train, y_train)

### Hyperparameter tuning with Decision Trees

In [None]:
wineClassification.grid_search_dt(X_train, y_train)

# Model Predictions

## For each model(kNN, Decision Tree), we fit the model on the training set with the specific hyperparmeters determined in the previous section, and made predictions on the test set. 

In [None]:
wineClassification.knn_eval_model_predictions(X_train, y_train, X_test, y_test)    

In [None]:
wineClassification.dc_eval_model_predictions(X_train, y_train, X_test, y_test) 

### Overall, it appears that kNN was better able to predict the negative class predictions(sensitivity) as well as better able to predict the positive class predictions(specificity) than the Decision tree. 