In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
# machine learning imports 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, accuracy_score, precision_score, recall_score
from pydataset import data
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Removes big scary warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_red_wine():
    '''this function reads in a csv file and returns a dataframe'''
    
    filename = "winequality-red.csv"

    if os.path.isfile(filename):
        return pd.read_csv(filename)
    
def get_white_wine():
    '''this function reads in a csv file and returns a dataframe'''
    
    filename = "winequality-white.csv"
    
    if os.path.isfile(filename):
        return pd.read_csv(filename)
       

In [3]:
red_wine = get_red_wine()
white_wine = get_white_wine()

In [14]:
red_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_color
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,red
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,red
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,red
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,red


In [15]:
white_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_color
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,white
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,white
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,white
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,white
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [6]:
def prep_wine(red_wine, white_wine):
    '''this function preps the wine csv's, concatenates them, changes data types, 
        and renames columns'''
    # take in the df's and concant and reset index
    red_wine.insert(12, column="wine_color", value='red')
    white_wine.insert(12, column="wine_color", value='white')
    frames = [red_wine, white_wine]
    wines = pd.concat(frames)
    wines = wines.reset_index(drop=True)
    
    # change some data types
    wines["free sulfur dioxide"] = wines["free sulfur dioxide"].astype(int)
    wines["total sulfur dioxide"] = wines["total sulfur dioxide"].astype(int)
    
    # rename some columns
    wines = wines.rename(columns={'fixed acidity':'fixed_acidity', 
                            'volatile acidity':'volatile_acidity', 
                            'citric acid':'citric_acid',
                            'residual sugar':'residual_sugar', 
                            'free sulfur dioxide':'free_sulfur_dioxide',
                            'total sulfur dioxide':'total_sulfur_dioxide'
                            })
    
    # Create dummies for non-binart categorical variables
    to_dummy = ['wine_color']
    dummies = pd.get_dummies(wines[to_dummy], drop_first=False)
    wines = pd.concat([wines, dummies], axis=1)
    
    # drop redundant column
    drop = ['wine_color']
    wines.drop(columns=drop, inplace=True)
    
    return wines

In [7]:
wines = prep_wine(red_wine, white_wine)

In [8]:
wines.head(1)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine_color_red,wine_color_white
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5,1,0


In [9]:
def remove_outliers(df,col_list,k):
    ''' 
    This function takes in a dataframe, the threshold and a list of columns 
    and returns the dataframe with outliers removed
    '''   
    for col in col_list:

        q1, q3 = df[col].quantile([.25, .75])  # get quartiles
        
        iqr = q3 - q1   # calculate interquartile range
        
        upper_bound = q3 + k * iqr   # get upper bound
        lower_bound = q1 - k * iqr   # get lower bound

        # return dataframe without outliers
        
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
        
    return df

In [10]:
col_list = ['fixed_acidity', 'volatile_acidity', 'citric_acid',
                            'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
                            'total_sulfur_dioxide', 'density', 'pH', 'sulphates',
                            'alcohol']
            
wines = remove_outliers(wines, col_list, k=1.5)
wines

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine_color_red,wine_color_white
20,8.9,0.220,0.48,1.8,0.077,29,60,0.99680,3.39,0.53,9.4,6,1,0
25,6.3,0.390,0.16,1.4,0.080,11,23,0.99550,3.34,0.56,9.3,5,1,0
26,7.6,0.410,0.24,1.8,0.080,4,11,0.99620,3.28,0.59,9.5,5,1,0
33,6.9,0.605,0.12,10.7,0.073,40,83,0.99930,3.45,0.52,9.4,6,1,0
37,8.1,0.380,0.28,2.1,0.066,13,30,0.99680,3.23,0.73,9.7,7,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.210,0.29,1.6,0.039,24,92,0.99114,3.27,0.50,11.2,6,0,1
6493,6.6,0.320,0.36,8.0,0.047,57,168,0.99490,3.15,0.46,9.6,5,0,1
6494,6.5,0.240,0.19,1.2,0.041,30,111,0.99254,2.99,0.46,9.4,6,0,1
6495,5.5,0.290,0.30,1.1,0.022,20,110,0.98869,3.34,0.38,12.8,7,0,1


In [11]:
def split_data(df, target):
    '''
    This function take in a dataframe performs a train, validate, test split
    Returns train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test
    and prints out the shape of train, validate, test
    '''
    #create train_validate and test datasets
    train, test = train_test_split(df, train_size = 0.8, random_state = 123)
    #create train and validate datasets
    train, validate = train_test_split(train, train_size = 0.7, random_state = 123)

    #Split into X and y
    X_train = train.drop(columns=[target])
    y_train = train[target]

    X_validate = validate.drop(columns=[target])
    y_validate = validate[target]

    X_test = test.drop(columns=[target])
    y_test = test[target]

    # Have function print datasets shape
    print(f'train -> {train.shape}')
    print(f'validate -> {validate.shape}')
    print(f'test -> {test.shape}')
   
    return train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test

In [12]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = split_data(wines, 
                                                                                             'quality')


train -> (2548, 14)
validate -> (1092, 14)
test -> (910, 14)


In [16]:
train.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine_color_red,wine_color_white
1718,7.3,0.32,0.48,13.3,0.06,57,196,0.9982,3.04,0.5,9.2,5,0,1
3520,7.6,0.13,0.34,9.3,0.062,40,126,0.9966,3.21,0.39,9.6,5,0,1
1733,6.8,0.27,0.22,8.1,0.034,55,203,0.9961,3.19,0.52,8.9,5,0,1
2539,6.2,0.34,0.28,7.5,0.034,40,197,0.99485,3.14,0.6,9.7,5,0,1
4367,7.7,0.3,0.34,1.2,0.048,4,119,0.99084,3.18,0.34,12.1,6,0,1


In [17]:
# code to generate baseline 
wines['baseline'] = wines['quality'].value_counts().idxmax()
(wines['quality'] == wines['baseline']).mean()
# clean f string
print(f"Baseline: {(wines['quality'] == wines['baseline']).mean()*100:.2f}%")

Baseline: 45.58%


In [19]:
# random forest algorithm 
rf3 = RandomForestClassifier(max_depth=8, random_state=42,
                            max_samples=0.5)
#fit it 
rf3.fit(X_train, y_train)
# clean f string
print('Random Forest Model')
print(f"Accuracy of Random Forest on train data: {rf3.score(X_train, y_train)}") 
print(f"Accuracy of Random Forest on validate: {rf3.score(X_validate, y_validate)}")

Random Forest Model
Accuracy of Random Forest on train data: 0.717425431711146
Accuracy of Random Forest on validate: 0.5989010989010989


In [26]:
# random forest algorithm 
rf2 = RandomForestClassifier(max_depth=5, random_state=42,
                            max_samples=0.5)
#fit it 
rf2.fit(X_train, y_train)
# clean f string
print('Random Forest Model')
print(f"Accuracy of Random Forest on train data: {rf2.score(X_train, y_train)}") 
print(f"Accuracy of Random Forest on validate: {rf2.score(X_validate, y_validate)}")

Random Forest Model
Accuracy of Random Forest on train data: 0.5989010989010989
Accuracy of Random Forest on validate: 0.5393772893772893


In [53]:
# random forest algorithm 
rf2 = RandomForestClassifier(max_depth=4, random_state=42,
                            max_samples=0.5)
#fit it 
rf2.fit(X_train, y_train)
# clean f string
print('Random Forest Model')
print(f"Accuracy of Random Forest on train data: {rf2.score(X_train, y_train)}") 
print(f"Accuracy of Random Forest on validate: {rf2.score(X_validate, y_validate)}")

Random Forest Model
Accuracy of Random Forest on train data: 0.5726059654631083
Accuracy of Random Forest on validate: 0.5274725274725275


In [22]:
# Logistic Regression algorithm
logit2 = LogisticRegression(C=.1, random_state=42, 
                           intercept_scaling=1, solver='lbfgs')

#fit the model
logit2.fit(X_train, y_train)
#clean f string
print('Logistic Regression Model')
print(f"Accuracy of Logistic Regression on train: {logit2.score(X_train, y_train)}") 
print(f"Accuracy of Logistic Regression on validate: {logit2.score(X_validate, y_validate)}")

Logistic Regression Model
Accuracy of Logistic Regression on train: 0.4721350078492936
Accuracy of Logistic Regression on validate: 0.4697802197802198


In [34]:
# Logistic Regression algorithm
logit2 = LogisticRegression(C=.9, random_state=42, 
                           intercept_scaling=2, solver='lbfgs')

#fit the model
logit2.fit(X_train, y_train)
#clean f string
print('Logistic Regression Model')
print(f"Accuracy of Logistic Regression on train: {logit2.score(X_train, y_train)}") 
print(f"Accuracy of Logistic Regression on validate: {logit2.score(X_validate, y_validate)}")

Logistic Regression Model
Accuracy of Logistic Regression on train: 0.47135007849293564
Accuracy of Logistic Regression on validate: 0.47802197802197804


In [36]:
# Logistic Regression algorithm
logit2 = LogisticRegression(C=.7, random_state=42, 
                           intercept_scaling=6, solver='lbfgs')

#fit the model
logit2.fit(X_train, y_train)
#clean f string
print('Logistic Regression Model')
print(f"Accuracy of Logistic Regression on train: {logit2.score(X_train, y_train)}") 
print(f"Accuracy of Logistic Regression on validate: {logit2.score(X_validate, y_validate)}")

Logistic Regression Model
Accuracy of Logistic Regression on train: 0.47017268445839877
Accuracy of Logistic Regression on validate: 0.4716117216117216


In [37]:
# K nearest neighbor algorithm

knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_train, y_train)
knn3.score(X_train, y_train)
knn3.score(X_validate, y_validate)
# clean f string
print('KNN Model')
print(f"Accuracy of KNN on train: {knn3.score(X_train, y_train)}") 
print(f"Accuracy of KNN on validate: {knn3.score(X_validate, y_validate)}")

KNN Model
Accuracy of KNN on train: 0.717032967032967
Accuracy of KNN on validate: 0.4542124542124542


In [40]:
# K nearest neighbor algorithm

knn3 = KNeighborsClassifier(n_neighbors=10)
knn3.fit(X_train, y_train)
knn3.score(X_train, y_train)
knn3.score(X_validate, y_validate)
# clean f string
print('KNN Model')
print(f"Accuracy of KNN on train: {knn3.score(X_train, y_train)}") 
print(f"Accuracy of KNN on validate: {knn3.score(X_validate, y_validate)}")

KNN Model
Accuracy of KNN on train: 0.5832025117739403
Accuracy of KNN on validate: 0.4542124542124542


In [46]:
# K nearest neighbor algorithm

knn3 = KNeighborsClassifier(n_neighbors=11)
knn3.fit(X_train, y_train)
knn3.score(X_train, y_train)
knn3.score(X_validate, y_validate)
# clean f string
print('KNN Model')
print(f"Accuracy of KNN on train: {knn3.score(X_train, y_train)}") 
print(f"Accuracy of KNN on validate: {knn3.score(X_validate, y_validate)}")

KNN Model
Accuracy of KNN on train: 0.5726059654631083
Accuracy of KNN on validate: 0.4633699633699634


In [54]:
# best Random Forest algorithm 

test_score = rf2.score(X_test, y_test)

# clean f string
print('Random Forest Model')
print(f'Accuracy on Test {test_score:.2f}')



Random Forest Model
Accuracy on Test 0.54
