In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
# Silence every warning category for the rest of the session. This also hides
# any warnings emitted by scikit-learn, so lift the filter when debugging.
warnings.filterwarnings(action="ignore")
# Seed NumPy's legacy global random generator so that anything drawing from it
# is repeatable when the notebook is run top to bottom on a fresh kernel.
np.random.seed(seed=42)

In [4]:
# Download the white-wine quality table; the file is semicolon-delimited.
df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=';',
)
# Last expression of the cell: display the column labels that were parsed.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [6]:
# Reload the raw file so this cell is idempotent: `pop` below removes the
# `quality` column from `df` in place, so a re-run has to start from a frame
# that still contains it.
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
# Separate the target; from here on `df` holds only the feature columns.
y = df.pop('quality')

# Pin the split explicitly (same seed value as the global seed set in the
# first cell). Without `random_state` the split is drawn from NumPy's global
# RNG, so re-running this cell, or running cells out of order, silently
# produces a different train/test partition and incomparable scores.
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state = 42)

# Fill gaps with column means learned from the training rows only, so that no
# statistic of the held-out rows leaks into preprocessing.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

# Keep the full feature frame NaN-free as before, for any later use of `df`
# (vectorised replacement for the original per-column loop).
df = df.fillna(df.mean())

In [None]:
# NOTE(review): this cell holds only a bare name and was never executed; if run
# it would just echo the LogisticRegression class. Presumably it was meant as a
# markdown heading for the baseline section that follows - confirm and convert.
LogisticRegression

In [7]:
# Baseline model: logistic regression with library-default settings, trained on
# the raw (unscaled) training features.
lr = LogisticRegression().fit(train, y_train)

# Score the held-out rows and report plain accuracy.
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy score baseline: 0.5418367346938775


In [8]:
def fit_predict(train, test, y_train, y_test, max_depth=None,
                n_estimators=10, max_features='auto', min_samples_split=2, scaler=None):
    """Fit a random forest on the training split and print its test accuracy.

    Parameters
    ----------
    train, test
        Feature tables used for fitting and for evaluation respectively.
    y_train, y_test
        Labels aligned with ``train`` and ``test``.
    max_depth, n_estimators, max_features, min_samples_split
        Forwarded to ``RandomForestClassifier``. The legacy string ``'auto'``
        for ``max_features`` is translated to ``'sqrt'`` (see below).
    scaler
        Optional transformer exposing ``fit_transform`` / ``transform``. When
        given, it is fitted on ``train`` only and then applied to ``test``.

    Returns
    -------
    None
        The accuracy is written to stdout with ``print``.
    """
    # 'auto' was an alias of 'sqrt' for classifiers; scikit-learn deprecated it
    # in 1.1 and removed it in 1.3, where passing it raises. Translating it
    # keeps the historical default working on both old and new releases.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 'sqrt'

    # Test identity rather than truthiness: some estimators (e.g. pipelines)
    # define __len__, so `if scaler:` is not a reliable presence check.
    if scaler is not None:
        train = scaler.fit_transform(train)
        test = scaler.transform(test)

    # Fixed random_state so repeated calls with equal arguments give equal forests.
    RF = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                random_state=42, max_features=max_features,
                                min_samples_split=min_samples_split)
    RF.fit(train, y_train)
    y_pred = RF.predict(test)
    print(accuracy_score(y_test, y_pred))

In [9]:
# Untuned random forest, evaluated once on raw features and once with the
# features standardised; each label is printed on the same line as its score.
for run_label, run_scaler in (
    ('baseline accuracy score', None),
    ('baseline accuracy score with scaler', StandardScaler()),
):
    print(run_label, end=': ')
    fit_predict(train, test, y_train, y_test, scaler=run_scaler)

baseline accuracy score: 0.6489795918367347
baseline accuracy score with scaler: 0.6489795918367347


In [10]:
# Sweep the forest size from 20 to 180 trees in steps of 20, leaving every
# other setting at fit_predict's defaults.
# NOTE(review): each score is measured on the `test` split, so choosing a value
# from this sweep tunes against the held-out data.
forest_sizes = range(20, 200, 20)
for n_estimators in forest_sizes:
    print('Accuracy score using n_estimators =', n_estimators, end=': ')
    fit_predict(train, test, y_train, y_test, n_estimators=n_estimators)


Accuracy score using n_estimators = 20: 0.6571428571428571
Accuracy score using n_estimators = 40: 0.6683673469387755
Accuracy score using n_estimators = 60: 0.6693877551020408
Accuracy score using n_estimators = 80: 0.676530612244898
Accuracy score using n_estimators = 100: 0.6724489795918367
Accuracy score using n_estimators = 120: 0.6795918367346939
Accuracy score using n_estimators = 140: 0.6836734693877551
Accuracy score using n_estimators = 160: 0.6795918367346939
Accuracy score using n_estimators = 180: 0.6836734693877551


In [7]:
# Sweep the tree depth limit over 1..19 with the forest size held at 160 trees.
depth_grid = range(1, 20)
for max_depth in depth_grid:
    print('Accuracy score using max_depth =', max_depth, end=': ')
    fit_predict(train, test, y_train, y_test, max_depth=max_depth, n_estimators=160)


Accuracy score using max_depth = 1: 0.44081632653061226
Accuracy score using max_depth = 2: 0.4897959183673469
Accuracy score using max_depth = 3: 0.49387755102040815
Accuracy score using max_depth = 4: 0.5051020408163265
Accuracy score using max_depth = 5: 0.5244897959183673
Accuracy score using max_depth = 6: 0.5357142857142857
Accuracy score using max_depth = 7: 0.563265306122449
Accuracy score using max_depth = 8: 0.5826530612244898
Accuracy score using max_depth = 9: 0.5959183673469388
Accuracy score using max_depth = 10: 0.6091836734693877
Accuracy score using max_depth = 11: 0.6469387755102041
Accuracy score using max_depth = 12: 0.6744897959183673
Accuracy score using max_depth = 13: 0.6795918367346939
Accuracy score using max_depth = 14: 0.6979591836734694
Accuracy score using max_depth = 15: 0.7010204081632653
Accuracy score using max_depth = 16: 0.6959183673469388
Accuracy score using max_depth = 17: 0.6969387755102041
Accuracy score using max_depth = 18: 0.7020408163265306


In [8]:
# Sweep max_features over ten evenly spaced values from 0.1 to 1.0 (a float is
# interpreted by scikit-learn as a fraction of the columns), keeping the forest
# at 160 trees with a depth limit of 18.
for max_features in np.linspace(0.1, 1, 10):
    print('Accuracy score using max_features =', max_features, end=': ')
    fit_predict(
        train, test, y_train, y_test,
        max_depth=18,
        n_estimators=160,
        max_features=max_features,
    )


Accuracy score using max_features = 0.1: 0.6969387755102041
Accuracy score using max_features = 0.2: 0.7040816326530612
Accuracy score using max_features = 0.30000000000000004: 0.7020408163265306
Accuracy score using max_features = 0.4: 0.6948979591836735
Accuracy score using max_features = 0.5: 0.6969387755102041
Accuracy score using max_features = 0.6: 0.6908163265306122
Accuracy score using max_features = 0.7000000000000001: 0.6969387755102041
Accuracy score using max_features = 0.8: 0.6989795918367347
Accuracy score using max_features = 0.9: 0.6918367346938775
Accuracy score using max_features = 1.0: 0.7020408163265306


In [9]:
# Sweep min_samples_split over 2..9 while holding the other three settings at
# the values used in this cell (160 trees, depth 18, max_features 0.2).
fixed_forest_args = dict(n_estimators=160, max_features=0.2, max_depth=18)
for min_samples_split in range(2, 10):
    print('Accuracy score using min_samples_split =', min_samples_split, end=': ')
    fit_predict(train, test, y_train, y_test,
                min_samples_split=min_samples_split, **fixed_forest_args)


Accuracy score using min_samples_split = 2: 0.7040816326530612
Accuracy score using min_samples_split = 3: 0.7193877551020408
Accuracy score using min_samples_split = 4: 0.7040816326530612
Accuracy score using min_samples_split = 5: 0.6938775510204082
Accuracy score using min_samples_split = 6: 0.6938775510204082
Accuracy score using min_samples_split = 7: 0.6857142857142857
Accuracy score using min_samples_split = 8: 0.6806122448979591
Accuracy score using min_samples_split = 9: 0.6714285714285714


In [11]:
# Final configuration, shared by both runs below so the two calls cannot drift apart.
tuned_params = dict(n_estimators=160, max_features=0.2, min_samples_split=3, max_depth=18)

# Tuned forest on the raw features.
print('tuned accuracy score', end=': ')
fit_predict(train, test, y_train, y_test, **tuned_params)

# Same forest with the features standardised first.
print('tuned accuracy score with scaler', end=': ')
fit_predict(train, test, y_train, y_test, scaler=StandardScaler(), **tuned_params)

tuned accuracy score: 0.6806122448979591
tuned accuracy score with scaler: 0.6806122448979591


In [11]:
def relative_improvement_pct(original, best):
    """Return the signed percentage change from ``original`` to ``best``.

    The result is rounded to two decimals. It is positive when ``best`` is
    higher than ``original`` and negative when it is lower, so a regression is
    never reported as a gain (the previous ``np.abs`` wrapper hid the sign).

    Raises
    ------
    ValueError
        If ``original`` is zero, since a relative change is undefined then.
    """
    if original == 0:
        raise ValueError('original score must be non-zero to compute a relative change')
    return np.round(100 * (best - original) / original, 2)


# NOTE(review): both scores are hand-copied constants that are not tied to any
# variable in the notebook, and `original_score` does not equal the baseline
# accuracy visible in the saved cell output - presumably they come from a
# different run/split. Recompute them after a clean Restart & Run All.
original_score = 0.514285714286
best_score = 0.7193877551020408
improvement = relative_improvement_pct(original_score, best_score)
print('overall improvement is {} %'.format(improvement))

overall improvement is 39.88 %


In [12]:
# NOTE(review): hand-copied constants; `original_score` does not equal the
# untuned random-forest accuracy visible in the saved cell output, so these
# presumably come from a different run/split. Recompute after a clean
# Restart & Run All.
original_score = 0.6428571428571429
best_score = 0.7193877551020408
# Signed relative change in percent: positive means the tuned model scored
# higher. The sign is kept on purpose - wrapping the value in np.abs would
# report a regression as an improvement.
improvement = np.round(100 * (best_score - original_score) / original_score, 2)
print('overall improvement compare to non tuned model is {} %'.format(improvement))

overall improvement compare to non tuned model is 11.9 %
