In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [8]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

In [9]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2)

In [10]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.5153061224489796


In [11]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                 max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

In [12]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.5989795918367347


### Max depth tuning

In [13]:
for i in range(1, 20):
    print('Accuracy score using max_depth =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth = 1: 0.44081632653061226
Accuracy score using max_depth = 2: 0.4530612244897959
Accuracy score using max_depth = 3: 0.46224489795918366
Accuracy score using max_depth = 4: 0.5010204081632653
Accuracy score using max_depth = 5: 0.4530612244897959
Accuracy score using max_depth = 6: 0.4928571428571429
Accuracy score using max_depth = 7: 0.5091836734693878
Accuracy score using max_depth = 8: 0.5224489795918368
Accuracy score using max_depth = 9: 0.47959183673469385
Accuracy score using max_depth = 10: 0.5142857142857142
Accuracy score using max_depth = 11: 0.5091836734693878
Accuracy score using max_depth = 12: 0.5469387755102041
Accuracy score using max_depth = 13: 0.5173469387755102
Accuracy score using max_depth = 14: 0.5306122448979592
Accuracy score using max_depth = 15: 0.5673469387755102
Accuracy score using max_depth = 16: 0.5826530612244898
Accuracy score using max_depth = 17: 0.5479591836734694
Accuracy score using max_depth = 18: 0.595918367346938

### Max features tuning

In [14]:
for i in np.arange(0.1, 1.0, 0.1):
    print('Accuracy score using max features =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 18, max_features=i)

Accuracy score using max features = 0.1: 0.5551020408163265
Accuracy score using max features = 0.2: 0.5989795918367347
Accuracy score using max features = 0.30000000000000004: 0.5816326530612245
Accuracy score using max features = 0.4: 0.6040816326530613
Accuracy score using max features = 0.5: 0.5887755102040816
Accuracy score using max features = 0.6: 0.5755102040816327
Accuracy score using max features = 0.7000000000000001: 0.5959183673469388
Accuracy score using max features = 0.8: 0.5979591836734693
Accuracy score using max features = 0.9: 0.5846938775510204


### Min samples split tuning

In [15]:
for i in range(2, 10):
    print('Accuracy score using min samples split =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, max_features=0.3, min_samples_split=i)

Accuracy score using min samples split = 2: 0.6224489795918368
Accuracy score using min samples split = 3: 0.5989795918367347
Accuracy score using min samples split = 4: 0.5908163265306122
Accuracy score using min samples split = 5: 0.5897959183673469
Accuracy score using min samples split = 6: 0.563265306122449
Accuracy score using min samples split = 7: 0.5418367346938775
Accuracy score using min samples split = 8: 0.5428571428571428
Accuracy score using min samples split = 9: 0.563265306122449


### Criterion tuning

In [16]:
for i in ['gini', 'entropy']:
    print('Accuracy score using criterion =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')

Accuracy score using criterion = gini: 0.6153061224489796
Accuracy score using criterion = entropy: 0.613265306122449


In [17]:
def create_poly(train,test,degree):
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly,test_poly

In [18]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print('Polynomial degree',degree)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')
    print(10*'-')
    
train_poly, test_poly = create_poly(train, test, 2) 

Polynomial degree 1
0.610204081632653
----------
Polynomial degree 2
0.6336734693877552
----------
Polynomial degree 3
0.6081632653061224
----------
Polynomial degree 4
0.6020408163265306
----------


In [19]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] / df['chlorides']
    df['eng4'] = df['chlorides'] / df['sulphates']
    return df

train = feat_eng(train)
test = feat_eng(test)

print('Additional feature engineering:')

fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')

train_poly, test_poly = create_poly(train, test, 2)

fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')


Additional feature engineering:
0.5989795918367347
0.613265306122449


In [20]:
original_score = 0.514285714286
best_score = 0.625510204082
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 21.63 %


In [27]:
import pydotplus
from sklearn import tree
import collections

dot_data = tree.export_graphviz(dt,
                                feature_names=df.columns,
                                out_file=None,
                                filled=True,
                                rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('tree1.png')

True

In [39]:
y

array([6, 6, 6, ..., 6, 7, 6])

In [31]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')