use regression on white wine dataset

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [2]:
wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";")
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
X = wine[["fixed acidity","volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide", "density","pH","sulphates","alcohol"]]
y = wine[["quality"]]

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33, random_state=42)

In [5]:
import numpy as np
best_score = -np.inf
parameters = {}

In [6]:
for depth in range (5,11):
    for min_split in range(2,21):
        for min_leaf in range(2,21):
            regressor = DecisionTreeRegressor(max_depth = depth,
                                              min_samples_split = min_split,
                                              min_samples_leaf = min_leaf)
            scores = cross_val_score(regressor, X, y, cv = 5)
            if scores.mean() > best_score:
                parameters["best_depth"] = depth
                parameters["best_min_split"] = min_split            
                parameters["best_min_leaf"] = min_leaf
                best_score = scores.mean()
                print("R^2: {}".format(scores.mean()))
                print("max_depth: {}, min_samples_split: {}, min_samples_leaf: {}".format(depth, min_split, min_leaf))    
                
             
                
        

R^2: 0.24042922276539813
max_depth: 5, min_samples_split: 2, min_samples_leaf: 2
R^2: 0.2447864158426753
max_depth: 5, min_samples_split: 2, min_samples_leaf: 3
R^2: 0.244812414192576
max_depth: 5, min_samples_split: 3, min_samples_leaf: 3
R^2: 0.2456793596308562
max_depth: 5, min_samples_split: 6, min_samples_leaf: 3
R^2: 0.2465095729566195
max_depth: 5, min_samples_split: 9, min_samples_leaf: 3
R^2: 0.24667992017913914
max_depth: 5, min_samples_split: 12, min_samples_leaf: 3
R^2: 0.24731219460054432
max_depth: 5, min_samples_split: 15, min_samples_leaf: 3
R^2: 0.24793862957667212
max_depth: 5, min_samples_split: 17, min_samples_leaf: 3
R^2: 0.24972183099536888
max_depth: 5, min_samples_split: 19, min_samples_leaf: 3


In [7]:
parameters

{'best_depth': 5, 'best_min_leaf': 3, 'best_min_split': 19}

In [8]:
regressor = DecisionTreeRegressor(max_depth = parameters['best_depth'], 
                                 min_samples_split = parameters["best_min_split"], 
                                 min_samples_leaf = parameters["best_min_leaf"])

In [9]:
regressor.fit(X = X_train, y = y_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=19, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [10]:
prediction_dtc = regressor.predict(X_test)

In [11]:
results = pd.DataFrame(y_test)
results["y_dtc"] = prediction_dtc

In [12]:
results

Unnamed: 0,quality,y_dtc
4656,7,6.105000
3659,8,6.716146
907,8,6.716146
4352,5,5.915663
3271,7,6.716146
4632,6,6.534247
2244,5,5.274744
1924,4,5.928889
3801,6,6.105000
2634,5,5.274744


In [14]:
r2_score(y_true = results["quality"], y_pred = results["y_dtc"])

0.30348893163217794