# Building a Classification Model for the "Wine" data set
### Nesrin Özcan 

In this Jupyter notebook, we will be building a classification model for the "Wine" data set using the random forest algorithm.

In [26]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [14]:
wine = datasets.load_wine()
wine

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [15]:
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [16]:
print(wine.target_names)


['class_0' 'class_1' 'class_2']


In [17]:
wine.data

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [18]:
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [19]:
X = wine.data
Y = wine.target

In [20]:
X.shape

(178, 13)

In [21]:
Y.shape

(178,)

## Build Classification Model using Random Forest

In [27]:
clf = RandomForestClassifier()

In [28]:
clf.fit(X, Y)

RandomForestClassifier()

In [29]:
print(clf.feature_importances_)

[0.13506234 0.04031283 0.01461705 0.02508595 0.03509165 0.05200057
 0.15009817 0.00701982 0.01350941 0.15933921 0.06840189 0.14265133
 0.15680978]


In [30]:
X[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [34]:
print(clf.predict([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03]]))

[0]


In [35]:
print(clf.predict(X[[0]]))

[0]


In [36]:
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [37]:
clf.fit(wine.data, wine.target_names[wine.target])

RandomForestClassifier()

## Data Split with ratio (70/30)

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [39]:
X_train.shape, Y_train.shape

((124, 13), (124,))

In [40]:
X_test.shape, Y_test.shape

((54, 13), (54,))

In [41]:
clf.fit(X_train, Y_train)

RandomForestClassifier()

In [43]:
print(clf.predict(X[[0]]))

[0]


In [55]:
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [56]:
print(clf.predict(X_test))

[1 0 0 2 1 1 1 1 0 0 2 0 0 1 0 2 0 2 2 1 1 0 0 2 0 2 1 1 0 1 1 0 2 2 0 1 0
 2 1 2 0 1 0 1 2 2 1 2 1 1 1 1 1 0]


In [57]:
print(Y_test)

[1 0 0 2 1 1 1 1 0 0 2 0 0 1 0 2 1 2 2 1 1 0 0 2 0 2 1 1 0 1 1 0 2 2 0 1 0
 2 1 2 0 1 0 1 2 2 1 2 1 1 1 1 1 0]


## Model Performance

In [58]:
print(clf.score(X_test, Y_test))

0.9814814814814815
