# Load the data and the required modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('data/wine_train.csv')
test = pd.read_csv('data/wine_test.csv')

# Data cleaning

In [3]:
train.columns

Index(['wine_ID', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'wine_type', 'target'],
      dtype='object')

In [4]:
# print the type of each column
train.dtypes


wine_ID                   int64
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
wine_type                 int64
target                    int64
dtype: object

In [6]:
#print the number of labels in each category
print(train['target'].value_counts())

6    1997
5    1475
7     761
4     151
8     135
3      23
9       5
Name: target, dtype: int64


In [7]:
#count the number of nan values in each column
train.isnull().sum()

wine_ID                 0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
wine_type               0
target                  0
dtype: int64

# Training a random forest

In [9]:
#split the data into a training set and a validation set
from sklearn.model_selection import train_test_split
X = train.drop('target', axis=1)
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [10]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3046, 13) (1501, 13) (3046,) (1501,)


In [11]:
#try a multiclass classification with a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))   

0.6402398401065956


In [12]:
#print the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[  0   0   4   4   1   0   0]
 [  1   4  30  19   0   0   0]
 [  0   1 322 155   1   0   0]
 [  0   1 104 515  31   0   0]
 [  0   0   6 141 109   0   0]
 [  0   0   0  23  17  11   0]
 [  0   0   0   1   0   0   0]]


In [13]:
#print the number of 0 1 and 2 y_pred
print(np.unique(y_pred, return_counts=True))

(array([3, 4, 5, 6, 7, 8], dtype=int64), array([  1,   6, 466, 858, 159,  11], dtype=int64))


# Submit the result

In [16]:
#export x_pred['obj_ID'] and y_pred to a csv file
y_pred = rfc.predict(test)
df = pd.DataFrame({'wine_ID':test['wine_ID'], 'label':rfc.predict(test)})
df.to_csv('submission.csv', index=False)