# Imports

In [25]:
import pandas as pd
import numpy as np
import csv

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# Preprocessing

In [26]:
data = pd.read_csv("data/train.csv", sep = ';').to_numpy()
x_test = pd.read_csv("data/test_public.csv", sep = ';').to_numpy()

#Deleting the first column (id) since it's not relevant for the prediction
data = np.delete(data, 0, 1)
x_test = np.delete(x_test, 0, 1)

for i in range(len(data)):
    if data[i][0]=='white':
        data[i][0]=0
    elif data[i][0]=='red':
        data[i][0]=1
        
for i in range(len(x_test)):
    if x_test[i][0]=='white':
        x_test[i][0]=0
    elif x_test[i][0]=='red':
        x_test[i][0]=1
        
train = data[:3600]
valid = data[3600:]

x_train = train.T[:12].T
y_train = train[:,12]

x_valid = valid.T[:12].T
y_valid = valid[:,12]

y = []
for i in y_train:
    y.append(int(i))
y_train = y

y = []
for i in y_valid:
    y.append(int(i))
y_valid = y

# Feature selection

In [27]:
classifier = DecisionTreeClassifier()

selector = RFE(classifier, n_features_to_select=10, step=1)
selector = selector.fit(x_train, y_train)
selector.ranking_

array([3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [28]:
#Deleting white/red wine column
x_train = np.delete(x_train, 0, 1)
x_valid = np.delete(x_valid, 0, 1)
x_test = np.delete(x_test, 0, 1)

#Deleting fixed acidity column
x_train = np.delete(x_train, 0, 1)
x_valid = np.delete(x_valid, 0, 1)
x_test = np.delete(x_test, 0, 1)

In [29]:
classifier = DecisionTreeClassifier()

selector = RFE(classifier, n_features_to_select=10, step=1)
selector = selector.fit(x_train, y_train)
selector.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Model

In [32]:
classifier = BernoulliNB()
classifier = classifier.fit(x_train, y_train)
predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)

0.45195353748680045

In [33]:
classifier = LogisticRegression()
classifier = classifier.fit(x_train, y_train)
predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.4857444561774023

In [34]:
classifier = LinearSVC()
classifier = classifier.fit(x_train, y_train)
predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)



0.470960929250264

In [35]:
classifier = KNeighborsClassifier()
classifier = classifier.fit(x_train, y_train)
predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)

0.4804646251319958

In [36]:
classifier = DecisionTreeClassifier()
classifier = classifier.fit(x_train, y_train)
predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)

0.5913410770855333

In [37]:
classifier = RandomForestClassifier()
classifier = classifier.fit(x_train, y_train)
predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)

0.6779303062302007

In [30]:
params = [{'max_depth': np.arange(1,10,1),
           'max_leaf_nodes': [5,10,15], 
           'min_samples_split': [2,5,10]}]

classifier = GridSearchCV(DecisionTreeClassifier(), params, refit=True)
classifier = classifier.fit(x_train, y_train)

print(classifier.best_params_)

predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)



{'max_depth': 3, 'max_leaf_nodes': 10, 'min_samples_split': 2}


0.5586061246040127

In [14]:
params = [{'max_depth': [17,18],
           'n_estimators': [400,1000]}]

classifier = GridSearchCV(RandomForestClassifier(criterion='entropy'), params, refit=True)
classifier = classifier.fit(x_train, y_train)

print(classifier.best_params_)

predictions_valid = classifier.predict(x_valid)
accuracy_score(y_valid, predictions_valid)



{'max_depth': 17, 'n_estimators': 1000}


0.6789862724392819

# Prediction

In [15]:
predictions = classifier.predict(x_test)


with open('submission.csv', 'w', newline = '') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'quality'])
    for i in range(len(predictions)):
        writer.writerow([i,predictions[i]])