In [None]:
import pandas as pd

#importing dataset
#NOTE(review): absolute path (presumably the Colab working directory) - adjust when running elsewhere
X = pd.read_csv('/content/winequality.csv')
#show the dimensions and a short preview instead of echoing the whole frame
print(X.shape)
X.head()

In [None]:
#extracting labels
#pop() returns the 'quality' column and removes it from X in place, so from this
#point on X no longer contains the label; re-running this cell on the same kernel
#fails because the column is already gone
y = X.pop('quality')
#looking at the labels (distinct values, in order of first appearance)
y.unique()
#recorded output:
#array([5, 6, 7, 4, 8, 3])

## Looking at the data

All features follow approximately normal distributions. This means two things:

- For the scaling, I need to standardize the data.

- Because the distributions of the different quality classes are almost OVERLAPPING in every feature, it is very difficult for the model to distinguish between the categories.


In [None]:
#I am using a pairplot to look at different distributions
import seaborn as sns
sns.set_theme(style="ticks")

#'quality' was popped out of X when the labels were extracted, so hue="quality"
#would not find the column; re-attach it on a temporary copy (X itself stays untouched)
sns.pairplot(X.assign(quality=y), hue="quality")

In [None]:
#scaling data: standardization given that the data follows a normal distribution
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
#not transforming y because it has categorical data

##splitting the data

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=82)

## Testing several models

In [None]:
#Gaussian Naive Bayes baseline: usually a good match for normally distributed features
from sklearn.naive_bayes import GaussianNB

#fit() hands back the fitted estimator, so construction and training are chained
clf = GaussianNB().fit(X_train, y_train)

#mean accuracy on the training split, then on the held-out split
print('train', clf.score(X_train, y_train))
print('test', clf.score(X_test, y_test))
#recorded result:
#train 0.5683
#test 0.5375

In [None]:
# Evaluation of the Naive Bayes model on the held-out split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score

pred = clf.predict(X_test)

# average='macro' takes the unweighted mean of the per-class scores
print(f"Precision = {precision_score(y_test, pred, average='macro')}")
print(f"Recall = {recall_score(y_test, pred, average='macro')}")
print(f"Accuracy = {accuracy_score(y_test, pred)}")
print(f"F1 Score = {f1_score(y_test, pred, average='macro')}")

## Decision tree classifier

In [None]:
#Decision tree classifier
from sklearn import tree

#fixed random_state so the fitted tree (and therefore its scores) is reproducible
#from run to run; 82 is the same seed already used for the train/test split
clf = tree.DecisionTreeClassifier(random_state=82)
clf = clf.fit(X_train, y_train)
print('train', clf.score(X_train, y_train))
print('test', clf.score(X_test, y_test))
#result recorded from an earlier run without a fixed seed (numbers may differ slightly):
#train 1.0
#test 0.5833

In [None]:
# Evaluation of the decision tree on the held-out split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score

pred = clf.predict(X_test)

# precision / recall / F1 are macro-averaged (unweighted mean over the classes)
print(f"Precision = {precision_score(y_test, pred, average='macro')}")
print(f"Recall = {recall_score(y_test, pred, average='macro')}")
print(f"Accuracy = {accuracy_score(y_test, pred)}")
print(f"F1 Score = {f1_score(y_test, pred, average='macro')}")

## XGBoost

In [None]:
#xgboost: grid search over the main booster hyper-parameters
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  

#A wider grid used to be assigned to param_dict right before the dict below and was
#immediately overwritten by it, so it never reached GridSearchCV in this cell.
#It is kept here as a record only:
#    max_depth:        3, 5, 7, 9
#    min_child_weight: 1, 3, 5
#    learning_rate:    0.00001, 0.001, 0.01, 0.1, 1, 2
#    n_estimators:     10, 190, 200, 210, 500, 1000, 2000

#grid actually searched: a single combination
param_dict = {
    'max_depth': [5],
    'min_child_weight': [1],
    'learning_rate': [0.1],
    'n_estimators': [190]
}

#base estimator; the four parameters listed in param_dict are overridden by the search
#NOTE(review): newer xgboost releases reportedly require class labels encoded as
#0..n_classes-1, while y holds the raw quality grades - confirm against the installed version
xgc = XGBClassifier(booster='gbtree', learning_rate =0.01, n_estimators=200, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27)

#3-fold cross-validated search on the training split, using all available cores
clf = GridSearchCV(xgc,param_dict,cv=3, n_jobs = -1).fit(X_train,y_train)

print("Tuned: {}".format(clf.best_params_)) 
print("Mean of the cv scores is {:.6f}".format(clf.best_score_))
print("Train Score {:.6f}".format(clf.score(X_train,y_train)))
print("Test Score {:.6f}".format(clf.score(X_test,y_test)))
print("Seconds used for refitting the best model on the train dataset: {:.6f}".format(clf.refit_time_))

#recorded output:
#Tuned: {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 190}

In [None]:
#train a stand-alone XGBoost model with fixed hyper-parameters
#NOTE(review): min_child_weight is 0.1 here, whereas the grid-search cell reports 1
#as the tuned value - confirm which one is intended
clf = XGBClassifier(
    booster='gbtree',
    learning_rate=0.1,
    n_estimators=190,
    max_depth=5,
    min_child_weight=0.1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
).fit(X_train, y_train)

#accuracy on the training split and on the held-out split
print(f"Train Score {clf.score(X_train, y_train):.6f}")
print(f"Test Score {clf.score(X_test, y_test):.6f}")
#recorded output:
#Train Score 1.000
#Test Score 0.6645

In [None]:
# Evaluation of the XGBoost model on the held-out split
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score

pred = clf.predict(X_test)

# precision / recall / F1 use average='macro' (unweighted mean over the classes)
print(f"Precision = {precision_score(y_test, pred, average='macro')}")
print(f"Recall = {recall_score(y_test, pred, average='macro')}")
print(f"Accuracy = {accuracy_score(y_test, pred)}")
print(f"F1 Score = {f1_score(y_test, pred, average='macro')}")

# recorded output:
#Precision = 0.32051403130059364
#Recall = 0.3174093879976233
#Accuracy = 0.6645833333333333
#F1 Score = 0.3162862725534215

## Conclusions

XGBoost achieves a better accuracy than the Decision Tree Classifier, which averages an accuracy score of about .60: so far XGBoost is the best-performing algorithm for this project (test accuracy 0.66).

However, on the macro-averaged metrics (e.g. Precision) the Decision Tree Classifier performs better, with a score of about .37 compared with about .31–.32 for XGBoost.

In [None]:
#confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
#confusion_matrix is not imported by any of the visible cells above, so bring it in here
from sklearn.metrics import confusion_matrix

#confusion_matrix orders its rows/columns by sorted label value, whereas y.unique()
#returns the grades in order of first appearance; sort them so ticks match the matrix
class_labels = sorted(y.unique())

xgb_pred = clf.predict(X_test)
#labels= keeps one row/column per grade even if a grade is absent from y_test/xgb_pred
cm = confusion_matrix(y_test, xgb_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(12, 8))
#fmt='d' prints the counts as plain integers instead of the default '.2g' format
sns.heatmap(cm, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.tick_params(axis='both', labelsize=12)
ax.tick_params(axis='y', labelrotation=0)
ax.set_xlabel('Predicted Labels',fontsize = 15)
ax.set_ylabel('True Labels',fontsize = 15)
plt.show()