In [192]:
# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [193]:
# read the data
red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')

In [194]:
# remove lines that have all values duplicated
red_wine.drop_duplicates(inplace=True)
white_wine.drop_duplicates(inplace=True)

In [195]:
# create a df with all wines

# add color of wine as parameter
red_wine['color'] = 'red'
white_wine['color'] = 'white'

# combine the wine dfs
wine = pd.concat([red_wine, white_wine], ignore_index=True)

# here we are transforming these labels into categrical data type (specific to pandas) instead of simple string
wine['color'] = pd.Categorical(wine['color'],
categories=['red', 'white'])

In [196]:
# use the rename method to change all columns names lowercase and add an underscore if they are made of 2 words
wine.rename(str.lower, axis='columns', inplace=True)  # make the names lowercase
wine.columns = wine.columns.str.replace(' ', '_')       # replace space with underscore in column names

In [197]:
wine

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [198]:
# alternative to OneHotEncoding
# the colors get transformed to binary values, so they can be used as target variable
wine['color_binary'] = np.where(wine['color'].str.contains('red'), 1,0)

In [199]:
# split the data into features (X) and target variable (y)
X = wine.drop(['color', 'color_binary'], axis=1) # features
y = wine['color_binary']  # target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [200]:
# initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# train the model
model.fit(X_train, y_train)

# make predictions on the test set
y_pred = model.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9868
Confusion Matrix:
 [[770   5]
 [  9 280]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       775
           1       0.98      0.97      0.98       289

    accuracy                           0.99      1064
   macro avg       0.99      0.98      0.98      1064
weighted avg       0.99      0.99      0.99      1064



In [201]:
# initialize the model
forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
forest.fit(X_train, y_train.ravel())

# make predictions on the test set
y_pred = forest.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9972
Confusion Matrix:
 [[775   0]
 [  3 286]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       775
           1       1.00      0.99      0.99       289

    accuracy                           1.00      1064
   macro avg       1.00      0.99      1.00      1064
weighted avg       1.00      1.00      1.00      1064



In [202]:
models = [LogisticRegression, LinearRegression, DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier, SVC, GaussianNB]

In [203]:
def evaluate_model(model_name):
# just runs a model and outputs it's evaluation metrics

    try: # if there is no errors
        model = model_name()
        model.fit(X_train, y_train.ravel())

        # make predictions on the test set
        y_pred = model.predict(X_test)

        # evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred)
    except: # if there are errors, return 0 for evaluation
        accuracy = 0
        conf_matrix = 0
        classification_rep = 0
    
    return (accuracy, conf_matrix, classification_rep)

In [204]:
# check metrics for multiple models
evaluation_results = []

for model in models:
    accuracy, conf_matrix, classification_rep = evaluate_model(model)

    # make the model name more readabkle in output
    evaluation_results.append([str(model).split('.')[-1].strip("'>"), accuracy])


# convert evaluation metrics to df
evaluation_results = pd.DataFrame(evaluation_results, columns=['model', 'accuracy'])

# show the evaluation metrics ordered by accuracy
evaluation_results.sort_values('accuracy', ascending=False)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,accuracy
3,RandomForestClassifier,0.996241
0,LogisticRegression,0.984023
2,DecisionTreeClassifier,0.979323
6,GaussianNB,0.975564
4,KNeighborsClassifier,0.933271
5,SVC,0.928571
1,LinearRegression,0.0


In [205]:
# investigate why LinearRegression model does not work

# initialize the model
model = LinearRegression()
model.fit(X_train, y_train.ravel())

# Make predictions on the test set
y_pred = model.predict(X_test)

# evaluate the model
accuracy = model.score(X_test, y_test)
#accuracy = accuracy_score(y_test, y_pred)
#conf_matrix = confusion_matrix(y_test, y_pred)
#classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(model)
print(f"Accuracy: {accuracy:.2f}")
#print("Confusion Matrix:\n", conf_matrix)
#print("Classification Report:\n", classification_rep)

LinearRegression()
Accuracy: 0.88


In [206]:
# it seems that for the results of LinearRegression, the sklearn.metrics (accuracy_score, classification_report, confusion_matrix) don't work