In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

In [2]:
# read the data
red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')

In [3]:
# remove lines that have all values duplicated
red_wine.drop_duplicates(inplace=True)
white_wine.drop_duplicates(inplace=True)

In [4]:
# create a df with all wines for exercises later

# add color of wine as parameter
red_wine['color'] = 'red'
white_wine['color'] = 'white'

# combine the wine dfs
wine = pd.concat([red_wine, white_wine])

# here we are transforming these labels into categrical data type (specific to pandas) instead of simple string
wine['color'] = pd.Categorical(wine['color'],
categories=['red', 'white'])

In [5]:
# use the rename method to change all columns names lowercase and add an underscore if they are made of 2 words
wine.rename(str.lower, axis='columns', inplace=True)  # make the names lowercase
wine.columns = wine.columns.str.replace(' ', '_')       # replace space with underscore in column names

In [6]:
wine

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
5,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [None]:
wine['color']

# fit and transform vs fit_transform

# 

In [16]:
# divide the data into the features (X) and target variable (y)
X = wine.drop('color', axis=1)
y = wine['color']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit and transform the target variable in the training set
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))

# Transform the target variable in the testing set
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))


AttributeError: 'csr_matrix' object has no attribute 'head'

In [8]:
# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing data
X_test_norm = norm.transform(X_test)

In [9]:
# Example
# Fit the model
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Make predictions
pred_lr = lr.predict(X_test)

In [11]:
pred_lr

array(['white', 'white', 'white', ..., 'white', 'red', 'white'],
      dtype=object)

In [12]:
print("Confusion matrix:")
print(confusion_matrix(y_test, pred_lr))

Confusion matrix:
[[279  10]
 [  7 768]]


In [13]:
print("Accuracy score:", accuracy_score(y_test, pred_lr))

Accuracy score: 0.9840225563909775


In [14]:
print(classification_report(y_test, pred_lr, target_names=["red","white"]))

              precision    recall  f1-score   support

         red       0.98      0.97      0.97       289
       white       0.99      0.99      0.99       775

    accuracy                           0.98      1064
   macro avg       0.98      0.98      0.98      1064
weighted avg       0.98      0.98      0.98      1064

