In [33]:
import pandas as pd
import matplotlib as plt 
import seaborn as sns 
import numpy as np

In [34]:
red_wine_with_dup= pd.read_csv('winequality-red.csv', sep=';')

In [35]:
red_wine_with_dup.shape

(1599, 12)

In [36]:
red_wine_with_dup.duplicated().sum()

240

In [37]:
red_wine= red_wine_with_dup.drop_duplicates().reset_index(drop=True)

In [38]:
red_wine.shape

(1359, 12)

## Adding label column


In [39]:
red_wine ['quality_label'] = red_wine['quality'].apply(
lambda value: 'low'
    if value <= 5 
    else 'medium' if value <= 7 
    else 'high')

red_wine['quality_label'] = pd.Categorical(red_wine['quality_label'],
categories=['low', 'medium', 'high'])

## Label counts 

In [40]:
# Getting the counts of each label
counts = red_wine['quality_label'].value_counts()

# Accessing each count individually
low_count = counts.get('low', 0)  # Gets the count for 'low', defaults to 0 if 'low' is not found
medium_count = counts.get('medium', 0)  # Gets the count for 'medium', defaults to 0 if 'medium' is not found
high_count = counts.get('high', 0)  # Gets the count for 'high', defaults to 0 if 'high' is not found

# Printing the counts
print(f"Low quality count: {low_count}")
print(f"Medium quality count: {medium_count}")
print(f"High quality count: {high_count}")

Low quality count: 640
Medium quality count: 702
High quality count: 17


## Adding Wine Type

In [41]:
#red_wine['wine_type'] = 'Red'
#white_wine['wine_type'] = 'White'

In [42]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Encoding Label Encoding 

In [43]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
red_wine['quality_label'] = label_encoder.fit_transform(red_wine['quality_label'])

# Unique species after label encoding
print(red_wine['quality_label'].unique())

[1 2 0]


In [44]:
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print(class_mapping)

{'high': 0, 'low': 1, 'medium': 2}


In [45]:
red_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,2
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,1


## Split

In [46]:
from sklearn.model_selection import train_test_split    

# Split data into features (X) and target (y)
# Ensure 'quality_label' is not included in the encoded columns if it's your target variable
X = red_wine.drop(['quality_label','quality'] , axis = 1)
y = red_wine['quality_label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=52)

## Building Model

### Logistic Regression with outliners 

In [47]:
from sklearn.linear_model import LogisticRegression

# Initialize and fit the logistic regression model
lr_model_red = LogisticRegression(random_state=42)

#fit model
lr_model_red.fit(X_train, y_train)

# Make predictions on the test set
pred=lr_model_red.predict(X_test)

# import accuracy_score  metric
from sklearn.metrics import  accuracy_score 

# Calculate and print the accuracy score
acc = accuracy_score(y_test, pred)

print(acc)

0.6911764705882353


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Random Forest with outliners 

In [48]:
# import Random Forest Classifier model
from  sklearn.ensemble import RandomForestClassifier

# Building model 
rf_model_red =  RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state=453)

# fit model 
rf_model_red.fit(X_train, y_train)

# Predict 
y_pred = rf_model_red.predict(X_test)

# Evaluate 
acc = accuracy_score(y_test, y_pred)
print(acc)

0.7279411764705882


### Confusion Matrix

In [49]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, pred)

array([[ 0,  0,  4],
       [ 0, 93, 36],
       [ 0, 44, 95]])

In [50]:

from sklearn.metrics import precision_recall_fscore_support

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Precision: 0.6815594039724568
Recall: 0.6911764705882353
F1-Score: 0.6859921648259766


  _warn_prf(average, modifier, msg_start, len(result))


## IQR - Outliners 

In [51]:
# Calculate Q1, Q3, and IQR 

Q1 = red_wine['alcohol'].quantile(0.25)
Q3 = red_wine['alcohol'].quantile(0.75)
IQR = Q3 - Q1

# Define thresholds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

#identify outliers
outliers = red_wine[(red_wine['alcohol'] < lower_bound) | (red_wine['alcohol'] > upper_bound)]

# Remove
red_wine_filtered = red_wine[(red_wine['alcohol'] >= lower_bound) & (red_wine['alcohol'] <= upper_bound)]

print(f"Original dataset size: {len(red_wine)}")
print(f"Filtered dataset size: {len(red_wine_filtered)}")
print(f"Number of outliers removed: {len(outliers)}")

Original dataset size: 1359
Filtered dataset size: 1347
Number of outliers removed: 12


## Model Logistic Regression without outliners (red_wine_filtered)

In [52]:
from sklearn.model_selection import train_test_split    

# Split data into features (X) and target (y)
# Ensure 'quality_label' is not included in the encoded columns if it's your target variable
X = red_wine_filtered.drop(['quality_label','quality'] , axis = 1)
y = red_wine_filtered['quality_label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=52)


from sklearn.linear_model import LogisticRegression

# Initialize and fit the logistic regression model
lr_model_red_filtered = LogisticRegression(random_state=42)

#fit model
lr_model_red_filtered.fit(X_train, y_train)

# Make predictions on the test set
pred=lr_model_red_filtered.predict(X_test)

# import accuracy_score  metric
from sklearn.metrics import  accuracy_score 

# Calculate and print the accuracy score
acc = accuracy_score(y_test, pred)

print(acc)

0.6777777777777778


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model Random forrest without outliners

In [53]:
# import Random Forest Classifier model
from  sklearn.ensemble import RandomForestClassifier

# Building model 
rf_model_red_filtered =  RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state=453)

# fit model 
rf_model_red_filtered.fit(X_train, y_train)

# Predict 
y_pred = rf_model_red_filtered.predict(X_test)

# Evaluate 
acc = accuracy_score(y_test, y_pred)
print(acc)

0.7111111111111111


### Confusion Matrix

In [54]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, pred)

array([[ 0,  0,  1],
       [ 0, 84, 37],
       [ 0, 49, 99]])

### Classification Report

In [55]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

#{'high': 0, 'low': 1, 'medium': 2}

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.63      0.69      0.66       121
           2       0.72      0.67      0.69       148

    accuracy                           0.68       270
   macro avg       0.45      0.45      0.45       270
weighted avg       0.68      0.68      0.68       270



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Precision, Recall, and F1-Score Calculation

In [56]:
from sklearn.metrics import precision_recall_fscore_support

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Precision: 0.6791479916335852
Recall: 0.6777777777777778
F1-Score: 0.6772316618317447


  _warn_prf(average, modifier, msg_start, len(result))


## Stratified Kfold with Accuracy per folder and average accuracy

In [57]:
print('Class Ratio:',sum(red_wine['quality_label'])/len(red_wine['quality_label']))

Class Ratio: 1.504047093451067


In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Assuming 'red_wine' is your DataFrame and 'quality_label' is the column for stratification
target = red_wine['quality_label']

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store accuracies for later averaging
accuracies = []

# Enumerate over folds, starting from 1
for fold_no, (train_index, test_index) in enumerate(skf.split(red_wine, target), start=1):
    # Create training and testing sets
    X_train = red_wine.iloc[train_index].drop(['quality_label', 'quality'], axis=1)
    y_train = red_wine.iloc[train_index]['quality_label']
    X_test = red_wine.iloc[test_index].drop(['quality_label', 'quality'], axis=1)
    y_test = red_wine.iloc[test_index]['quality_label']
    
    # Initialize and train the logistic regression model
    lr_model_red = LogisticRegression(max_iter=1000)
    lr_model_red.fit(X_train, y_train)
    
    # Predict and calculate accuracy
    pred = lr_model_red.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    accuracies.append(accuracy)
    
    # Print accuracy for the current fold
    print(f'Fold {fold_no} Accuracy: {accuracy:.4f}')

# Print average accuracy after all folds
average_accuracy = sum(accuracies) / len(accuracies)
print(f'\nAverage Accuracy: {average_accuracy:.4f}')

Fold 1 Accuracy: 0.7426


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2 Accuracy: 0.7316


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3 Accuracy: 0.7169


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4 Accuracy: 0.7243
Fold 5 Accuracy: 0.7269

Average Accuracy: 0.7285


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Balanced accuracy

In [59]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Assuming 'red_wine' is your DataFrame and 'quality_label' is the column for stratification
target = red_wine['quality_label']

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store balanced accuracies for later averaging
balanced_accuracies = []

# Enumerate over folds, starting from 1
for fold_no, (train_index, test_index) in enumerate(skf.split(red_wine, target), start=1):
    # Create training and testing sets
    X_train = red_wine.iloc[train_index].drop(['quality_label', 'quality'], axis=1)
    y_train = red_wine.iloc[train_index]['quality_label']
    X_test = red_wine.iloc[test_index].drop(['quality_label', 'quality'], axis=1)
    y_test = red_wine.iloc[test_index]['quality_label']
    
    # Initialize and train the logistic regression model
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    
    # Predict and calculate balanced accuracy
    pred = lr_model.predict(X_test)
    balanced_accuracy = balanced_accuracy_score(y_test, pred)
    balanced_accuracies.append(balanced_accuracy)
    
    # Print balanced accuracy for the current fold
    print(f'Fold {fold_no} Balanced Accuracy: {balanced_accuracy:.4f}')

# Print average balanced accuracy after all folds
average_balanced_accuracy = sum(balanced_accuracies) / len(balanced_accuracies)
print(f'\nAverage Balanced Accuracy: {average_balanced_accuracy:.4f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1 Balanced Accuracy: 0.5013


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2 Balanced Accuracy: 0.4952


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3 Balanced Accuracy: 0.4862


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4 Balanced Accuracy: 0.4891
Fold 5 Balanced Accuracy: 0.4896

Average Balanced Accuracy: 0.4923


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Balanced accuracy with Random Forest

In [60]:
from sklearn.model_selection import StratifiedKFold
from  sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Assuming 'red_wine' is your DataFrame and 'quality_label' is the column for stratification
target = red_wine['quality_label']

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store balanced accuracies for later averaging
balanced_accuracies = []

# Enumerate over folds, starting from 1
for fold_no, (train_index, test_index) in enumerate(skf.split(red_wine, target), start=1):
    # Create training and testing sets
    X_train = red_wine.iloc[train_index].drop(['quality_label', 'quality'], axis=1)
    y_train = red_wine.iloc[train_index]['quality_label']
    X_test = red_wine.iloc[test_index].drop(['quality_label', 'quality'], axis=1)
    y_test = red_wine.iloc[test_index]['quality_label']
    
    # Initialize and train the logistic regression model
    rf_model = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state=453)
    rf_model.fit(X_train, y_train)
    
    # Predict and calculate balanced accuracy
    pred = rf_model.predict(X_test)
    balanced_accuracy = balanced_accuracy_score(y_test, pred)
    balanced_accuracies.append(balanced_accuracy)
    
    # Print balanced accuracy for the current fold
    print(f'Fold {fold_no} Balanced Accuracy: {balanced_accuracy:.4f}')

# Print average balanced accuracy after all folds
average_balanced_accuracy = sum(balanced_accuracies) / len(balanced_accuracies)
print(f'\nAverage Balanced Accuracy: {average_balanced_accuracy:.4f}')

Fold 1 Balanced Accuracy: 0.5186
Fold 2 Balanced Accuracy: 0.5039
Fold 3 Balanced Accuracy: 0.4705
Fold 4 Balanced Accuracy: 0.5082
Fold 5 Balanced Accuracy: 0.5182

Average Balanced Accuracy: 0.5039


## Balanced accuracy with Gradient Boosting Machines 

In [61]:
from sklearn.model_selection import StratifiedKFold
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Assuming 'red_wine' is your DataFrame and 'quality_label' is the column for stratification
target = red_wine['quality_label']

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store balanced accuracies for later averaging
balanced_accuracies = []

# Enumerate over folds, starting from 1
for fold_no, (train_index, test_index) in enumerate(skf.split(red_wine, target), start=1):
    # Create training and testing sets
    X_train = red_wine.iloc[train_index].drop(['quality_label', 'quality'], axis=1)
    y_train = red_wine.iloc[train_index]['quality_label']
    X_test = red_wine.iloc[test_index].drop(['quality_label', 'quality'], axis=1)
    y_test = red_wine.iloc[test_index]['quality_label']
    
    # Initialize and train the logistic regression model
    gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=453)
    gbm_model.fit(X_train, y_train)
    
    # Predict and calculate balanced accuracy
    pred = gbm_model.predict(X_test)
    balanced_accuracy = balanced_accuracy_score(y_test, pred)
    balanced_accuracies.append(balanced_accuracy)
    
    # Print balanced accuracy for the current fold
    print(f'Fold {fold_no} Balanced Accuracy: {balanced_accuracy:.4f}')

# Print average balanced accuracy after all folds
average_balanced_accuracy = sum(balanced_accuracies) / len(balanced_accuracies)
print(f'\nAverage Balanced Accuracy: {average_balanced_accuracy:.4f}')

Fold 1 Balanced Accuracy: 0.4985
Fold 2 Balanced Accuracy: 0.4689
Fold 3 Balanced Accuracy: 0.4879
Fold 4 Balanced Accuracy: 0.4911
Fold 5 Balanced Accuracy: 0.5115

Average Balanced Accuracy: 0.4916


In [62]:
print(classification_report(y_test, pred, target_names=["high","low","medium"]))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         3
         low       0.78      0.73      0.76       128
      medium       0.75      0.80      0.78       140

    accuracy                           0.76       271
   macro avg       0.51      0.51      0.51       271
weighted avg       0.76      0.76      0.76       271



## Random oversampling

In [63]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Assuming 'red_wine' is your DataFrame and 'quality_label' is the target column
X = red_wine.drop(['quality_label', 'quality'], axis=1)
y = red_wine['quality_label']

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Apply RandomOverSampler to your training data to balance it
X_ros, y_ros = ros.fit_resample(X_train, y_train)

# Initialize your model - here, we use GradientBoostingClassifier as an example
model = GradientBoostingClassifier(n_estimators=100, random_state=453)

# Fit the model on the oversampled training data
model.fit(X_ros, y_ros)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
balanced_accuracy = balanced_accuracy_score(y_test, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_accuracy:.4f}")

Accuracy: 0.6728
Balanced Accuracy: 0.4550


In [64]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, pred)

ValueError: Found input variables with inconsistent numbers of samples: [272, 271]

In [None]:
print(classification_report(y_test, pred, target_names=["high","low","medium"]))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         3
         low       0.69      0.82      0.75       128
      medium       0.77      0.62      0.69       141

    accuracy                           0.71       272
   macro avg       0.49      0.48      0.48       272
weighted avg       0.72      0.71      0.71       272

