In [25]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'Chapter2/wine.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
wine = pd.read_csv(file_content_stream)
wine.head()

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [26]:
wine['Wine'].value_counts()

2    71
1    59
3    48
Name: Wine, dtype: int64

## Data Splitting

In [27]:
## Defining input and target variables
X = wine.drop(columns = 'Wine', axis = 1)
Y = wine['Wine']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Standardizing the data 
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## One-vs-all Classifier

In [28]:
## Building the multi-classifier (using RF) 
one_vs_all_RF = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predicting on the test
one_vs_all_RF_pred = one_vs_all_RF.predict_proba(X_test)
one_vs_all_RF_pred = np.argmax(one_vs_all_RF_pred, axis = 1) + 1
one_vs_all_RF_pred

array([2, 1, 3, 1, 2, 3, 3, 1, 1, 1, 3, 1, 3, 3, 2, 1, 3, 2, 1, 2, 2, 2,
       3, 1, 2, 2, 3, 1, 2, 3, 2, 1, 1, 2, 1, 1])

In [29]:
## Creating the confusion matrix
confusion_matrix(Y_test, one_vs_all_RF_pred)

array([[12,  0,  0],
       [ 2, 12,  0],
       [ 0,  0, 10]])

In [30]:
## Creating the classification report 
print(classification_report(Y_test, one_vs_all_RF_pred))

              precision    recall  f1-score   support

           1       0.86      1.00      0.92        12
           2       1.00      0.86      0.92        14
           3       1.00      1.00      1.00        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.95      0.94      0.94        36



In [31]:
## Building the multi-classifier (using SVM) 
one_vs_all_svm = OneVsRestClassifier(estimator = SVC(kernel = 'rbf', probability = True)).fit(X_train, Y_train)

## Predicting on the test
one_vs_all_svm_pred = one_vs_all_svm.predict_proba(X_test)
one_vs_all_svm_pred = np.argmax(one_vs_all_svm_pred, axis = 1) + 1
one_vs_all_svm_pred

array([2, 2, 3, 1, 2, 3, 3, 1, 1, 1, 3, 1, 3, 3, 2, 1, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 1, 2, 3, 2, 1, 2, 2, 1, 1])

In [32]:
## Creating the confusion matrix
confusion_matrix(Y_test, one_vs_all_svm_pred)

array([[10,  2,  0],
       [ 0, 14,  0],
       [ 0,  1,  9]])

In [33]:
## Creating the classification report 
print(classification_report(Y_test, one_vs_all_svm_pred))

              precision    recall  f1-score   support

           1       1.00      0.83      0.91        12
           2       0.82      1.00      0.90        14
           3       1.00      0.90      0.95        10

    accuracy                           0.92        36
   macro avg       0.94      0.91      0.92        36
weighted avg       0.93      0.92      0.92        36



## One-vs-One Classifier

In [15]:
## Building the multi-classifier (using RF) 
one_vs_one_RF = OneVsOneClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predicting on the test
one_vs_one_RF_pred = one_vs_one_RF.predict(X_test)
one_vs_one_RF_pred

array([3, 1, 3, 3, 3, 2, 3, 1, 1, 1, 2, 3, 2, 2, 1, 1, 2, 2, 3, 2, 1, 1,
       2, 1, 1, 3, 3, 3, 2, 2, 3, 1, 2, 1, 1, 3])

In [34]:
## Creating the confusion matrix
confusion_matrix(Y_test, one_vs_one_RF_pred)

array([[7, 1, 4],
       [6, 3, 5],
       [0, 7, 3]])

In [35]:
## Creating the classification report 
print(classification_report(Y_test, one_vs_one_RF_pred))

              precision    recall  f1-score   support

           1       0.54      0.58      0.56        12
           2       0.27      0.21      0.24        14
           3       0.25      0.30      0.27        10

    accuracy                           0.36        36
   macro avg       0.35      0.37      0.36        36
weighted avg       0.35      0.36      0.36        36



In [14]:
## Building the multi-classifier (using SVM) 
one_vs_one_svm = OneVsOneClassifier(estimator = SVC(kernel = 'rbf', probability = True)).fit(X_train, Y_train)

## Predicting on the test
one_vs_one_svm_pred = one_vs_one_svm.predict(X_test)
one_vs_one_svm_pred

array([3, 1, 3, 3, 3, 2, 3, 1, 1, 1, 2, 3, 2, 2, 1, 1, 2, 2, 3, 2, 1, 1,
       2, 1, 1, 3, 2, 3, 2, 2, 3, 1, 2, 1, 1, 2])

In [36]:
## Creating the confusion matrix
confusion_matrix(Y_test, one_vs_one_svm_pred)

array([[7, 2, 3],
       [6, 3, 5],
       [0, 8, 2]])

In [37]:
## Creating the classification report 
print(classification_report(Y_test, one_vs_one_svm_pred))

              precision    recall  f1-score   support

           1       0.54      0.58      0.56        12
           2       0.23      0.21      0.22        14
           3       0.20      0.20      0.20        10

    accuracy                           0.33        36
   macro avg       0.32      0.33      0.33        36
weighted avg       0.32      0.33      0.33        36

