In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import boto3
from io import StringIO

In [48]:
# Load data from p4-s3-laxman/data object
s3 = boto3.client('s3')
s3_object = s3.get_object(Bucket='p4-s3-laxman', Key='data/winequality-white.csv')
s3_data = s3_object['Body'].read().decode('utf-8')

df = pd.read_csv(StringIO(s3_data), sep=";")

In [49]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [50]:
#Check for null values
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [51]:
# Categorize 0-4 to low quality, 5-7 to Average quality and 8-10 to High quality   
df['quality'] = pd.cut(df['quality'], bins=[0, 4, 7, 10], labels=['Low Quality', 'Average Quality', 'High Quality'])

In [52]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,Average Quality
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,Average Quality
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,Average Quality
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,Average Quality
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,Average Quality


In [53]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2


In [54]:
# Split data into train, test and validation
train, test = train_test_split(df, test_size=0.2, random_state=5914)
test, val = train_test_split(test, test_size=0.5, random_state=5914)
X_train = train.drop('quality', axis=1)
y_train = train['quality']
X_test = test.drop('quality', axis=1)
y_test = test['quality']
X_val = val.drop('quality', axis=1)
y_val = val['quality']

In [55]:
# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [56]:
# Finding best parameters for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

print("Best parameters: ", best_params)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   2.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; t

In [62]:
# Random Forest Model Prediction and metrics
rf = RandomForestClassifier(bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=5914)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

class_labels = rf.classes_

cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_labels)
df_cm_rf = pd.DataFrame(cm_rf, 
                        index=['Actual ' + cls for cls in class_labels], 
                        columns=['Predicted ' + cls for cls in class_labels])
print('Confusion matrix for RandomForestClassifier:')
print(df_cm_rf)

print('RandomForestClassifier Test metrics:')
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Precision:', precision_score(y_test, y_pred_rf, average='weighted'))
print('Recall:', recall_score(y_test, y_pred_rf, average='weighted'))

y_pred_val_rf = rf.predict(X_val)

print('Validation metrics:')
print('Accuracy:', accuracy_score(y_val, y_pred_val_rf))
print('Precision:', precision_score(y_val, y_pred_val_rf, average='weighted'))
print('Recall:', recall_score(y_val, y_pred_val_rf, average='weighted'))

Confusion matrix for RandomForestClassifier:
                        Predicted Average Quality  Predicted High Quality  \
Actual Average Quality                        468                       0   
Actual High Quality                             8                       1   
Actual Low Quality                             10                       0   

                        Predicted Low Quality  
Actual Average Quality                      1  
Actual High Quality                         0  
Actual Low Quality                          2  
RandomForestClassifier Test metrics:
Accuracy: 0.9612244897959183
Precision: 0.956386999244142
Recall: 0.9612244897959183
Validation metrics:
Accuracy: 0.9285714285714286
Precision: 0.925728862973761
Recall: 0.9285714285714286


In [58]:
# Finding best parameters for Support vector
param_grid = {
    'C': [1, 10, 100],
    'kernel': ['linear', 'poly'],
    'degree': [1, 2, 3]
}

svc = SVC(random_state=5914)

grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

print("Best parameters: ", best_params)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   2.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   2.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   2.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time= 

In [63]:
# Support vector Model Prediction and metrics
svc = SVC(C=10, degree=2, kernel='poly', random_state=5914)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

class_labels = svc.classes_

cm_svc = confusion_matrix(y_test, y_pred_svc, labels=class_labels)
df_cm_svc = pd.DataFrame(cm_svc, 
                        index=['Actual ' + cls for cls in class_labels], 
                        columns=['Predicted ' + cls for cls in class_labels])
print('\nConfusion matrix for SVC:')
print(df_cm_svc)

print('SVC Test metrics:')
print('Accuracy:', accuracy_score(y_test, y_pred_svc))
print('Precision:', precision_score(y_test, y_pred_svc, average='weighted', zero_division=0))
print('Recall:', recall_score(y_test, y_pred_svc, average='weighted'))

y_pred_val_svc = svc.predict(X_val)

print('Validation metrics:')
print('Accuracy:', accuracy_score(y_val, y_pred_val_svc))
print('Precision:', precision_score(y_val, y_pred_val_svc, average='weighted', zero_division=0))
print('Recall:', recall_score(y_val, y_pred_val_svc, average='weighted'))


Confusion matrix for SVC:
                        Predicted Average Quality  Predicted High Quality  \
Actual Average Quality                        469                       0   
Actual High Quality                             9                       0   
Actual Low Quality                             12                       0   

                        Predicted Low Quality  
Actual Average Quality                      0  
Actual High Quality                         0  
Actual Low Quality                          0  
SVC Test metrics:
Accuracy: 0.9571428571428572
Precision: 0.9161224489795919
Recall: 0.9571428571428572
Validation metrics:
Accuracy: 0.9040816326530612
Precision: 0.8173635985006247
Recall: 0.9040816326530612


In [60]:
metrics = {
    'Algorithm': ['RandomForestClassifier', 'SVC'],
    'Test Accuracy': [accuracy_score(y_test, y_pred_rf), accuracy_score(y_test, y_pred_svc)],
    'Test Precision': [precision_score(y_test, y_pred_rf, average='weighted'), precision_score(y_test, y_pred_svc, average='weighted', zero_division=0)],
    'Test Recall': [recall_score(y_test, y_pred_rf, average='weighted'), recall_score(y_test, y_pred_svc, average='weighted')],
    'Validation Accuracy': [accuracy_score(y_val, y_pred_val_rf), accuracy_score(y_val, y_pred_val_svc)],
    'Validation Precision': [precision_score(y_val, y_pred_val_rf, average='weighted'), precision_score(y_val, y_pred_val_svc, average='weighted', zero_division=0)],
    'Validation Recall': [recall_score(y_val, y_pred_val_rf, average='weighted'), recall_score(y_val, y_pred_val_svc, average='weighted')],
}
df_metrics = pd.DataFrame(metrics)

In [64]:
# Store output to p4-s3-laxman/output object
metrics_str = df_metrics.to_string(index=False)

cm_rf_str = "Confusion Matrix for RandomForestClassifier:\n\n" + df_cm_rf.to_string()
cm_svc_str = "Confusion Matrix for SVC:\n\n" + df_cm_svc.to_string()

metrics_str += "\n" + cm_rf_str + "\n" + cm_svc_str

s3 = boto3.resource('s3')
s3_object = s3.Object('p4-s3-laxman', 'output/output-metrics.txt')
s3_object.put(Body=metrics_str)

{'ResponseMetadata': {'RequestId': '7GGAHJ9VXFPAHZ77',
  'HostId': 'yFk7rAYGms8nKXkQuqzTVcbBoOT8xtnl5pyZ81gfATew7xe9eitI2Mi4OKjf3d0r7PDeRecP0kU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'yFk7rAYGms8nKXkQuqzTVcbBoOT8xtnl5pyZ81gfATew7xe9eitI2Mi4OKjf3d0r7PDeRecP0kU=',
   'x-amz-request-id': '7GGAHJ9VXFPAHZ77',
   'date': 'Sun, 21 Apr 2024 16:16:46 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"c9cd10fb7089bdefe1364e871d23366b"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"c9cd10fb7089bdefe1364e871d23366b"',
 'ServerSideEncryption': 'AES256'}

[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   2.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   2.1s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   2.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   1.0s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   2.0s
[CV] END .......................C=1, degree=1, kernel=linear; total time=   0.2s
[CV] END .........................C=1, degree=1, kernel=poly; total time=   0.2s
[CV] END .