In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# SVM Library
from sklearn import svm

# Libraries to scale the data
from sklearn.preprocessing import StandardScaler

# Cross Validation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

from sklearn.decomposition import PCA

# Bayesian Optimization libraries
from functools import partial
from skopt import gp_minimize
from skopt import space
from sklearn import model_selection

# Performance Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Suppress DeprecationWarning messages only (other warning categories,
# including RuntimeWarning, are still shown).
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline

### Analysis on the K-means-imputed dataset with outliers handled

In [None]:
# Load the pre-processed dataset. Per its file name it is the variant with
# outliers handled and missing values imputed via K-means.
df1 = pd.read_csv(
    filepath_or_buffer="../processed-datasets/data_outliers_handled_kmeans_imputed.csv"
)

In [None]:
# Preview the first five rows to sanity-check the load.
df1.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' column. Re-binding the name and passing
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
df1 = df1.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Show five random rows; the fixed seed makes the displayed sample identical
# on every Restart & Run All.
df1.sample(5, random_state=42)

In [None]:
# Split the frame into the feature matrix (every column except 'Feature')
# and the target vector ('Feature' cast to int).
y = df1['Feature'].to_numpy().astype(int)
X = df1.drop(columns='Feature').to_numpy()

### Splitting the dataset

In [None]:
# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so the split (and every metric computed
#   from it further down) is reproducible across kernel restarts.
# - stratify=y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

### Scaling the data using Standard Scaler

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Display the dimensions of the scaled training matrix.
X_train_scaled.shape

### Base model on the K-means-imputed dataset without outliers

In [None]:
# Baseline classifier: an SVC built with no arguments, i.e. the library's
# default hyper-parameters.
svm_base = svm.SVC()

In [None]:
# Train the baseline model on the scaled training split.
svm_base.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict labels for the scaled held-out split with the baseline model.
y_pred = svm_base.predict(X=X_test_scaled)

In [None]:
# Test-set accuracy of the baseline model, reported to two decimals.
svm_base_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of base SVM model on test data: ", round(svm_base_test_accuracy, 2))

In [None]:
# Per-class precision / recall / F1 for the baseline model on the test split.
svm_base_test_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report of SVM base model over test data: \n\n", svm_base_test_report)

In [None]:
# Confusion matrix of the baseline predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# NOTE(review): the display labels assume the first class is 'Barren' and the
# second is 'Fertile' - confirm against the encoding of the target column.
svm_base_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Barren', 'Fertile'],
)
svm_base_cm_display.plot()

### Hyper-Parameter Optimization - Grid Search and Stratified CV

In [None]:
# 10-fold stratified cross-validation splitter used by the grid search.
# With shuffle=True the fold assignment is random; pinning random_state makes
# the folds (and therefore the CV scores and selected parameters) reproducible.
Stratified_kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Hyper-parameter search space for the SVC grid search.
#
# Both ranges start at 1 instead of 0:
# - C must be strictly positive for SVC; C=0 makes every fit for that
#   candidate fail, so 1/6 of the original grid produced only failed fits.
# - degree=0 turns the polynomial kernel into a constant, which cannot
#   separate anything and only wastes fits.
#
# Note: 'degree' only affects the 'poly' kernel and 'gamma' has no effect on
# the 'linear' kernel, so some combinations are redundant. The single-dict
# layout is kept so that best_params_ always contains all four keys.
param_grid = {
    'C': np.arange(1, 6, 1),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': np.arange(1, 3, 1),
    'gamma': ['scale', 'auto']
}

In [None]:
# Exhaustive grid search over param_grid, scored by accuracy on the
# stratified folds. n_jobs=-1 runs candidates in parallel on all cores, and
# train scores are recorded alongside the validation scores.
clf = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=Stratified_kf,
    verbose=10,
    return_train_score=True,
)

In [None]:
# Run the grid search on the scaled training split.
clf.fit(X=X_train_scaled, y=y_train)

In [None]:
# Display the estimator the grid search selected as best.
clf.best_estimator_

In [None]:
# Keep the winning hyper-parameter combination; it is reused below to build
# the tuned model.
best_params = clf.best_params_

In [None]:
# Display the selected hyper-parameters.
best_params

In [None]:
# Display the cross-validated score (accuracy, per the `scoring` argument)
# of the best parameter combination.
clf.best_score_

In [None]:
# Build a fresh, not-yet-fitted SVC configured with the tuned hyper-parameters.
best_model_svc = svm.SVC(**best_params)

In [None]:
# Train the tuned model on the scaled training split.
best_model_svc.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict labels for the scaled held-out split with the tuned model.
y_pred_best = best_model_svc.predict(X=X_test_scaled)

In [None]:
# Test-set accuracy of the tuned model, reported to two decimals.
svm_tuned_test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print("Accuracy of SVC tuned model over test set: ", round(svm_tuned_test_accuracy, 2))

In [None]:
# Per-class precision / recall / F1 for the tuned model on the test split.
svm_tuned_test_report = classification_report(y_true=y_test, y_pred=y_pred_best)
print("Classification Report of SVM tuned model over test data: \n\n", svm_tuned_test_report)

In [None]:
# Confusion matrix of the tuned model's predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_best)

# NOTE(review): the display labels assume the first class is 'Barren' and the
# second is 'Fertile' - confirm against the encoding of the target column.
svm_tuned_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Barren', 'Fertile'],
)
svm_tuned_cm_display.plot()