# In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# -------- Load the dataset with pandas --------

# Read the raw CSV, then discard the two columns that are not used
# downstream ('Unnamed: 32' and 'id') in a single call.
df = pd.read_csv('Datasets/BreastCancerDataset.csv')
df = df.drop(columns=['Unnamed: 32', 'id'])

# -------- Plot the proportion of the labels --------

print(df['diagnosis'].value_counts())
sns.countplot(x=df['diagnosis'])
plt.title('Count of benign vs. malignant samples')
plt.show()

# -------- Best features --------

# Encode the label numerically ('M' -> 1, 'B' -> 0) so that it takes part
# in the correlation matrix computed below.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
corr_matrix = df.corr()

# Keep every column whose absolute correlation with the label reaches the
# threshold.
threshold = 0.6
corr_features = corr_matrix.columns[
    corr_matrix['diagnosis'].abs() >= threshold
].tolist()

# Square sub-matrix of the correlation matrix restricted to those columns.
high_corr_matrix = corr_matrix.loc[corr_features, corr_features]

sns.heatmap(high_corr_matrix, annot=True)
plt.show()

# ------------- Split dataset into train and test -------------

# Hold out 15% of the selected columns (label + correlated features) with a
# fixed seed so the split is reproducible.
train, test = train_test_split(df[corr_features], test_size=0.15, random_state=31)

# Pick the label by name rather than by position: positional indexing would
# silently use the wrong column as the target if 'diagnosis' were not the
# first selected column.
label_col = 'diagnosis'
X_train, y_train = train.drop(columns=label_col).to_numpy(), train[label_col].to_numpy()
# The test split is not touched again until modeling is finished.
X_test, y_test = test.drop(columns=label_col).to_numpy(), test[label_col].to_numpy()

# ----------------- Scaling the features -----------------

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Define the parameter grid for grid search
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 10, 100, 1000, 2000],
    'gamma': ['scale', 'auto', 0.1, 1, 10, 100],
}

# Exhaustive grid search with 5-fold cross-validation on the training set
grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=3)
# Alternative that samples only 10 combinations from the grid:
# grid_search = RandomizedSearchCV(SVC(), param_grid, cv=5, n_iter=10, verbose=3)

grid_search.fit(X_train, y_train)

# Evaluate the best model on the held-out test set.
# With the default refit=True, best_estimator_ has been refitted on the
# whole training set using the best parameter combination.
best_svm = grid_search.best_estimator_

# Predict with the tuned estimator (the previous code referenced an
# undefined name `classifier` here, which raised NameError).
y_pred = best_svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts as-is instead of seaborn's default
# '.2g' formatting, which switches to scientific notation for counts >= 100.
sns.heatmap(cm, annot=True, fmt='d')
plt.show()

# Print the best hyperparameters, their mean cross-validated accuracy, and
# the accuracy on the test set
print('Best hyperparameters:', grid_search.best_params_)
print('Best mean CV accuracy:', grid_search.best_score_)
acc = round(accuracy_score(y_test, y_pred), 3)
print('The accuracy of the model =', acc)