In [None]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep the legacy import available on old releases without breaking the
# notebook on current ones; new code should use ConfusionMatrixDisplay.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

In [None]:
# Load the data set and encode the diagnosis label as an integer ('M' -> 1, 'B' -> 0).
df = pd.read_csv(r'breast-cancer.csv')

# Assign the encoded column back to the frame instead of calling
# `df['diagnosis'].replace(..., inplace=True)`: the in-place call operates on a
# temporary Series, which raises a chained-assignment warning on pandas 2.x and
# leaves `df` untouched once Copy-on-Write is active.
diagnosis_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)

# `map` turns every label missing from the mapping into NaN; stop here rather
# than train on a silently corrupted target column.
if df['diagnosis'].isna().any():
    raise ValueError("diagnosis column contains values other than 'M' and 'B'")

df.head()

In [None]:
# Build two train/test views of the same rows:
#   x_train / x_test / y_train / y_test  -> every feature column
#   X_train / X_test / Y_train / Y_test  -> weakly correlated features removed
# Features whose absolute correlation with the target is below this value are dropped.
MIN_ABS_CORRELATION = 0.1

x = df.drop(columns=['id', 'diagnosis'])  # full feature space
y = df['diagnosis']  # target column

# Single split shared by both feature sets so the two models are compared on identical rows.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=109)

# Eliminate features with low correlation to the target.
# The correlation is measured on the TRAINING rows only: computing it on the
# whole frame would let the held-out rows influence which features are kept
# (data leakage) and bias the test accuracy of the filtered model.
d1 = df.drop(columns=['id'])
correlation_matrix = d1.loc[x_train.index].corr()
cor_target = abs(correlation_matrix['diagnosis'])
features_to_drop = cor_target[cor_target < MIN_ABS_CORRELATION]

# Columns excluded from the filtered feature space: the weak features plus the
# target and the row identifier.
row_names_list = list(features_to_drop.index)
row_names_list.append('diagnosis')
row_names_list.append('id')

y = df['diagnosis'].values
X = df.drop(columns=row_names_list)

# Reuse the row partition from the split above instead of splitting a second
# time and relying on an identical seed to line the rows up.
X_train = X.loc[x_train.index]
X_test = X.loc[x_test.index]
Y_train = y_train.to_numpy()
Y_test = y_test.to_numpy()

In [None]:
# Train one linear SVM per feature set and predict on the matching test split.
def _linear_svm_pipeline():
    """Return an unfitted pipeline: standard scaling followed by a balanced linear-kernel SVC."""
    svm = SVC(gamma='auto', kernel='linear', class_weight='balanced')
    return make_pipeline(StandardScaler(), svm)


# clf1 is fitted on the x_* split, clf2 on the X_* split; both use the same pipeline definition.
clf1 = _linear_svm_pipeline().fit(x_train, y_train)
clf2 = _linear_svm_pipeline().fit(X_train, Y_train)

# Predictions for the held-out rows of each split.
y_pred = clf1.predict(x_test)
Y_pred = clf2.predict(X_test)

In [None]:
# Share of test rows each classifier labels correctly.
accuracy_clf1 = accuracy_score(y_test, y_pred)
accuracy_clf2 = accuracy_score(Y_test, Y_pred)

print(f'Model with no pre-processing has an accuracy of {accuracy_clf1:.3f}')
print(f' Model with pre-processing has an accuracy of {accuracy_clf2:.3f}')

In [None]:
# Confusion matrix of clf1 on its test split (x_test / y_test).
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
clf1_confusion = ConfusionMatrixDisplay.from_estimator(clf1, x_test, y_test)
clf1_confusion.ax_.set_title('Linear SVM - all features')
plt.show()

In [None]:
# Confusion matrix of clf2 on its test split (X_test / Y_test).
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
clf2_confusion = ConfusionMatrixDisplay.from_estimator(clf2, X_test, Y_test)
clf2_confusion.ax_.set_title('Linear SVM - correlation-filtered features')
plt.show()