In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
cancer = pd.read_csv("cancer.csv")

cancer.dropna(thresh = 3, inplace = True)
cancer.dropna(axis = 1, inplace = True)

#renaming the rows to have more descriptive names
cancer.columns = ["id","diagnosis","radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean","concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean","radius_se","texture_se","perimeter_se","area_se","smoothness_se","compactness_se","concavity_se","concave points_se","symmetry_se","fractal_dimension_se","radius_worst","texture_worst","perimeter_worst","area_worst","smoothness_worst","compactness_worst","concavity_worst","concave points_worst","symmetry_worst","fractal_dimension_worst",
]

cancer.drop('id', axis=1, inplace=True)

cancer.head()


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
#sns.pairplot(cancer, hue = "diagnosis", vars = ["radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean","concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean","radius_se","texture_se","perimeter_se","area_se","smoothness_se","compactness_se","concavity_se","concave points_se","symmetry_se","fractal_dimension_se","radius_worst","texture_worst","perimeter_worst","area_worst","smoothness_worst","compactness_worst","concavity_worst","concave points_worst","symmetry_worst","fractal_dimension_worst"] )


In [4]:
plt.figure(figsize=(20,12)) 
#sns.heatmap(cancer.corr(), annot=True) 

<Figure size 2000x1200 with 0 Axes>

<Figure size 2000x1200 with 0 Axes>

In [5]:
x = cancer.drop(["diagnosis"], axis = 1) # We drop our "target" feature and use all the remaining features in our dataframe to train the model.
y = cancer["diagnosis"]

In [6]:
#creating the training and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=20)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=20)

#making a pipeline
pipeline = Pipeline([
    ('min_max_scaler', MinMaxScaler()),
    ('std_scaler', StandardScaler())
])

x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)
x_val = pipeline.transform(x_val)

In [7]:
param_grid = {'C': [0.01, 0.1, 0.5, 1, 10, 15, 20, 100], 
              'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001], 
              'kernel': ['linear']} 

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(x_train, y_train)

best_params = grid.best_params_
print(f"Best parameters: {best_params}")

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Best parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}


In [8]:
param_grid = {'C': [0.01, 0.1, 0.5, 1, 10, 15, 20, 100], 
              'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001], 
              'kernel': ['poly']} 

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(x_train, y_train)

best_params = grid.best_params_
print(f"Best parameters: {best_params}")

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Best parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'poly'}


In [9]:
param_grid = {'C': [0.01, 0.1, 0.5, 1, 10, 15, 20, 100], 
              'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001], 
              'kernel': ['rbf']} 

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1, cv=5)
grid.fit(x_train, y_train)

best_params = grid.best_params_
print(f"Best parameters: {best_params}")

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Best parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [10]:
lin_model = SVC(kernel= 'linear', C = 0.1, gamma = 1)
lin_model.fit(x_train, y_train)
y_test_predict = lin_model.predict(x_test)
y_val_predict = lin_model.predict(x_val)

print("Testing set report:\n")
print(classification_report(y_test, y_test_predict))
print("Validating set report:\n")
print(classification_report(y_val, y_val_predict))

Testing set report:

              precision    recall  f1-score   support

           B       0.99      1.00      0.99        66
           M       1.00      0.98      0.99        48

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Validating set report:

              precision    recall  f1-score   support

           B       0.99      0.98      0.98        81
           M       0.94      0.97      0.96        33

    accuracy                           0.97       114
   macro avg       0.96      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [11]:

cm = np.array(confusion_matrix(y_test, y_test_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])

print("\nConfusion matrix for testing set:\n")
confusion



Confusion matrix for testing set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,66,0
Healthy,1,47


In [12]:
cm = np.array(confusion_matrix(y_val, y_val_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])

print("\nConfusion matrix for validating set:\n")
confusion



Confusion matrix for validating set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,79,2
Healthy,1,32


In [13]:
poly_model = SVC(kernel ='poly', degree = 3, coef0 = 1, C = 1, gamma = 0.1)

poly_model.fit(x_train, y_train)

y_test_predict = poly_model.predict(x_test)
y_val_predict = poly_model.predict(x_val)

print("Testing set report:\n")
print(classification_report(y_test, y_test_predict))
print("Validating set report:\n")
print(classification_report(y_val, y_val_predict))

Testing set report:

              precision    recall  f1-score   support

           B       1.00      0.98      0.99        66
           M       0.98      1.00      0.99        48

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Validating set report:

              precision    recall  f1-score   support

           B       0.99      0.96      0.97        81
           M       0.91      0.97      0.94        33

    accuracy                           0.96       114
   macro avg       0.95      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114



In [14]:
cm = np.array(confusion_matrix(y_test, y_test_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])
print("\nConfusion matrix for testing set:\n")
confusion


Confusion matrix for testing set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,65,1
Healthy,0,48


In [15]:
cm = np.array(confusion_matrix(y_val, y_val_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])

print("\nConfusion matrix for validating set:\n")
confusion


Confusion matrix for validating set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,78,3
Healthy,1,32


In [16]:
poly_model = SVC(kernel ='poly', degree = 10, coef0 = 100, C = 1, gamma = 0.1)

poly_model.fit(x_train, y_train)

y_test_predict = poly_model.predict(x_test)
y_val_predict = poly_model.predict(x_val)

print("Testing set report:\n")
print(classification_report(y_test, y_test_predict))
print("Validating set report:\n")
print(classification_report(y_val, y_val_predict))

Testing set report:

              precision    recall  f1-score   support

           B       0.99      1.00      0.99        66
           M       1.00      0.98      0.99        48

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Validating set report:

              precision    recall  f1-score   support

           B       0.97      0.94      0.96        81
           M       0.86      0.94      0.90        33

    accuracy                           0.94       114
   macro avg       0.92      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114



In [21]:
cm = np.array(confusion_matrix(y_test, y_test_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])
print("\nConfusion matrix for testing set:\n")
confusion


Confusion matrix for testing set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,66,0
Healthy,1,47


In [19]:
cm = np.array(confusion_matrix(y_val, y_val_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])

print("\nConfusion matrix for validating set:\n")
confusion


Confusion matrix for validating set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,79,2
Healthy,2,31


In [18]:
radial_model = SVC(kernel ='rbf', gamma = 0.01, C = 10)

radial_model.fit(x_train, y_train)

y_test_predict = radial_model.predict(x_test)
y_val_predict = radial_model.predict(x_val)

print("Testing set report:\n")
print(classification_report(y_test, y_test_predict))
print("Validating set report:\n")
print(classification_report(y_val, y_val_predict))


Testing set report:

              precision    recall  f1-score   support

           B       0.99      1.00      0.99        66
           M       1.00      0.98      0.99        48

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Validating set report:

              precision    recall  f1-score   support

           B       0.98      0.98      0.98        81
           M       0.94      0.94      0.94        33

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



In [22]:
cm = np.array(confusion_matrix(y_test, y_test_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])
print("\nConfusion matrix for testing set:\n")
confusion


Confusion matrix for testing set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,66,0
Healthy,1,47


In [20]:
cm = np.array(confusion_matrix(y_val, y_val_predict))
confusion = pd.DataFrame(cm, index=['Has cancer', 'Healthy'],
                         columns=['Predicted cancer','Predicted healthy'])

print("\nConfusion matrix for validating set:\n")
confusion


Confusion matrix for validating set:



Unnamed: 0,Predicted cancer,Predicted healthy
Has cancer,79,2
Healthy,2,31
