In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb


df = pd.read_csv("dataset.csv")

# Drop uneeded columns
df.drop(['id', 'Unnamed: 32'], axis = 1, inplace = True)

# Print the first 5 rows of the data
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [2]:
# Split the data into training and testing sets
all_inputs = df.drop(['diagnosis'], axis=1).values
all_labels = df['diagnosis'].values

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
all_inputs, all_labels = ros.fit_resample(all_inputs, all_labels)

# print count of each class
print("Count of each class before oversampling: ")

# count elements that are of class 'M'
print("Malignant: ", sum(all_labels == 'M'))

# count elements that are of class 'B'
print("Benign: ", sum(all_labels == 'B'))




Count of each class before oversampling: 
Malignant:  357
Benign:  357


In [3]:
# standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
all_inputs_scaled = scaler.fit_transform(all_inputs)

In [4]:
from sklearn.model_selection import train_test_split

(training_inputs,
    testing_inputs,
    training_classes,
    testing_classes) = train_test_split(all_inputs_scaled, all_labels, test_size=0.25, random_state=1)

# create a CVM classifier and use 10-fold cross validation with grid search to find the best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameter_grid = [{'kernel': ['rbf'],
                     'gamma': [1e-3, 1e-4],
                        'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'],
                        'C': [1, 10, 100, 1000]}]

clf = SVC()

grid_search = GridSearchCV(clf, 
                           param_grid=parameter_grid,
                            cv=10)
                            
grid_search.fit(training_inputs, training_classes)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))


Best score: 0.9776030747728861
Best parameters: {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}


In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

# suppress the ConvergenceWarning from the MLPClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

clf = MLPClassifier()

# parameter_grid = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
#                     'solver': ['lbfgs', 'sgd', 'adam'],
#                     'alpha': [0.0001, 0.001, 0.01, 0.1],
#                     'learning_rate': ['constant', 'invscaling', 'adaptive'],
#                     'max_iter': [100, 200, 300, 400, 500]}

parameter_grid = {'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50), (100, 100)],
                  'early_stopping': [True, False]}


cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(clf,
                            param_grid=parameter_grid,
                            cv=cross_validation)

grid_search.fit(training_inputs, training_classes)

print('Best score: {}'.format(grid_search.best_score_))


In [None]:
# logical regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


clf = LogisticRegression(max_iter=1000)

parameter_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
                    'penalty': ['l2'],
                    'C': [100, 10, 1.0, 0.1, 0.01]}
cross_validation = StratifiedKFold(n_splits=10)

grid_search = GridSearchCV(clf,
                            param_grid=parameter_grid,
                            cv=cross_validation)

grid_search.fit(training_inputs, training_classes)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))



Best score: 0.9757861635220125
Best parameters: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
