In [13]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb


# df = pd.read_csv("dataset.csv")
# 
# # Drop uneeded columns
# df.drop(['id', 'Unnamed: 32'], axis = 1, inplace = True)

df = pd.read_csv("dataset_clean.csv") # ... até melhora o score de alguns modelos


# Print the first 5 rows of the data
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
3,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
4,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151


In [14]:
# Split the data into training and testing sets
all_inputs = df.drop(['diagnosis'], axis=1).values
all_labels = df['diagnosis'].values

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
all_inputs, all_labels = ros.fit_resample(all_inputs, all_labels)

# print count of each class
print("Count of each class before oversampling: ")

# count elements that are of class 'M'
print("Malignant: ", sum(all_labels == 'M'))

# count elements that are of class 'B'
print("Benign: ", sum(all_labels == 'B'))




Count of each class before oversampling: 
Malignant:  314
Benign:  314


In [15]:
# standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
all_inputs_scaled = scaler.fit_transform(all_inputs)

In [16]:
from sklearn.model_selection import train_test_split

(training_inputs,
    testing_inputs,
    training_classes,
    testing_classes) = train_test_split(all_inputs_scaled, all_labels, test_size=0.25, random_state=1)

# create a CVM classifier and use 10-fold cross validation with grid search to find the best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameter_grid = [{'kernel': ['rbf'],
                     'gamma': [1e-3, 1e-4],
                        'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'],
                        'C': [1, 10, 100, 1000]}]

clf = SVC()

grid_search = GridSearchCV(clf, 
                           param_grid=parameter_grid,
                            cv=10)
                            
grid_search.fit(all_inputs_scaled, all_labels) # n devia ser so os training inputs ... all_inputs_scaled all_labels

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))


Best score: 0.9793394777265746
Best parameters: {'C': 1000, 'kernel': 'linear'}


In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

# suppress the ConvergenceWarning from the MLPClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

clf = MLPClassifier()

# Best score: 0.988785046728972 com o de baixo mas demora 45 minutos xD
# parameter_grid = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
#                     'solver': ['lbfgs', 'sgd', 'adam'],
#                     'alpha': [0.0001, 0.001, 0.01, 0.1],
#                     'learning_rate': ['constant', 'invscaling', 'adaptive'],
#                     'max_iter': [100, 200, 300, 400, 500]}
parameter_grid = {'hidden_layer_sizes': [(10,), (50,), (100,), (10, 10), (50, 50), (100, 100)],
                  'early_stopping': [True, False]}


cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(clf,
                            param_grid=parameter_grid,
                            cv=cross_validation)

grid_search.fit(all_inputs_scaled, all_labels)

print('Best score: {}'.format(grid_search.best_score_))

# confusion matrix
from sklearn.metrics import confusion_matrix

# use the best parameters
clf = grid_search.best_estimator_

# fit the model
clf.fit(training_inputs, training_classes)

# get predictions
predictions = clf.predict(testing_inputs)

# print the confusion matrix
print(confusion_matrix(testing_classes, predictions))



Best score: 0.9920380952380953
[[72  2]
 [ 0 83]]


In [18]:
# logical regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


clf = LogisticRegression(max_iter=1000)

parameter_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
                    'penalty': ['l2'],
                    'C': [100, 10, 1.0, 0.1, 0.01]}
cross_validation = StratifiedKFold(n_splits=10)

grid_search = GridSearchCV(clf,
                            param_grid=parameter_grid,
                            cv=cross_validation)

grid_search.fit(all_inputs_scaled, all_labels)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))



Best score: 0.977700972862263
Best parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
