In [3]:
import pandas as pd
import numpy as np

dataset = pd.read_csv("data/breast_cancer.csv")
dataset.dtypes

id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     

In [4]:
dataset.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
X = dataset.iloc[:,2:]
y = dataset.iloc[:,1]
y = pd.get_dummies(y)
y = y.iloc[:, -1]

In [7]:
!pip install imblearn
!pip install delayed
!pip install scikit-learn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
Collecting scikit-learn>=1.0.1
  Downloading scikit_learn-1.0.2-cp38-cp38-win_amd64.whl (7.2 MB)
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed imbalanced-learn-0.9.0 imblearn-0.0 scikit-learn-1.0.2
Collecting delayed
  Downloading delayed-0.11.0b1-py2.py3-none-any.whl (19 kB)
Collecting hiredis
  Downloading hiredis-2.0.0-cp38-cp38-win_amd64.whl (18 kB)
Collecting redis
  Downloading redis-4.1.2-py3-none-any.whl (173 kB)
Collecting deprecated>=1.2.3
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: deprecated, redis, hiredis, delayed
Successfully installe

In [14]:
from imblearn.over_sampling import SVMSMOTE  

sampler = SVMSMOTE(random_state=42)
X_res, y_res = sampler.fit_resample(X.values, y)

In [17]:
y_res.shape

(714,)

Let's scale the data for better accuracy

In [9]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_res)
X_scaled = scaler.transform(X.values)
X_res_scaled = scaler.transform(X_res)

We will use cross validation because the dataset is relatively small

In [10]:
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y):
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [12]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=200)
clf.fit(X_scaled,y)
print(rmse_cv(clf, X_scaled, y))
print(clf.score(X_scaled,y))
clf.fit(X_res_scaled,y_res)
print(rmse_cv(clf, X_res_scaled, y_res))
print(clf.score(X_res_scaled, y_res))

[0.20942695 0.18731716 0.09365858 0.13245324 0.09407209]
1.0
[0.20483662 0.11826248 0.1672484  0.22124884 0.20555661]
1.0


In [13]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

train_score = []
test_score = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    clf = AdaBoostClassifier(n_estimators=500)
    clf.fit(X_train, y_train)
    train_score.append(clf.score(X_train, y_train))
    test_score.append(clf.score(X_test, y_test))
print("train: " + str(sum(train_score) / len(train_score)))
print("test: " + str(sum(test_score) / len(test_score)))
train: 1.0

train: 1.0
test: 0.9789473684210526
