In [3]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset and discard the two columns that are not used as features.
data = pd.read_csv("./augmented_data.csv").drop(columns=['Unnamed: 32', 'id'])

# Binary-encode the label: 'M' becomes 1, every other value becomes 0.
data['diagnosis'] = data['diagnosis'].eq('M').astype('int64')

# Separate the target column from the feature matrix.
y_data = data['diagnosis']
x_data = data.drop(columns=['diagnosis'])


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Split BEFORE fitting any preprocessing step. Fitting the scaler on the full
# dataset (as was done previously) lets the mean/variance of the held-out rows
# influence the training features, which is data leakage and makes the test
# score optimistic. test_size and random_state are unchanged, so the same rows
# end up in each split as before.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.15, random_state=42
)

# NOTE(review): the file name suggests the rows are augmented. If augmented
# copies of one original sample can land on both sides of this random split,
# the test score is still inflated (the recorded run shows 1.0 / 1.0) --
# consider a group-aware split keyed on the original sample. TODO confirm.

# Standardise with statistics learned from the training split only, then apply
# the same transformation to the test split. StandardScaler disregards NaNs
# when fitting and passes them through on transform, so missing values are
# still present afterwards and are handled by the imputer below.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Kept so that any later cell referencing the fully scaled matrix `x` still
# works; it is now scaled with training-only statistics.
x = scaler.transform(x_data)

print("x_train shape: ", x_train.shape)
print("x_test shape: ", x_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

# Fill remaining missing values with per-feature means learned on the training
# split; the test split is transformed with those same means.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)


import xgboost as xgb

# Build the gradient-boosted classifier and fit it on the imputed training
# features in one expression (fit() returns the fitted estimator itself).
xgb_model = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
).fit(x_train_imputed, y_train)

# Predict the held-out rows and translate the numeric classes back to the
# original letter labels (1 -> 'M', anything else -> 'B').
y_pred = xgb_model.predict(x_test_imputed)
y_pred_mapped = np.where(y_pred == 1, 'M', 'B')

# Mean accuracy on each split, as computed by the estimator's score() method.
train_accuracy = xgb_model.score(x_train_imputed, y_train)
test_accuracy = xgb_model.score(x_test_imputed, y_test)

# Report the results in the same order as before: accuracies, then labels.
print("train accuracy: {}".format(train_accuracy))
print("test accuracy: {}".format(test_accuracy))
print("Predictions for x_test: {}".format(y_pred_mapped))


x_train shape:  (4250, 30)
x_test shape:  (750, 30)
y_train shape:  (4250,)
y_test shape:  (750,)
train accuracy: 1.0
test accuracy: 1.0
Predictions for x_test: ['B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'M'
 'B' 'M' 'M' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'M' 'B'
 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'M'
 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'M' 'M' 'M' 'B'
 'M' 'M' 'B' 'M' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'B' 'B' 'M' 'M'
 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'M' 'M' 'M' 'M' 'M'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B'
 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'M' 'M' 'B' 'M' 'B' 'B' 'M'
 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B'
 'B' 'B' 'M' 'M' 'B' 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'M'
 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'M' 'B'