In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load your data
data = pd.read_csv('data/breast/breast-cancer.csv')

# Encode categorical features
label_encoders = {}
for column in data.columns:
    if data[column].dtype == object:
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Handle missing values if any
# data = data.fillna(method='ffill')  # Forward fill
# data = data.fillna(method='bfill')  # Backward fill
# data = data.dropna()  # Drop rows with missing values

# Split the data into features and target
X = data.drop('class', axis=1)
y = data['class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (optional, depends on the classifier you will use)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Now, X_train_scaled and y_train are ready to be used in a machine learning model for training.


In [2]:
X_train

Unnamed: 0,age,menopause,tsize,nodecaps,deg-malig,breast,breast-quad,irradiat
no-recurrence-events,2,5,0,1,2,0,4,0
no-recurrence-events,2,4,0,1,2,0,3,0
no-recurrence-events,0,2,0,1,2,1,2,0
no-recurrence-events,0,8,5,2,3,0,1,0
no-recurrence-events,2,4,0,1,1,1,4,0
...,...,...,...,...,...,...,...,...
no-recurrence-events,2,1,0,1,2,1,3,0
no-recurrence-events,2,4,0,1,2,1,3,0
no-recurrence-events,2,3,0,1,2,0,3,0
recurrence-events,0,5,5,2,3,0,4,0


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, \
    precision_recall_curve

C = 12
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True, random_state=3),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=150),
    'ADA': AdaBoostClassifier(n_estimators=150)
}

In [4]:
import numpy as np

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 43.1% 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         9
           2       0.45      1.00      0.62        15
           3       0.31      0.29      0.30        17
           4       0.56      0.33      0.42        15
           5       0.00      0.00      0.00         1

    accuracy                           0.43        58
   macro avg       0.22      0.27      0.22        58
weighted avg       0.35      0.43      0.36        58

Accuracy (train) for KNN classifier: 36.2% 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.40      0.22      0.29         9
           2       0.41      0.73      0.52        15
           3       0.24      0.24      0.24        17
           4       0.44      0.27      0.33        15
           5       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy (train) for ADA: 39.7% 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.25      0.56      0.34         9
           2       0.33      0.33      0.33        15
           3       0.40      0.12      0.18        17
           4       0.71      0.67      0.69        15
           5       0.25      1.00      0.40         1

    accuracy                           0.40        58
   macro avg       0.32      0.45      0.32        58
weighted avg       0.43      0.40      0.38        58


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
