In [45]:
from ucimlrepo import fetch_ucirepo 
  
# Download dataset id=17 from the UCI ML repository via ucimlrepo.
bc_data = fetch_ucirepo(id=17)

# Feature and target tables (pandas DataFrames); later cells read the
# 'Diagnosis' column from the targets and the column names from the features.
X = bc_data.data.features
y = bc_data.data.targets

# Uncomment to inspect the dataset-level metadata:
# print(bc_data.metadata)

# Show the per-variable table (name, role, type, ...) for a quick overview
# of which columns are IDs, targets and features.
print(bc_data.variables)

                  name     role         type demographic description units  \
0                   ID       ID  Categorical        None        None  None   
1            Diagnosis   Target  Categorical        None        None  None   
2              radius1  Feature   Continuous        None        None  None   
3             texture1  Feature   Continuous        None        None  None   
4           perimeter1  Feature   Continuous        None        None  None   
5                area1  Feature   Continuous        None        None  None   
6          smoothness1  Feature   Continuous        None        None  None   
7         compactness1  Feature   Continuous        None        None  None   
8           concavity1  Feature   Continuous        None        None  None   
9      concave_points1  Feature   Continuous        None        None  None   
10           symmetry1  Feature   Continuous        None        None  None   
11  fractal_dimension1  Feature   Continuous        None        

In [46]:
import numpy as np
import sys
sys.path.append("..")
from knn import KNearestNeighbours # KNN implementation
from classification_tree import ClassificationTree #Tree implementation
from cross_validation import k_folds_accuracy_score, leave_one_out_score #Model Eval
from train_test_split import train_test_split #Splitting training and testing
from preprocessing import MinMaxScaler #Basic Normalisation

In [47]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Column groups are derived from the feature frame itself so that this cell
# runs on a fresh kernel instead of relying on lists left over from an
# earlier session.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numerical features: fill missing values with the column mean, then rescale.
# MinMaxScaler here is scikit-learn's (imported in this cell); it replaces the
# same-named class imported from the local `preprocessing` module earlier.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # or median
    ('scaler', MinMaxScaler())  # Normalizing numerical features
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode (categories unseen at fit time are ignored).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Use OrdinalEncoder for ordinal data
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# NOTE(review): the preprocessor is fitted on every row before the train/test
# split, so imputation and scaling statistics include the future test samples.
# Consider fitting on the training rows only.
X_preprocessed = preprocessor.fit_transform(X)

# Labels as a 1-D array. Read from bc_data rather than from `y` so that
# re-running this cell does not try to index an already converted array.
y = bc_data.data.targets['Diagnosis'].to_numpy()

In [48]:
# Hold-out split through the project's train_test_split helper.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.25,
    seed=42,
)

# Fit the project's k-nearest-neighbours classifier with k = 5.
k = 5
knn = KNearestNeighbours(k)
knn.fit(X_train, y_train)

# Predict the held-out samples, then report the per-sample correctness mask
# and the fraction of correct predictions.
y_pred = knn.predict(X_test)
knn_hits = y_test == y_pred
accuracy = np.mean(knn_hits)
print(knn_hits)
print(f"Accuracy: {accuracy}")

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True False  True  True  True
  True  True  True  True  True  True  True  True  True  True]
Accuracy: 0.9436619718309859


In [49]:
# Fit the project's classification tree (depth capped at 5) on the same
# training split used for the KNN model.
tree = ClassificationTree(max_depth=5)
tree.fit(X_train, y_train)

# Predict the held-out samples, then report the per-sample correctness mask
# and the fraction of correct predictions.
y_pred_tree = tree.predict(X_test)
tree_hits = y_test == y_pred_tree
accuracy = np.mean(tree_hits)
print(tree_hits)
print(f"Accuracy: {accuracy}")

# Render the fitted tree with readable split labels.
# NOTE(review): the names come from the raw frame X while the tree was trained
# on X_preprocessed; this assumes preprocessing kept the column order and
# count unchanged. Confirm.
feature_names = list(X.columns)
print(tree.print_tree(feature_names=feature_names))

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True False  True  True  True  True False  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]
Accuracy: 0.9366197183098591
area3 <= 0.17290110106173812
Left:
|   concave_points3 <= 0.552233676975945
|   Left:
|   |   concave_poin