In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the dataset
# The dataset is being loaded from a URL. It contains data on breast cancer diagnostics.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
column_names = ['ID', 'Diagnosis'] + [f'feature_{i}' for i in range(1, 31)]
data = pd.read_csv(url, header=None, names=column_names)

# Drop the ID column
# The ID column is not useful for the analysis and is dropped.
data.drop('ID', axis=1, inplace=True)

# Print the entire dataset
print(data)

# Alternatively, print the first few rows of the dataset
print(data.head())


    Diagnosis  feature_1  feature_2  feature_3  feature_4  feature_5  \
0           M      17.99      10.38     122.80     1001.0    0.11840   
1           M      20.57      17.77     132.90     1326.0    0.08474   
2           M      19.69      21.25     130.00     1203.0    0.10960   
3           M      11.42      20.38      77.58      386.1    0.14250   
4           M      20.29      14.34     135.10     1297.0    0.10030   
..        ...        ...        ...        ...        ...        ...   
564         M      21.56      22.39     142.00     1479.0    0.11100   
565         M      20.13      28.25     131.20     1261.0    0.09780   
566         M      16.60      28.08     108.30      858.1    0.08455   
567         M      20.60      29.33     140.10     1265.0    0.11780   
568         B       7.76      24.54      47.92      181.0    0.05263   

     feature_6  feature_7  feature_8  feature_9  ...  feature_21  feature_22  \
0      0.27760    0.30010    0.14710     0.2419  ...   

In [3]:
# Encode the target variable
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})

# Split the data into features and target variable
X = data.drop('Diagnosis', axis=1)
y = data['Diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Print the training and testing datasets
print("Training Features (X_train):")
print(pd.DataFrame(X_train).head())  # Print the first 5 rows of the training features

print("\nTesting Features (X_test):")
print(pd.DataFrame(X_test).head())  # Print the first 5 rows of the testing features

print("\nTraining Target (y_train):")
print(y_train.head())  # Print the first 5 rows of the training target

print("\nTesting Target (y_test):")
print(y_test.head())  # Print the first 5 rows of the testing target

Training Features (X_train):
         0         1         2         3         4         5         6   \
0 -1.440753 -0.435319 -1.362085 -1.139118  0.780573  0.718921  2.823135   
1  1.974096  1.733026  2.091672  1.851973  1.319843  3.426275  2.013112   
2 -1.399982 -1.249622 -1.345209 -1.109785 -1.332645 -0.307355 -0.365558   
3 -0.981797  1.416222 -0.982587 -0.866944  0.059390 -0.596788 -0.820203   
4 -1.117700 -1.010259 -1.125002 -0.965942  1.269511 -0.439002 -0.983341   

         7         8         9   ...        20        21        22        23  \
0 -0.119150  1.092662  2.458173  ... -1.232861 -0.476309 -1.247920 -0.973968   
1  2.665032  2.127004  1.558396  ...  2.173314  1.311279  2.081617  2.137405   
2 -0.696502  1.930333  0.954379  ... -1.295284 -1.040811 -1.245220 -0.999715   
3 -0.845115  0.313264  0.074041  ... -0.829197  1.593530 -0.873572 -0.742947   
4 -0.930600  3.394436  0.950213  ... -1.085129 -1.334616 -1.117138 -0.896549   

         24        25        26        

In [6]:

# Define models
# Several machine learning models are defined for comparison.
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
}

# Train and evaluate each model
# Each model is trained on the training data and evaluated on the test data.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Print evaluation metrics for each model
    print(f"Model: {model_name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
    print("="*60, "\n")

Model: Logistic Regression
Confusion Matrix:
 [[70  1]
 [ 2 41]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Accuracy Score: 0.9736842105263158

Model: Support Vector Machine
Confusion Matrix:
 [[71  0]
 [ 2 41]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114


Accuracy Score: 0.9824561403508771

