In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Step 1: Load the previously pre-processed dataset
# The CSV location is held in one variable so it is easy to repoint on another machine.
data_path = r"C:\Users\marca\Downloads\data_refined.csv"
df = pd.read_csv(data_path)

# Sanity check of what was read: dimensions first, then the column names.
loaded_shape = df.shape
print("Dataset loaded. Shape:", loaded_shape)
print("Columns:", list(df.columns))

# Steps 2 and 3: Target encoding, data splitting, and correlation-based feature selection
#
# The split is performed BEFORE the correlation filter so that the filter only
# ever sees training rows. Computing correlations on every row would let the
# validation/test labels influence which features are kept (data leakage) and
# make the held-out scores optimistic.

# Encode the target into the integer column 'Diagnosed'.
# LabelEncoder assigns codes in sorted label order, so if the labels are
# 'B'/'M' this yields 'B' -> 0 and 'M' -> 1.
le = LabelEncoder()
df['Diagnosed'] = le.fit_transform(df['diagnosis'])

# Feature matrix (everything except the raw and encoded target) and target vector
X_full = df.drop(columns=['diagnosis', 'Diagnosed'])
y = df['Diagnosed']

# Split into 80% train, 10% validation, 10% test.
# Positional indices are split (rather than the frames themselves) so the exact
# same rows can be reused for every feature subset.
# First split: 90% train+val, 10% test
train_idx, test_idx = train_test_split(range(len(y)), test_size=0.1, random_state=42)
X_train_full = X_full.iloc[train_idx]
X_test_full = X_full.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Second split: 0.1111 of the remaining 90% is ~10% of the total, leaving 80% for training.
# These indices are positions WITHIN the train+val subset, not within the full frame.
train_idx_2, val_idx = train_test_split(range(len(y_train)), test_size=0.1111, random_state=42)
X_train_full_final = X_train_full.iloc[train_idx_2]
X_val_full = X_train_full.iloc[val_idx]
y_train_final = y_train.iloc[train_idx_2]
y_val = y_train.iloc[val_idx]

# Absolute correlation of each feature with the target, measured on the
# final training rows only (corrwith aligns the two on their shared row index).
correlations = X_train_full_final.corrwith(y_train_final).abs()

# Choose features with correlation > 0.5
threshold = 0.5
important_features = correlations[correlations > threshold].index.tolist()
print("\nImportant features (correlation > 0.5):", important_features)

# Create the reduced feature set and slice it with the same row indices as the full set
X_reduced = X_full[important_features]
X_train_reduced = X_reduced.iloc[train_idx]
X_test_reduced = X_reduced.iloc[test_idx]
X_train_reduced_final = X_train_reduced.iloc[train_idx_2]
X_val_reduced = X_train_reduced.iloc[val_idx]

print("\nData split shapes:")
print("Full: Train:", X_train_full_final.shape, "Val:", X_val_full.shape, "Test:", X_test_full.shape)
print("Reduced: Train:", X_train_reduced_final.shape, "Val:", X_val_reduced.shape, "Test:", X_test_reduced.shape)

# Step 4: Training Classifiers
# a) KNN: choose the neighbour count by 5-fold cross-validation on the training split
k_values = range(1, 21)

# Mean cross-validated accuracy for every candidate k, in the same order as k_values.
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train_full_final,
        y_train_final,
        cv=5,
    ).mean()
    for k in k_values
]

# Position of the highest mean score, mapped back to the corresponding k.
best_position = np.argmax(k_scores)
best_k = k_values[best_position]
print(f"\nBest k for KNN: {best_k} with cross-val accuracy: {max(k_scores):.4f}")

# Train and evaluate classifiers on full and reduced sets
classifiers = {
    "KNN": KNeighborsClassifier(n_neighbors=best_k),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVC": SVC(random_state=42),
}

# Each entry: (label shown in the report, training features, test features).
feature_sets = (
    ("Full", X_train_full_final, X_test_full),
    ("Reduced", X_train_reduced_final, X_test_reduced),
)

print("\nClassification Results:")
for model_name, model in classifiers.items():
    report_lines = []
    for set_label, train_features, test_features in feature_sets:
        # The same estimator object is refitted for each feature set; calling
        # fit again replaces whatever it learned from the previous set.
        model.fit(train_features, y_train_final)
        predictions = model.predict(test_features)
        accuracy = accuracy_score(y_test, predictions)
        matrix = confusion_matrix(y_test, predictions)
        report_lines.append(
            f"{set_label} Features - Accuracy: {accuracy:.4f}, Confusion Matrix:\n{matrix}"
        )

    # Report only after both fits are done, one line group per feature set.
    print(f"\n{model_name}:")
    for line in report_lines:
        print(line)

# Step 5: Challenge Yourself - Alternative Feature Reduction
# Rank the features with a random forest fitted on the training split and keep the 10 best
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_full_final, y_train_final
)
importances = pd.Series(rf.feature_importances_, index=X_full.columns)
top_features = list(importances.nlargest(10).index)
print("\nTop 10 features by Random Forest importance:", top_features)

# Build the top-feature frame and slice it with the row indices of the earlier
# splits, so train/val/test contain exactly the same rows as the other feature sets.
X_top = X_full.loc[:, top_features]
X_train_top, X_test_top = X_top.iloc[train_idx], X_top.iloc[test_idx]  # first split
X_train_top_final = X_train_top.iloc[train_idx_2]  # second split, training part
X_val_top = X_train_top.iloc[val_idx]              # second split, validation part

print("\nData split shapes (Top 10 features):")
top_shapes = (X_train_top_final.shape, X_val_top.shape, X_test_top.shape)
print("Train:", top_shapes[0], "Val:", top_shapes[1], "Test:", top_shapes[2])

# Refit every classifier on the top-10 feature set and score it on the test rows
print("\nClassification Results with Top 10 Features:")
for model_name, model in classifiers.items():
    # fit returns the estimator itself, so prediction can be chained onto it.
    top_predictions = model.fit(X_train_top_final, y_train_final).predict(X_test_top)
    top_accuracy = accuracy_score(y_test, top_predictions)
    top_confusion = confusion_matrix(y_test, top_predictions)
    print(f"{model_name} - Accuracy: {top_accuracy:.4f}, Confusion Matrix:\n{top_confusion}")

# Closing remark: a fixed message pointing the reader back to the scores printed above
comparison_note = (
    "\nComparison: Look at accuracy scores above. Full features may perform better due to more information, "
    "but reduced sets can achieve >94% accuracy with fewer features, improving efficiency."
)
print(comparison_note)

Dataset loaded. Shape: (569, 31)
Columns: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'diagnosis']

Important features (correlation > 0.5): ['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'perimeter_worst', 'area_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']

Data split shapes:
Full: Train: (455, 30) Val: (57, 30) Test: (57, 30)
Reduced: Train: (455, 15) Val: (57, 15) Test: (57, 15)

[WinError 2] The system cannot find the file specified
  File "C:\Users\marca\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\marca\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\marca\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\marca\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



Best k for KNN: 4 with cross-val accuracy: 0.9626

Classification Results:

KNN:
Full Features - Accuracy: 0.9474, Confusion Matrix:
[[39  1]
 [ 2 15]]
Reduced Features - Accuracy: 0.9474, Confusion Matrix:
[[39  1]
 [ 2 15]]

Random Forest:
Full Features - Accuracy: 0.9649, Confusion Matrix:
[[39  1]
 [ 1 16]]
Reduced Features - Accuracy: 0.9649, Confusion Matrix:
[[39  1]
 [ 1 16]]

SVC:
Full Features - Accuracy: 0.9649, Confusion Matrix:
[[39  1]
 [ 1 16]]
Reduced Features - Accuracy: 0.9474, Confusion Matrix:
[[39  1]
 [ 2 15]]

Top 10 features by Random Forest importance: ['area_worst', 'concave points_worst', 'radius_worst', 'concave points_mean', 'perimeter_worst', 'perimeter_mean', 'concavity_mean', 'area_mean', 'radius_mean', 'concavity_worst']

Data split shapes (Top 10 features):
Train: (455, 10) Val: (57, 10) Test: (57, 10)

Classification Results with Top 10 Features:
KNN - Accuracy: 0.9474, Confusion Matrix:
[[39  1]
 [ 2 15]]
Random Forest - Accuracy: 0.9649, Confusion 