In [None]:
import numpy as np
import pandas as pd
# from sklearn.base import accuracy_score
from sklearn.calibration import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.discriminant_analysis import StandardScaler
# from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


# STEP 1: Download the Dataset
# Step 2: Read the data into a dataframe
df = pd.read_csv('data_refined.csv')
print("DATASET", df)
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own line rather than passed to print() (which
# would append a stray "None" after the label).
print("DATASET INFO")
df.info()
print("type...", type(df))
print("\nCalculated the correlation of all features with the target.")
# Pairwise correlation matrix of every column in the frame.
print(df.corr())

"""
To select the most important features, a correlation threshold of 0.5 was applied. Features with an absolute correlation greater than 0.5 were selected for the reduced dataset.
"""

# Rank every column by its correlation with the 'diagnosis' target
# (strongest positive first), then discard the target's own entry.
correlations = (
    df.corr()['diagnosis']
    .sort_values(ascending=False)
    .drop('diagnosis')
)

# Keep the names of the columns whose correlation magnitude exceeds 0.5.
important_features = correlations.loc[correlations.abs() > 0.5].index.tolist()

print("Selected Features:", important_features)

# Hard-coded feature list used for modelling; the order here fixes the
# column order of every frame derived from it below.
selected_features = [
    'concave points_worst', 'perimeter_worst', 'concave points_mean',
    'radius_worst', 'perimeter_mean', 'area_worst',
    'radius_mean', 'area_mean', 'concavity_mean',
    'concavity_worst', 'compactness_mean', 'compactness_worst',
    'radius_se', 'perimeter_se', 'area_se',
]

# Reduced view of the data holding only the selected columns.
df_subset = df.loc[:, selected_features]
print(f"New DataFrame shape: {df_subset.shape}", df_subset.head(), sep="\n")

# Feature matrix (X) and target vector (y).
X = df.loc[:, selected_features]
y = df['diagnosis']
print("label..", y)

# Final data assembly and inspection: features and target side by side in
# one model-ready DataFrame (target is the last column).
df_processed_final = pd.concat((X, y), axis="columns")

print("\n--- Final Preprocessing Summary ---")
print(f"Final DataFrame shape: {df_processed_final.shape}")
# Every column except the target counts as a feature.
print(f"Number of features after One-Hot Encoding and Scaling: {len(df_processed_final.columns) - 1}")
print(
    "\nFirst 5 rows of the fully processed DataFrame (Note the scaled and encoded values):",
    df_processed_final.head(),
    "\nFinal DataFrame Info:",
    sep="\n",
)
df_processed_final.info()

# Step 5: Splitting the Data
#
# Split the data as follows:
#   80% training set
#   10% validation set
#   10% test set
# The 80/10/10 split provides a dedicated validation set for fine-tuning
# without touching the final test set.
#
# X holds the selected features and y the 'diagnosis' target.

# Stage 1: hold out 20% of the rows (validation + test); the remaining 80%
# is the training set. Stratifying on y keeps the class proportions equal
# across both parts, and the fixed random_state makes the split repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: cut the 20% hold-out in half, giving 10% validation and 10% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Both splits are performed exactly once; repeating them with the same
# arguments and random_state would only reproduce the same partitions.

# Sanity check: sizes of the train, test and validation partitions.
print(len(X_train), len(y_train), len(X_test), len(y_test), len(X_val), len(y_val))

# Step 6: Training Classifiers
#
# Use the KNN classifier to model the data. The best k is chosen further
# down by trying several values against the validation set (an example of
# hyper-parameter tuning). This section fits a baseline model with k=5 and
# prints its confusion matrix on the test set.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Show the first test row, then predict the whole test set
# (y_test holds the known labels the predictions are compared against).
print(X_test[:1])
predictions = model.predict(X_test)
print("predictions", predictions)

# Compute the confusion matrix once and reuse it. Passing labels= pins the
# row/column order to the classes the model was trained on, so the matrix
# shape always matches the labels built for the DataFrame below.
confusion_matrix_array = confusion_matrix(
    y_true=y_test, y_pred=predictions, labels=model.classes_
)
print(confusion_matrix_array)
print(type(confusion_matrix_array))

# Labelled DataFrame for readability. The row/column captions are derived
# from the actual class values of the 'diagnosis' target instead of being
# hard-coded (the previous captions described income brackets, which do
# not exist in this dataset).
df_confusion = pd.DataFrame(
    confusion_matrix_array,
    columns=[f"Predicted {label}" for label in model.classes_],
    index=[f"Actual {label}" for label in model.classes_],
)

print("confusion matrix data fram", df_confusion)

# """
# NOTE(review): the counts below (4,523 samples, income classes) do not appear to come from the diagnosis data used in this script - confirm and update.
# Confusion Matrix: the matrix [[3395 7], [41 1080]] is the core of the evaluation on the Test Set (X_test and y_test).
# Assuming 0 is the negative class (≤50K) and 1 is the positive class (>50K):
#
#                    Predicted 0 (≤50K)       Predicted 1 (>50K)
# Actual 0 (≤50K)    3395 (True Negatives)    7 (False Positives)
# Actual 1 (>50K)    41 (False Negatives)     1080 (True Positives)

# What Each Number Means
# 1. True Negatives (TN) = 3395

# These are people who actually earn ≤50K.

# model correctly predicted they earn ≤50K.

# High TN means the model is very good at identifying low-income individuals.

# 2. False Positives (FP) = 7

# These are people who actually earn ≤50K.

# The model incorrectly predicted they earn >50K.

# Only 7 mistakes — extremely low FP rate.

# This means the model is not overpredicting high income.

# 3. False Negatives (FN) = 41

# These are people who actually earn >50K.

# The model incorrectly predicted they earn ≤50K.

# This is more significant than FP because FN means you missed high-income individuals.

# 4. True Positives (TP) = 1080

# These are people who actually earn >50K.

# model correctly predicted they earn >50K.

# High TP means the model successfully captures most high-income cases.
# """

# """
# When you run an evaluation metric (like accuracy_score or confusion_matrix), you provide two pieces of information:
# y_true (True Labels): These are the actual, known outcomes (e.g., the person's real income class, ≤50K or >50K) from the target column of the evaluation set (either y_val or y_test).

# y_pred (Predicted Labels): These are the model's guesses (predictions) generated by running the features X_val or X_test through the trained model.
# confusion_matrix(y_true = y_test, y_pred = predictions)

# y_true was set to y_test, which are the known true labels for the test set.
# y_pred was set to predictions, which were the model's guesses for the X_test features.
# """

# Hyper-parameter search: fit one KNN model per candidate neighbour count
# and score each of them on the VALIDATION set.

# Per-k metrics; position 0 corresponds to k=1.
err_rate = []
accuracy = []

print("Starting hyperparameter tuning for K...")
for k in range(1, 101):  # candidate k values 1..100
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions_i = knn.predict(X_val)

    # Error rate = share of validation rows whose prediction is wrong.
    err_rate.append(np.mean(predictions_i != y_val))
    accuracy.append(knn.score(X_val, y_val))

# Locate the first (i.e. smallest-k) position with the lowest error, then
# shift by one because the search started at k=1.
best_err_index = int(np.argmin(err_rate))
min_error = err_rate[best_err_index]
best_k = best_err_index + 1

# Refit with the winning k and report its accuracy on the test set.
best_knn_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
print(best_knn_model.score(X_test, y_test))

print(
    "\n--- Tuning Results Summary ---",
    f"Optimal k found (lowest error): {best_k}",
    f"Lowest Validation Error Rate: {min_error:.4f}",
    f"Highest Validation Accuracy at k={best_k}: {accuracy[best_err_index]:.4f}",
    sep="\n",
)


# --- Final model: train with the tuned k, evaluate on the TEST set ---

# Fit on the full training partition using the best k found above.
print(f"\nTraining final model with Optimal k = {best_k}...")
best_knn_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

# Predict the held-out test partition.
final_predictions = best_knn_model.predict(X_test)

# --- Classification metrics ---

# Share of test rows predicted correctly.
final_accuracy = accuracy_score(y_true=y_test, y_pred=final_predictions)
print(f"\nFinal Test Set Accuracy (k={best_k}): {final_accuracy:.4f}")

# Counts of true labels (rows) against predicted labels (columns).
final_conf_matrix = confusion_matrix(y_test, final_predictions)
print("\nFinal Confusion Matrix (Test Set):", final_conf_matrix, sep="\n")

# ---- SVM ----
# SVC is the support-vector classifier class in sklearn.svm.
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# RBF-kernel model. max_iter is optional (the default is unlimited), but
# capping it keeps a non-converging fit from running indefinitely.
svm_model = SVC(
    max_iter=1000,
    kernel="rbf",
    C=1.0,
    gamma='scale',
)

# Train on the training partition.
svm_model.fit(X_train, y_train)

# Report the test accuracy. The bare `svm_model.score(...)` expression used
# previously computed the value and then threw it away.
print(f"RBF SVM test accuracy: {svm_model.score(X_test, y_test):.4f}")

# Per-class breakdown of the test-set predictions.
predictions = svm_model.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))


# Linear Model
# Same setup as the RBF model but with a linear kernel; note that this
# rebinds `svm_model`, replacing the RBF instance.
svm_model = SVC(
    max_iter=1000,
    kernel="linear",
    C=1.0,
)

svm_model.fit(X_train, y_train)

# Print the test accuracy instead of discarding the value returned by score().
print(f"Linear SVM test accuracy: {svm_model.score(X_test, y_test):.4f}")

# Optimize the model: exhaustive search over C and gamma for the RBF kernel.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# GridSearchCV lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# refit=True (the default) re-trains the best parameter combination on the
# data passed to fit() and keeps that model inside `grid` for later use.
grid = GridSearchCV(SVC(max_iter=1000000000), param_grid, verbose=3, refit=True)

# Cross-validate every parameter combination on the training partition.
grid.fit(X_train, y_train)

# grid.best_score_ is the mean cross-validated score of the best parameter
# combination - not an accuracy measured on the whole training set - so the
# messages below label it as a cross-validation score.
print('The best parameters (selected by cross-validation on the training data) are %s '
      'with a mean cross-validation score of %0.2f.'
      % (grid.best_params_, grid.best_score_))

# A previous run reported {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'} here.
print("grid.best_params_", grid.best_params_)

# `grid` delegates predict() to the refitted best model. These predictions
# are made on the TRAINING partition, so the report below describes fit
# quality on seen data rather than generalization.
grid_predictions = grid.predict(X_train)

print(confusion_matrix(y_train, grid_predictions))
print(classification_report(y_train, grid_predictions))

print("Compare the cross-validation score of %0.2f, with a score of %0.2f \nwhen the test data is passed through the model."
      % (grid.best_score_, grid.score(X_test, y_test)))

"""
Here is the breakdown of what those numbers mean and why this specific comparison is important.

1. The Confusion Matrix & Report (Training Data)
The confusion matrix and classification report printed above correspond to the Training Set (455 samples).

Accuracy: 98% (Confusion Matrix shows 444 correct out of 455).

Precision/Recall: Both are very high (>0.95), meaning the model learned the training patterns almost perfectly without completely memorizing them (which would be 100% accuracy, often leading to overfitting).

2. The Comparison: "Training Score 0.96 vs. Test Score 0.96"
This is the most critical part of evaluation.

"Training Score of 0.96" (grid.best_score_): This is not the accuracy on the entire training set (which was 98%), but the Cross-Validation Score. It is the average accuracy the model achieved on the "validation folds" during the grid search. This is a realistic estimate of how the model should perform on unseen data.

"Score of 0.96" (grid.score(X_test, y_test)): This is the accuracy on the Test Set (the data the model had never seen before).

Conclusion: Why this is good
The fact that the Cross-Validation score (0.96) matches the Test score (0.96) is the ideal outcome in machine learning.

No Overfitting: If the training score was much higher than the test score (e.g., 0.98 vs 0.85), the model would be overfitting (memorizing data).

No Underfitting: If both scores were low (e.g., 0.70), it would be underfitting.

High Generalization: Because the scores are identical and high (>94%), the model is robust. It creates a reliable boundary between Benign and Malignant tumors that works just as well on new patients as it did on the study data.

Successfully met and exceeded the 94% accuracy requirement with a model that generalizes perfectly.

"""

# STEP 5: Find another way to reduce the set of features.
# Implement your classification using the new reduced set of features.


DATASET            id  radius_mean  texture_mean  perimeter_mean  area_mean  \
0   -0.236405     1.097064     -2.073335        1.269934   0.984375   
1   -0.236403     1.829821     -0.353632        1.685955   1.908708   
2    0.431741     1.579888      0.456187        1.566503   1.558884   
3    0.432121    -0.768909      0.253732       -0.592687  -0.764464   
4    0.432201     1.750297     -1.151816        1.776573   1.826229   
..        ...          ...           ...             ...        ...   
564 -0.235732     2.110995      0.721473        2.060786   2.343856   
565 -0.235730     1.704854      2.085134        1.615931   1.723842   
566 -0.235727     0.702284      2.045574        0.672676   0.577953   
567 -0.235725     1.838341      2.336457        1.982524   1.735218   
568 -0.242406    -1.808401      1.221792       -1.814389  -1.347789   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0           1.568466          3.283515        2.652874      



[CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.901 total time=   0.0s
[CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.967 total time=   0.0s
[CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.923 total time=   0.0s
[CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.901 total time=   0.0s
[CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.945 total time=   0.0s
[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.912 total time=   0.0s
[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.934 total time=   0.0s
[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.901 total time=   0.0s
[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.923 total time=   0.0s
[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.868 total time=   0.0s
[CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.956 total time=   0.0s
[CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.923 total time=   0.0s
[CV 3/5] END ......C=100, ga

'\nHere is the breakdown of what those numbers mean and why this specific comparison is important.\n\n1. The Confusion Matrix & Report (Training Data)\nThe confusion matrix and classification report you pasted correspond to Training Set (455 samples).\n\nAccuracy: 98% (Confusion Matrix shows 444 correct out of 455).\n\nPrecision/Recall: Both are very high (>0.95), meaning the model learned the training patterns almost perfectly without completely memorizing them (which would be 100% accuracy, often leading to overfitting).\n\n2. The Comparison: "Training Score 0.96 vs. Test Score 0.96"\nThis is the most critical part of evaluation.\n\n"Training Score of 0.96" (grid.best_score_): This is not the accuracy on the entire training set (which was 98%), but the Cross-Validation Score. It is the average accuracy model achieved on the "validation folds" during the grid search. This is a realistic estimate of how the model should perform on unseen data.\n\n"Score of 0.96" (grid.score(X_test, y_t