In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# --- 1. Read in the dataset from the Excel file ---
df = pd.read_excel('K8.xlsx', header=None, nrows=1000)

In [5]:
# --- 2. Clean the dataset ---
# Replace any '?' with NaN
df.replace('?', np.nan, inplace=True)

# The feature columns are all except the last column.
feature_cols = df.columns[:-1]
target_col = df.columns[-1]

# Convert the feature columns to numeric (floats); non-convertible values become NaN.
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')

# Remove any row that has NaN in any feature column (this removes the row with '?' values)
df.dropna(axis=0, subset=feature_cols, inplace=True)

# --- 3. Process the target column ---
# Convert the target column to string, lowercase it, and map "active" to 1 and "inactive" to 0.
df[target_col] = df[target_col].astype(str).str.lower().map({'active': 1, 'inactive': 0})
# Keep only rows with valid targets (0 or 1)
df = df[df[target_col].isin([0, 1])]

# --- 4. Create a balanced version of the dataset ---
class_counts = df[target_col].value_counts()
min_count = class_counts.min()
df_balanced = df.groupby(target_col).apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)

# --- 5. Define modeling parameters ---
weighting_types = ['uniform', 'distance']
n_neighbors_list = [1, 3, 5, 9, 11, 15, 21]
dataset_types = {'unbalanced': df, 'balanced': df_balanced}

results = []  # list to store results

# --- 6. Loop over each dataset type and parameter combination ---
for balance_type, dataset in dataset_types.items():
    # Separate features (all columns except the target) and target (last column)
    X = dataset.iloc[:, :-1].values
    y = dataset[target_col].values

    # Split into training (70%) and test (30%) partitions (using stratification)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    
    # Normalize (standardize) the features: fit scaler on training, then transform both sets.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Loop over weighting types and n_neighbors values
    for weight in weighting_types:
        for n in n_neighbors_list:
            clf = KNeighborsClassifier(n_neighbors=n, weights=weight)
            clf.fit(X_train_scaled, y_train)
            
            train_score = clf.score(X_train_scaled, y_train)
            test_score = clf.score(X_test_scaled, y_test)
            
            results.append({
                'Dataset': balance_type,
                'Weighting': weight,
                'n_neighbors': n,
                'Train Accuracy': train_score,
                'Test Accuracy': test_score,
                'Difference (Train - Test)': train_score - test_score
            })
            
            # Print the model parameters and corresponding accuracies
            print(f"Dataset: {balance_type}, Weighting: {weight}, n_neighbors: {n}, "
                  f"Train Acc: {train_score:.4f}, Test Acc: {test_score:.4f}")

# --- 7. Create a summary table of results ---
results_df = pd.DataFrame(results)
print("\nSummary Table:")
print(results_df)

# --- 8. Identify the best parameter combinations ---
best_test = results_df.loc[results_df['Test Accuracy'].idxmax()]
largest_diff = results_df.loc[results_df['Difference (Train - Test)'].abs().idxmax()]
smallest_diff = results_df.loc[results_df['Difference (Train - Test)'].abs().idxmin()]

print("\nBest Test Performance:")
print(best_test)

print("\nLargest Difference (between Train and Test performance):")
print(largest_diff)

print("\nSmallest Difference (between Train and Test performance):")
print(smallest_diff)


Dataset: unbalanced, Weighting: uniform, n_neighbors: 1, Train Acc: 1.0000, Test Acc: 0.8333
Dataset: unbalanced, Weighting: uniform, n_neighbors: 3, Train Acc: 0.9412, Test Acc: 0.9000
Dataset: unbalanced, Weighting: uniform, n_neighbors: 5, Train Acc: 0.8971, Test Acc: 0.9000
Dataset: unbalanced, Weighting: uniform, n_neighbors: 9, Train Acc: 0.8971, Test Acc: 0.9000
Dataset: unbalanced, Weighting: uniform, n_neighbors: 11, Train Acc: 0.8971, Test Acc: 0.9000
Dataset: unbalanced, Weighting: uniform, n_neighbors: 15, Train Acc: 0.8971, Test Acc: 0.9000
Dataset: unbalanced, Weighting: uniform, n_neighbors: 21, Train Acc: 0.8971, Test Acc: 0.9000
Dataset: unbalanced, Weighting: distance, n_neighbors: 1, Train Acc: 1.0000, Test Acc: 0.8333
Dataset: unbalanced, Weighting: distance, n_neighbors: 3, Train Acc: 1.0000, Test Acc: 0.9000
Dataset: unbalanced, Weighting: distance, n_neighbors: 5, Train Acc: 1.0000, Test Acc: 0.9000
Dataset: unbalanced, Weighting: distance, n_neighbors: 9, Train 

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 14, n_neighbors = 15