In [69]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,accuracy_score,mean_absolute_error,root_mean_squared_error,mean_squared_error,confusion_matrix,classification_report,roc_curve,RocCurveDisplay,roc_auc_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,LogisticRegression
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from warnings import filterwarnings
from sklearn.neighbors import KNeighborsClassifier

In [37]:
import os
# Make the shared dataset folder the working directory; the candidate-path
# search in the following cell builds some of its paths from os.getcwd().
os.chdir("D:/meridianthe4/PML/Datasets")

In [38]:
# Locate the Wisconsin breast-cancer CSV by probing several candidate paths
# and load the first one that is an actual file into `df` (first CSV column
# becomes the index).
candidates = [
	'D:/meridianthe4/PML/Cases/Wisconsin/BreastCancer.csv',
	'D:/meridianthe4/PML/Cases/WisconsinBreastCancer.csv',
	os.path.join(os.getcwd(), 'Cases', 'Wisconsin', 'BreastCancer.csv'),
	os.path.join(os.getcwd(), 'Cases', 'WisconsinBreastCancer.csv'),
	# Optional overrides: only present if some earlier cell defined them.
	globals().get('alt_path'),
	globals().get('file_path'),
]

# Drop unset entries, normalize, THEN de-duplicate. Normalizing first means two
# spellings that normpath maps to the same string collapse into one entry
# (de-duplicating the raw strings would keep both). dict.fromkeys preserves
# the original priority order.
candidates = list(dict.fromkeys(os.path.normpath(p) for p in candidates if p))

for p in candidates:
	# isfile rather than exists: a directory that happens to match a candidate
	# must be skipped instead of being handed to read_csv.
	if os.path.isfile(p):
		df = pd.read_csv(p, index_col=0)
		break
else:
	# for/else: reached only when no candidate triggered the break above.
	raise FileNotFoundError(f"Could not find dataset. Tried paths: {candidates}")

df

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign
...,...,...,...,...,...,...,...,...,...,...
1369821,10,10,10,10,5,10,10,10,7,Malignant
1371026,5,10,10,10,4,10,5,6,3,Malignant
1371920,5,1,1,1,2,1,3,2,1,Benign
8233704,4,1,1,1,1,1,2,1,1,Benign


In [39]:
# Replace the string labels in 'Class' with integer codes (in place on df).
le = LabelEncoder()
df['Class'] = le.fit_transform(df['Class'])

# Target is the encoded 'Class' column; everything else is a feature.
y = df['Class']
X = df.drop(columns='Class')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
    stratify=y,
)

# Baseline 5-nearest-neighbour classifier and its hold-out predictions.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [40]:
# Hold-out accuracy, reported twice: once through the estimator's own score()
# and once through accuracy_score on the predictions from the training cell.
print("Accuracy:", knn.score(X=X_test, y=y_test))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# ROC AUC is computed from the second probability column returned by
# predict_proba rather than from the hard predictions.
y_pred_proba = knn.predict_proba(X_test)
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba[:, 1]))

Accuracy: 0.9761904761904762
0.9761904761904762
ROC AUC: 0.9908917069243157


In [41]:
# Try every neighbourhood size from 1 to 15 and record the hold-out ROC AUC.
# NOTE(review): k is ranked on the same split used for reporting, so the best
# score shown is an optimistic estimate.
Ks = np.arange(1, 16)
scores = []
for k in Ks:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_prob = knn.predict_proba(X_test)
    auc_k = roc_auc_score(y_test, y_pred_prob[:, 1])
    scores.append([k, auc_k])

# One row per k, best (highest AUC) first.
df_scores = pd.DataFrame(scores, columns=['k', 'score'])
df_scores.sort_values(by='score', ascending=False)

Unnamed: 0,k,score
3,4,0.991194
4,5,0.990892
6,7,0.990841
5,6,0.990791
7,8,0.99064
8,9,0.990489
9,10,0.990338
10,11,0.990137
11,12,0.989785
12,13,0.989634


In [42]:
import os
# Switch the working directory to the case-study folder; the HR file in the
# next cell is read by bare filename relative to this directory.
os.chdir("D:\\meridianthe4\\PML\\Cases")

In [43]:
# Load the HR dataset; the relative filename is resolved against the current
# working directory.
hr = pd.read_csv('HR_comma_sep.csv')

In [44]:
# 'left' is the target; every remaining column is a feature.
y = hr['left']
X = hr.drop(columns='left')

In [45]:
# One-hot encode the object-dtype columns (dropping the first level of each)
# and pass all other columns through unchanged. Both the encoder and the
# column transformer are configured to return pandas DataFrames.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform="pandas")

col_transformer = ColumnTransformer(
    transformers=[("OHE", ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Overwrites X with its encoded version.
X = col_transformer.fit_transform(X)

In [46]:
# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
    stratify=y,
)

In [47]:
# Evaluate k = 1..15 on the HR data and collect the hold-out ROC AUC per k.
# X is used exactly as produced by the column transformer; no scaling is
# applied in this cell.
Ks = np.arange(1, 16)
scores = []
for k in Ks:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_prob = knn.predict_proba(X_test)
    auc_k = roc_auc_score(y_test, y_pred_prob[:, 1])
    scores.append([k, auc_k])

# Tabulate and show the highest AUC first.
df_scores = pd.DataFrame(scores, columns=['k', 'score'])
df_scores.sort_values(by='score', ascending=False)

Unnamed: 0,k,score
7,8,0.974952
6,7,0.974829
8,9,0.974618
9,10,0.974376
11,12,0.974147
12,13,0.974071
5,6,0.974021
10,11,0.973785
13,14,0.973776
14,15,0.973707


In [48]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor

In [49]:
# Load the Boston housing data from its absolute location on the D: drive.
boston = pd.read_csv("D:\\meridianthe4\\PML\\Datasets\\Boston.csv")

In [50]:
# Display the loaded frame for a quick visual check of its columns and values.
boston

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [51]:
# 'medv' is the regression target; all other columns are features.
y = boston['medv']
X = boston.drop(columns='medv')

In [52]:
# 70/30 hold-out split with a fixed seed (no stratification: numeric target).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [53]:
# Sweep k = 1..15 for a KNN regressor on the unscaled features and record the
# hold-out mean absolute error for each k.
Ks = np.arange(1, 16)
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mae_k = mean_absolute_error(y_test, y_pred)
    scores.append([k, mae_k])

# Lower MAE is better, so the default ascending sort puts the best k on top.
df_scores = pd.DataFrame(scores, columns=['k', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,k,score
4,5,4.276316
0,1,4.3
5,6,4.362829
3,4,4.384868
7,8,4.414145
6,7,4.443609
9,10,4.481118
8,9,4.496345
1,2,4.5
10,11,4.530801


In [60]:
# Refit the 5-neighbour regressor on the unscaled features and report its
# hold-out mean absolute error.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

MAE: 4.276315789473684


In [61]:
# Standardise the features: the scaler learns its statistics from the training
# split only and is then reused, unchanged, on the test split.
scaler = StandardScaler().set_output(transform="pandas")
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same 5-neighbour regressor as before, now on the standardised features.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

MAE: 2.8689473684210527


In [63]:
# Repeat the k = 1..15 sweep, this time on the standardised train/test
# features produced by the scaling cell.
Ks = np.arange(1, 16)
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    mae_k = mean_absolute_error(y_test, y_pred)
    scores.append([k, mae_k])

# Smallest hold-out MAE first.
df_scores = pd.DataFrame(scores, columns=['k', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,k,score
0,1,2.65
2,3,2.689912
3,4,2.719737
1,2,2.807566
5,6,2.864583
4,5,2.868947
14,15,2.928509
7,8,2.928783
10,11,2.941268
11,12,2.942654


In [64]:
# Rebuild the scaled splits from scratch (scaler fitted on the training split
# only), then evaluate a 3-neighbour regressor on them.
scaler = StandardScaler().set_output(transform="pandas")
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsRegressor(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

MAE: 2.6899122807017544


### Housing Dataset

In [67]:
# Load the Housing data from its absolute location on the D: drive.
housing = pd.read_csv("D:\\meridianthe4\\PML\\Datasets\\Housing.csv")

In [68]:
# Display the loaded frame for a quick visual check of its columns and values.
housing

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,yes,yes,no,no,yes,0,no
542,94000.0,6000,3,2,4,yes,no,no,no,yes,0,no
543,103000.0,6000,3,2,4,yes,yes,no,no,yes,1,no
544,105000.0,6000,3,2,2,yes,yes,no,no,yes,1,no


In [71]:
# 'price' is the regression target; all other columns are features.
y = housing['price']
X = housing.drop(columns='price')

In [72]:
# Encode the object-dtype columns as one-hot indicators (first level dropped)
# while leaving the remaining columns as they are. Output is kept as pandas
# DataFrames with the original, un-prefixed feature names.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform="pandas")

col_transformer = ColumnTransformer(
    transformers=[("OHE", ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Overwrites X with its encoded version.
X = col_transformer.fit_transform(X)

In [73]:
# 70/30 hold-out split with a fixed seed (no stratification: numeric target).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)

In [74]:
# Sweep k = 1..15 for a KNN regressor on the encoded but unscaled housing
# features, recording the hold-out mean absolute error for each k.
Ks = np.arange(1, 16)
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mae_k = mean_absolute_error(y_test, y_pred)
    scores.append([k, mae_k])

# Smallest hold-out MAE first.
df_scores = pd.DataFrame(scores, columns=['k', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,k,score
2,3,16956.605691
4,5,16974.496341
5,6,17033.884146
13,14,17095.358014
3,4,17185.521341
14,15,17270.135772
11,12,17344.276931
10,11,17398.662417
12,13,17427.580675
6,7,17518.982578


In [75]:
# Standardise the housing features. The scaler is fitted on the training split
# only; the same fitted scaler then transforms both splits.
scaler = StandardScaler()
scaler.set_output(transform="pandas")
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [76]:
# Repeat the k = 1..15 sweep on the standardised housing features and record
# the hold-out mean absolute error for each k.
Ks = np.arange(1, 16)
scores = []
for k in Ks:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    mae_k = mean_absolute_error(y_test, y_pred)
    scores.append([k, mae_k])

# Smallest hold-out MAE first.
df_scores = pd.DataFrame(scores, columns=['k', 'score'])
df_scores.sort_values(by='score')

Unnamed: 0,k,score
3,4,12407.129573
4,5,12642.542683
9,10,12731.168293
14,15,12776.493902
10,11,12787.108647
13,14,12793.261324
12,13,12814.899156
11,12,12855.385671
8,9,12867.924119
7,8,12935.713415
