<a href="https://colab.research.google.com/github/mgmeti/BasicsOfML/blob/main/knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [152]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

* ### KNN from Scratch

In [168]:
class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x):
        distances = [self.euclidean_distance(x, x_train) for _, x_train in self.X_train.iterrows()]
        k_neighbors_indices = np.argsort(distances)[:self.k]
        k_neighbor_labels = [self.y_train.iloc[i] for i in k_neighbors_indices]
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]

    def euclidean_distance(self, row1, row2):
        distance = 0.0
        for i in range(len(row1)):
            if isinstance(row1[i], str) and isinstance(row2[i], str):
                distance += int(row1[i] != row2[i])
            else:
                distance += (float(row1[i]) - float(row2[i]))**2
        return np.sqrt(distance)


In [127]:
def tune_k_knn(X, y, k_values):
    best_k = None
    best_accuracy = 0.0
    best_classifier = None

    for k in k_values:
        knn_classifier = KNNClassifier(k=k)
        X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=42)
        knn_classifier.fit(X_train, y_train)

        # Perform cross-validation and get accuracy
        predictions_cv = [knn_classifier.predict(x) for _, x in X_cv.iterrows()]
        accuracy_cv = accuracy_score(y_cv, predictions_cv)

        if accuracy_cv > best_accuracy:
            best_accuracy = accuracy_cv
            best_k = k
            best_classifier = knn_classifier

    return best_k, best_classifier


In [118]:
def k_fold_cross_validation_knn(knn_classifier, X, y, k_folds=5):
    n = len(X)
    fold_size = n // k_folds
    indices = np.arange(n)

    accuracies = []

    for i in range(k_folds):
        test_indices = indices[i * fold_size:(i + 1) * fold_size]
        train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])

        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        knn_classifier.fit(X_train, y_train)
        predictions = [knn_classifier.predict(x) for _, x in X_test.iterrows()]

        accuracy_fold = accuracy_score(y_test, predictions)
        accuracies.append(accuracy_fold)

    return np.mean(accuracies)

 ### Implementation

* ### car

In [169]:


# Step 1: Read CSV into DataFrame without column names
file_path = '/content/car.csv'  # Replace with the path to your CSV file
df1 = pd.read_csv(file_path, header=None)

# Step 2: Print the head of the DataFrame
print("Original DataFrame:")
print(df1.head())

# Step 3: Remove NA values
df1 = df1.dropna()

# Step 4: Assign column names (you can replace these with your actual column names)
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety","class"]  # Replace with your column names
df1.columns = column_names

# Step 5: Encode categorical values
le = LabelEncoder()
for column in df1.columns:
    if df1[column].dtype == 'object':  # Check if the column contains categorical values
        df1[column] = le.fit_transform(df1[column])

# Step 6: Print head of the cleaned and encoded DataFrame
print("\nDataFrame after cleaning and encoding:")
print(df1.head())


Original DataFrame:
       0      1  2  3      4     5      6
0  vhigh  vhigh  2  2  small   low  unacc
1  vhigh  vhigh  2  2  small   med  unacc
2  vhigh  vhigh  2  2  small  high  unacc
3  vhigh  vhigh  2  2    med   low  unacc
4  vhigh  vhigh  2  2    med   med  unacc

DataFrame after cleaning and encoding:
   buying  maint  doors  persons  lug_boot  safety  class
0       3      3      0        0         2       1      2
1       3      3      0        0         2       2      2
2       3      3      0        0         2       0      2
3       3      3      0        0         1       1      2
4       3      3      0        0         1       2      2


In [170]:
# Extract features (X) and target (y)
X = df1.drop('class', axis=1)  # Drop the 'Class' column to get features
y = df1['class']  # Target variable

# Calculate mean and standard deviation for each feature
means = X.mean()
std_devs = X.std()

# Normalize features manually to have mean 0 and variance 1
X_normalized = (X - means) / std_devs

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)


# Specify a range of k values to tune
k_values_to_tune = [1, 3, 5, 7, 9]

# Perform tuning and get the best k and classifier
best_k, best_classifier = tune_k_knn(X_train, y_train, k_values_to_tune)

# Use the best classifier to make predictions on X_test
# (Assuming X_test is already defined)
predictions_test = [best_classifier.predict(x) for _, x in X_test.iterrows()]

# Calculate accuracy
best_accuracy = accuracy_score(y_test, predictions_test)

print(f"Best k: {best_k}")
print(f"Best Accuracy: {best_accuracy:.2f}")
print(f"Predictions on X_test: {predictions_test}")

Best k: 5
Best Accuracy: 0.92
Predictions on X_test: [2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 0, 3, 2, 0, 0, 2, 2, 2, 2, 2, 1, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3, 0, 2, 0, 3, 3, 2, 1, 2, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 3, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 3, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 3, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 1, 3, 0, 2, 2, 2, 2, 2, 3, 0, 0, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 2, 2, 2

In [171]:


# Extract features (X) and target (y)
X = df1.drop('class', axis=1)
y = df1['class']



# Normalize features using scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)

# Perform k-fold cross-validation using scikit-learn
knn_classifier = KNeighborsClassifier()
cv_scores = cross_val_score(knn_classifier, X_normalized, y, cv=10)

# Print mean accuracy across folds
print(f"Mean Accuracy with 10-fold Cross-Validation: {np.mean(cv_scores):.2f}")

# Tune k using GridSearchCV
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best k from grid search
best_k = grid_search.best_params_['n_neighbors']
print(f"Best k: {best_k}")

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
predictions_test_skl = best_model.predict(X_test)

# Calculate accuracy on the test set
accuracy_test = accuracy_score(y_test, predictions_test_skl)
print(f"Accuracy on the Test Set: {accuracy_test:.2f}")


Mean Accuracy with 10-fold Cross-Validation: 0.83
Best k: 5
Accuracy on the Test Set: 0.92


In [172]:
# Paired t-test
t_stat, p_value = ttest_rel(predictions_test, predictions_test_skl)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is a significant difference.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference.")

Reject the null hypothesis. There is a significant difference.


* ### cancer

In [173]:

# Step 1: Read CSV into DataFrame without column names
file_path = '/content/breast-cancer.csv'  # Replace with the path to your CSV file
df2 = pd.read_csv(file_path, header=None)

# Step 2: Print the head of the DataFrame
print("Original DataFrame:")
print(df2.head())

# Step 3: Remove NA values
df2 = df2.dropna()

# Step 4: Assign column names (you can replace these with your actual column names)
column_names = [
    'class',
    'age',
    'menopause',
    'tumor-size',
    'inv-nodes',
    'node-caps',
    'deg-malig',
    'breast',
    'breast-quad',
    'irradiat'
]
df2.columns = column_names

# Step 5: Encode categorical values
le = LabelEncoder()
for column in df2.columns:
    if df2[column].dtype == 'object':  # Check if the column contains categorical values
        df2[column] = le.fit_transform(df2[column])

# Step 6: Print head of the cleaned and encoded DataFrame
print("\nDataFrame after cleaning and encoding:")
print(df2.head())


Original DataFrame:
                      0      1        2      3    4   5  6      7          8  \
0  no-recurrence-events  30-39  premeno  30-34  0-2  no  3   left   left_low   
1  no-recurrence-events  40-49  premeno  20-24  0-2  no  2  right   right_up   
2  no-recurrence-events  40-49  premeno  20-24  0-2  no  2   left   left_low   
3  no-recurrence-events  60-69     ge40  15-19  0-2  no  2  right    left_up   
4  no-recurrence-events  40-49  premeno    0-4  0-2  no  2  right  right_low   

    9  
0  no  
1  no  
2  no  
3  no  
4  no  

DataFrame after cleaning and encoding:
   class  age  menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  \
0      0    1          2           5          0          1          3       0   
1      0    2          2           3          0          1          2       1   
2      0    2          2           3          0          1          2       0   
3      0    4          0           2          0          1          2       1   
4     

In [174]:
# Extract features (X) and target (y)
X = df2.drop('class', axis=1)  # Drop the 'Class' column to get features
y = df2['class']  # Target variable

# Calculate mean and standard deviation for each feature
means = X.mean()
std_devs = X.std()

# Normalize features manually to have mean 0 and variance 1
X_normalized = (X - means) / std_devs

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)


# Specify a range of k values to tune
k_values_to_tune = [1, 3, 5, 7, 9]

# Perform tuning and get the best k and classifier
best_k, best_classifier = tune_k_knn(X_train, y_train, k_values_to_tune)

# Use the best classifier to make predictions on X_test
# (Assuming X_test is already defined)
predictions_test = [best_classifier.predict(x) for _, x in X_test.iterrows()]

# Calculate accuracy
best_accuracy = accuracy_score(y_test, predictions_test)

print(f"Best k: {best_k}")
print(f"Best Accuracy: {best_accuracy:.2f}")
print(f"Predictions on X_test: {predictions_test}")

Best k: 7
Best Accuracy: 0.67
Predictions on X_test: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [175]:


# Extract features (X) and target (y)
X = df2.drop('class', axis=1)
y = df2['class']



# Normalize features using scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)

# Perform k-fold cross-validation using scikit-learn
knn_classifier = KNeighborsClassifier()
cv_scores = cross_val_score(knn_classifier, X_normalized, y, cv=10)

# Print mean accuracy across folds
print(f"Mean Accuracy with 10-fold Cross-Validation: {np.mean(cv_scores):.2f}")

# Tune k using GridSearchCV
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best k from grid search
best_k = grid_search.best_params_['n_neighbors']
print(f"Best k: {best_k}")

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
predictions_test_skl = best_model.predict(X_test)

# Calculate accuracy on the test set
accuracy_test = accuracy_score(y_test, predictions_test_skl)
print(f"Accuracy on the Test Set: {accuracy_test:.2f}")


Mean Accuracy with 10-fold Cross-Validation: 0.69
Best k: 7
Accuracy on the Test Set: 0.67


In [176]:
# Paired t-test
t_stat, p_value = ttest_rel(predictions_test, predictions_test_skl)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is a significant difference.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference.")

Fail to reject the null hypothesis. There is no significant difference.


* ### roth

In [177]:

# Step 1: Read CSV into DataFrame without column names
file_path = '/content/hayes-roth.csv'  # Replace with the path to your CSV file
df3 = pd.read_csv(file_path, header=None)

# Step 2: Print the head of the DataFrame
print("Original DataFrame:")
print(df3.head())

# Step 3: Remove NA values
df3 = df3.dropna()

# Step 4: Assign column names (you can replace these with your actual column names)

column_names

column_names = [
    'name',
    'hobby',
    'age',
    'educational_level',
    'marital_status',
    'class'
]

df3.columns = column_names

# Step 5: Encode categorical values
le = LabelEncoder()
for column in df3.columns:
    if df3[column].dtype == 'object':  # Check if the column contains categorical values
        df3[column] = le.fit_transform(df3[column])

# Step 6: Print head of the cleaned and encoded DataFrame
print("\nDataFrame after cleaning and encoding:")
print(df3.head())


Original DataFrame:
     0  1  2  3  4  5
0   92  2  1  1  2  1
1   10  2  1  3  2  2
2   83  3  1  4  1  3
3   61  2  4  2  2  3
4  107  1  1  3  4  3

DataFrame after cleaning and encoding:
   name  hobby  age  educational_level  marital_status  class
0    92      2    1                  1               2      1
1    10      2    1                  3               2      2
2    83      3    1                  4               1      3
3    61      2    4                  2               2      3
4   107      1    1                  3               4      3


In [178]:
# Extract features (X) and target (y)
X = df3.drop('class', axis=1)  # Drop the 'Class' column to get features
y = df3['class']  # Target variable

# Calculate mean and standard deviation for each feature
means = X.mean()
std_devs = X.std()

# Normalize features manually to have mean 0 and variance 1
X_normalized = (X - means) / std_devs

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)


# Specify a range of k values to tune
k_values_to_tune = [1, 3, 5, 7, 9]

# Perform tuning and get the best k and classifier
best_k, best_classifier = tune_k_knn(X_train, y_train, k_values_to_tune)

# Use the best classifier to make predictions on X_test
# (Assuming X_test is already defined)
predictions_test = [best_classifier.predict(x) for _, x in X_test.iterrows()]

# Calculate accuracy
best_accuracy = accuracy_score(y_test, predictions_test)

print(f"Best k: {best_k}")
print(f"Best Accuracy: {best_accuracy:.2f}")
print(f"Predictions on X_test: {predictions_test}")

Best k: 1
Best Accuracy: 0.58
Predictions on X_test: [2, 2, 2, 2, 1, 2, 2, 2, 3, 1, 1, 2, 1, 3, 3, 1, 2, 2, 1, 2, 2, 2, 1, 3, 3, 1, 1, 2, 2, 1, 1, 3, 1]


Sk-learn

In [179]:


# Extract features (X) and target (y)
X = df3.drop('class', axis=1)
y = df3['class']



# Normalize features using scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.25, random_state=42)

# Perform k-fold cross-validation using scikit-learn
knn_classifier = KNeighborsClassifier()
cv_scores = cross_val_score(knn_classifier, X_normalized, y, cv=10)

# Print mean accuracy across folds
print(f"Mean Accuracy with 10-fold Cross-Validation: {np.mean(cv_scores):.2f}")

# Tune k using GridSearchCV
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best k from grid search
best_k = grid_search.best_params_['n_neighbors']
print(f"Best k: {best_k}")

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
predictions_test_skl = best_model.predict(X_test)

# Calculate accuracy on the test set
accuracy_test = accuracy_score(y_test, predictions_test_skl)
print(f"Accuracy on the Test Set: {accuracy_test:.2f}")


Mean Accuracy with 10-fold Cross-Validation: 0.43
Best k: 1
Accuracy on the Test Set: 0.61


In [180]:
# Paired t-test
t_stat, p_value = ttest_rel(predictions_test, predictions_test_skl)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is a significant difference.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference.")

Fail to reject the null hypothesis. There is no significant difference.
