In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder



In [3]:
# Location of the dataset, built from the current user's home directory so the
# notebook does not embed a specific user name in an absolute path.
# NOTE(review): this still assumes the OneDrive folder layout below exists on the
# machine running the notebook -- adjust DATA_PATH if the file lives elsewhere.
DATA_PATH = (
    Path.home()
    / "Library/CloudStorage/OneDrive-UniversityofToronto"
    / "Lectures/Applied Machine Learning/LungDS.csv"
)

# Load the CSV into a DataFrame and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol Usage,Genetic Risk,Lung Disease,Obesity,Smoking,Passive Smoker,Chest Pain,Coughing of Blood,Severity
0,33,1,2,4,3,2,4,3,2,2,4,0
1,17,1,3,1,4,2,2,2,4,2,3,1
2,35,1,4,5,5,4,7,2,3,4,8,2
3,37,1,7,7,6,7,7,7,7,7,8,2
4,46,1,6,8,7,6,7,8,7,7,9,2


In [4]:
# Split data into features and label
X = df[['Age', 'Alcohol Usage', 'Obesity', 'Smoking']]
y = df['Severity']

# Split data into training and testing sets.
#
# Note 1: test_size=0.2 determines the proportion of the dataset to include in the
# test split. Here 20% of the data is used for testing and the remaining 80% is
# used for training.
#
# Note 2: random_state=42 seeds the random number generator. With a fixed value
# (like 42) the split is reproducible: running the code multiple times with the
# same random_state gives the same train/test split, which keeps results
# consistent across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preview the selected feature columns
X.head()

Unnamed: 0,Age,Alcohol Usage,Obesity,Smoking
0,33,4,4,3
1,17,1,2,2
2,35,5,7,2
3,37,7,7,7
4,46,8,7,8


In [5]:
# Scale the features
scaler = StandardScaler() # Assign an instance of the StandardScaler class and assign it to the var "scaler"
X_train = scaler.fit_transform(X_train) # Two things happen: fit = Scaler learns the mean and SD from the training data; transform = Scaler uses the learned parameters to scale the training data
X_test = scaler.transform(X_test) # The Scaler uses the learned parameters to scale the training data

""" 

Note 1: StandardScaler standardizes features by removing the mean and scaling to unit variance. 

Note 2: While the train set has both fit and transform used on it, the test set only has transform 
used on it because you should always use the scaling parameters learned from the training set to scale the testing set (or any new data).
This ensures that the model sees the same distribution during training and testing

"""

' \n\nNote 1: StandardScaler standardizes features by removing the mean and scaling to unit variance. \n\nNote 2: While the train set has both fit and transform used on it, the test set only has transform \nused on it because you should always use the scaling parameters learned from the training set to scale the testing set (or any new data).\nThis ensures that the model sees the same distribution during training and testing\n\n'

In [6]:
def accuracy(k, X_train, y_train, X_test, y_test):
    '''
    Return the accuracy of a k-nearest-neighbours classifier for a given k.

    A KNeighborsClassifier with n_neighbors=k is fitted on (X_train, y_train),
    used to predict labels for X_test, and the predictions are scored against
    y_test with accuracy_score.
    '''
    # fit() hands back the fitted estimator, so building and training are chained
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # compare the predicted labels for the held-out samples with the true labels
    return accuracy_score(y_test, classifier.predict(X_test))


# Upper bound of the search: the square root of the number of rows in df, truncated to an int
n_rows = df.shape[0]
max_k = int(np.sqrt(n_rows))
print(f"The length of the dataframe is: {n_rows}")
print(f"The max K value is: {max_k}")

# Score every candidate k from 1 to max_k (inclusive).
# NOTE(review): each candidate is scored on (X_test, y_test), so the k picked here
# is tuned on the test set and the test metrics reported later are optimistic.
# Consider selecting k with cross-validation on the training data instead.
scores_per_k = []
for candidate_k in range(1, max_k + 1):
    scores_per_k.append(accuracy(candidate_k, X_train, y_train, X_test, y_test))

# argmax returns the 0-based position of the first best score; candidates start at k=1, hence +1
best_n_neighbours = np.argmax(np.array(scores_per_k)) + 1
print(best_n_neighbours)


The length of the dataframe is: 1465
The max K value is: 38
1


In [7]:
# Apply KNN using the k found by the search in the previous cell (best_n_neighbours)
# rather than a hard-coded value, so this cell stays in sync if the search result changes.
# int(...) converts the NumPy integer produced by np.argmax into a plain Python int.
knn = KNeighborsClassifier(n_neighbors=int(best_n_neighbours))
knn.fit(X_train, y_train)

# Predict using the test data
y_pred = knn.predict(X_test)

# Evaluate the model.
# NOTE(review): best_n_neighbours was chosen by scoring on this same test set, so
# the numbers below are an optimistic estimate of performance on unseen data.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[56  0  0  1]
 [ 0 57  0  1]
 [ 0  0 80  0]
 [ 3  4  0 91]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        57
           1       0.93      0.98      0.96        58
           2       1.00      1.00      1.00        80
           3       0.98      0.93      0.95        98

    accuracy                           0.97       293
   macro avg       0.97      0.97      0.97       293
weighted avg       0.97      0.97      0.97       293

