In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing 
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score

#Read in Data
# The dataset location can be overridden with the LUNG_DS_PATH environment
# variable so the notebook also runs on machines other than the author's.
# When the variable is unset, the original hard-coded location is used.
import os

DEFAULT_DATA_PATH = "/Users/myronmoskalyk/Library/CloudStorage/OneDrive-UniversityofToronto/Lectures/Applied Machine Learning/LungDS.csv"
df = pd.read_csv(os.environ.get("LUNG_DS_PATH", DEFAULT_DATA_PATH))
df.head()


In [None]:
""" 
Step 1: Exploration of Data

Includes for each RQ:
Correlation Table
Heatmap
Pairplots

"""

In [None]:
#General Correlation Table:
# `corr` keeps full precision; only the printed copy is rounded.
# (The original called round(corr, 2) and discarded the result, so the
# printed table was never actually rounded.)
corr = df.corr()
print("Correlation Table for the original dataset: ")
print(corr.round(2))

#Research Question 1:
# Binary target: Severity 0/1/2 -> 1 (cancer present), Severity 3 -> 0 (no cancer).
df_cancerpresent = df.copy()
df_cancerpresent.rename(columns={'Severity': 'Cancer Present'}, inplace=True)
df_cancerpresent.loc[df_cancerpresent['Cancer Present'].isin([0,1,2]), 'Cancer Present'] = 1 # 1 = Positive
df_cancerpresent.loc[df_cancerpresent['Cancer Present'] == 3, 'Cancer Present'] = 0 # 0 = Negative

#Correlation Table
# Full precision is kept in `corr1` (used by the heatmap); the printed copy is
# rounded. The original discarded the result of round(corr1, 2).
corr1 = df_cancerpresent.corr()
print("\n\nCorrelation Table for the RQ1 dataset: ")
print(corr1.round(2))

#Heatmap
# Drawn on its own titled figure and shown immediately so it cannot be
# confused with the other heatmaps produced by this cell.
plt.figure()
sns.heatmap(corr1)
plt.title('Correlation Heatmap (RQ1)')
plt.show()

#Pairplot
sns.pairplot(
    df_cancerpresent,
    hue='Cancer Present',
    vars=['Age','Gender', 'Air Pollution', 'Lung Disease', 'Passive Smoker', 'Chest Pain',
          'Coughing of Blood', 'Alcohol Usage', 'Obesity', 'Smoking','Genetic Risk'],
    palette="pastel",
)
plt.suptitle('Pair Plots of Features', y=1.02)
plt.show()

#Research Question 2:
# Keep only patients with cancer (Severity 3 marks the non-cancer group).
canceronly = df[df['Severity'] != 3]

#Correlation Table
# Full precision is kept in `corr2` (used by the heatmap); the printed copy is
# rounded. The original discarded the result of round(corr2, 2).
corr2 = canceronly.corr()
print("\n\nCorrelation Table for the RQ2 dataset: ")
print(corr2.round(2))

#Heatmap
# Own figure + explicit show so it is rendered separately from the pair plot.
plt.figure()
sns.heatmap(corr2)
plt.title('Correlation Heatmap (RQ2)')
plt.show()

#Pairplot
sns.pairplot(
    canceronly,
    hue='Severity',
    vars=['Age','Gender','Air Pollution','Alcohol Usage','Genetic Risk',
          'Lung Disease','Obesity','Smoking', 'Passive Smoker', 'Chest Pain', 'Coughing of Blood'],
    palette="pastel",
)
plt.suptitle('Pair Plots of Features', y=1.02)
# The original had `plt.show` without parentheses, which is a no-op expression.
plt.show()

In [None]:
"""
Step 2: Research Question 1

Building a model to help doctors identify patients that may be at risk for lung cancer.

Note: The output of the model is not meant to directly recommend a patient for testing,
but instead should be used to help guide doctors to patients that might need to be investigated. 
Only after a doctor has met with and looked into a patient's medical records more thoroughly should 
patients be recommended (by the doctor) for testing.

Associated Steps:

1. Import libraries (see above)
2. Read in dataset (see above)
3. Convert Severity to Presence
4. Split (RQ1 variables only) data into features and label 
5. Find optimal K to maximize Class 1 (C1) Precision and Overall Accuracy
    a. Plot average C1 precision over multiple random states (RS) against K
    b. Plot average accuracy over multiple RS against K
    c. Choose optimal K based on a local maximum of accuracy/local minimum of error to avoid overfitting/underfitting
6. Apply KNN to create a classification report and confusion matrix

"""



In [None]:
#STEP 1: See above

#STEP 2: See above


#STEP 3: Convert Severity to Presence
# Work on a renamed copy so the original `df` keeps its 'Severity' column.
df_cancerpresent = df.rename(columns={'Severity': 'Cancer Present'})

# Both masks are taken from the original severity codes before any value is
# overwritten: 0/1/2 become 1 (Positive), 3 becomes 0 (Negative).
severity_is_cancer = df_cancerpresent['Cancer Present'].isin([0, 1, 2])
severity_is_healthy = df_cancerpresent['Cancer Present'] == 3
df_cancerpresent.loc[severity_is_cancer, 'Cancer Present'] = 1  # 1 = Positive
df_cancerpresent.loc[severity_is_healthy, 'Cancer Present'] = 0  # 0 = Negative


#STEP 4: Split data into features and label
y = df_cancerpresent['Cancer Present']
X = df_cancerpresent[['Genetic Risk', 'Alcohol Usage', 'Obesity', 'Smoking']]


#STEP 5: Find Optimal K

# Upper bound for K: integer part of the square root of the number of rows.
max_k = int(np.sqrt(len(df_cancerpresent)))
print(f"The length of the dataframe is: {len(df_cancerpresent)}")
print(f"The max K value is: {max_k}")

# Running per-K totals (one slot for each K in 1..max_k-1), filled in by the
# search loops below.
sum_precision = np.zeros(max_k - 1)
sum_accuracies = np.zeros(max_k - 1)

# Find Optimal K for ACCURACY and CLASS 1 PRECISION
#
# Both metrics are computed in one pass over the same splits and the same
# fitted models. The original ran two copy-pasted loops that repeated every
# split, scaling step and KNN fit, and hard-coded the number of random states
# in four places.
# NOTE(review): K is selected using test-set metrics; a separate validation
# split or cross-validation would avoid tuning on the test data.
n_random_states = 10                            # number of train/test splits averaged over
random_states = range(1, n_random_states + 1)   # seeds 1..10, as before
k_values = range(1, max_k)                      # candidate K values 1..max_k-1

# One row per random state, one column per K.
accuracy_by_rs = np.zeros((n_random_states, len(k_values)))
precision_by_rs = np.zeros((n_random_states, len(k_values)))

for row, RS in enumerate(random_states):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RS)
    # Kept at module level on purpose: later code in this cell reuses `scaler`.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # learn mean/SD on the training split and scale it
    X_test = scaler.transform(X_test)        # scale the TEST split with the training parameters

    for col, k in enumerate(k_values):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy_by_rs[row, col] = accuracy_score(y_test, y_pred)
        # pos_label=1 -> precision of Class 1 (cancer present)
        precision_by_rs[row, col] = precision_score(y_test, y_pred, pos_label=1)

# Same module-level results as before: per-K totals and per-K averages.
sum_accuracies = accuracy_by_rs.sum(axis=0)
sum_precision = precision_by_rs.sum(axis=0)
avg_accuracy = sum_accuracies / n_random_states
avg_prec = sum_precision / n_random_states


def _plot_metric_vs_k(ks, curves, labels, title, ylabel, averaged=False):
    """Plot one or more metric-vs-K curves on a new 12x6 figure and show it.

    ks       -- K values for the x axis
    curves   -- iterable of y-value sequences, one per curve
    labels   -- legend label for each curve
    title    -- figure title
    ylabel   -- y-axis label
    averaged -- True draws the solid black "average" style, False the dashed
                per-random-state style
    """
    plt.figure(figsize=(12, 6))
    for curve, label in zip(curves, labels):
        if averaged:
            plt.plot(ks, curve, marker="o", linestyle="-", color="black",
                     markersize=5, linewidth=2, label=label)
        else:
            plt.plot(ks, curve, marker="o", linestyle="dashed", markersize=3, label=label)
    plt.title(title, fontsize=20)
    plt.xlabel("K - Values", fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    plt.xticks(ks)
    if averaged:
        plt.legend(loc="lower right", fontsize=10)  # Reduced fontsize for legend
    else:
        plt.legend(loc="lower right")  # Display legend to indicate random states
    plt.show()


rs_labels = [f"RS = {RS}" for RS in random_states]

# Accuracy: one curve per random state, then the average
_plot_metric_vs_k(k_values, accuracy_by_rs, rs_labels,
                  "Accuracy vs K Value for Different Random States", "Accuracy Score")
_plot_metric_vs_k(k_values, [avg_accuracy], ["Average Accuracy"],
                  "Averaged Accuracy vs K Value", "Average Accuracy Score", averaged=True)

# Class 1 precision: one curve per random state, then the average
_plot_metric_vs_k(k_values, precision_by_rs, rs_labels,
                  "Class 1 Precision vs K Value for Different Random States", "Precision Score")
_plot_metric_vs_k(k_values, [avg_prec], ["Average Precision"],
                  "Averaged Precision vs K Value", "Average Precision Score", averaged=True)


#STEP 6: Apply KNN

# Final split into train and test RS 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Use a fresh scaler instead of relying on the one left over from the last
# iteration of the K-search loop.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The best K value from a combination of class 1 Prec and Accuracy seems to be at K=20
best_k = 20
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Predict using the test data
y_pred = knn.predict(X_test)

# Train accuracy
y_train_pred = knn.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")

# Test accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# C1 Precision
class1_prec = precision_score(y_test, y_pred, pos_label=1)
print(f"Class 1 Precision: {class1_prec * 100:.2f}%")

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Create confusion matrix
# Sorted class labels: `.unique()` returns labels in order of first appearance,
# which would make the heatmap's axis order arbitrary and possibly different
# from the matrix printed above.
classes = np.sort(df_cancerpresent['Cancer Present'].unique())

# Plotting the confusion matrix
plt.figure(figsize=(7,5))
sns.heatmap(confusion_matrix(y_test, y_pred, labels=classes), cmap='Blues', annot=True, fmt='g', xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
"""

Step 3: Research Question 2

Building a model to identify lung cancer cases with the highest severity to support clinical findings on patients undergoing lung cancer screening/diagnosis.

Associated Steps:

1. Import libraries (see above)
2. Read in dataset (see above)
3. Create a new column for Severity without healthy patients
4. Split (RQ2 variables only) data into features and label 
5. Find optimal K to maximize Overall Accuracy
    a. Choose optimal K based on a local maximum of accuracy/local minimum of error to avoid overfitting/underfitting
6. Apply KNN to create a classification report and confusion matrix

"""

In [None]:
#STEP 1+2 are completed above


#STEP 3: Convert Severity to not include Severity = 3
# Rows with Severity 3 are dropped; every remaining row has Severity 0, 1 or 2
# (assuming those are the only other codes in the data).
canceronly = df.loc[df['Severity'].ne(3)]


#STEP 4: Split RQ2 data into features and label
y = canceronly.loc[:, 'Severity']
X = canceronly.loc[:, [
    'Alcohol Usage', 'Genetic Risk', 'Air Pollution',
    'Lung Disease', 'Obesity', 'Smoking', 'Passive Smoker',
    'Chest Pain', 'Coughing of Blood',
]]


#STEP 5: Find optimal K from the test-set error rate

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data (parameters learned on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Upper bound for K derived from the RQ2 data itself. The original reused
# `max_k` from the RQ1 cell, which was sized from the full dataset (including
# the Severity == 3 rows that RQ2 excludes) and only existed if that cell had
# already been run.
max_k_rq2 = int(np.sqrt(canceronly.shape[0]))
k_values_rq2 = range(1, max_k_rq2)

error_rate = []  # test-set error rate (1 - accuracy) for each K
for i in k_values_rq2:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    error_rate.append(1 - accuracy_score(y_test, y_pred))

# Plotting the error rate vs k graph
plt.figure(figsize=(12,6))
plt.plot(k_values_rq2, error_rate, marker="o",
         markerfacecolor="green",
         linestyle="dashed", color="red")
plt.title("Error rate vs k value", fontsize=20)
plt.xlabel("k values", fontsize=20)
plt.ylabel("error rate", fontsize=20)
plt.xticks(k_values_rq2)
plt.show()

# Apply KNN, the Optimal K was 20 based on the plot
best_k = 20
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Predict using the test data
y_pred = knn.predict(X_test)

# Train accuracy
y_train_pred = knn.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")

# Test accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Create Confusion Matrix
# Sorted class labels: `.unique()` returns labels in order of first appearance,
# which would make the heatmap's axis order arbitrary and possibly different
# from the matrix printed above.
classes = np.sort(canceronly['Severity'].unique())

# Plotting the confusion matrix
plt.figure(figsize=(7,5))
sns.heatmap(confusion_matrix(y_test, y_pred, labels=classes),
            cmap='Purples', annot=True, fmt='g', xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
"""
Clustering Analysis

We did this analysis to investigate how the data distributes itself when no labels are provided, and to apply the code we learned in class!

"""

In [None]:
# Required libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# k mode
!pip install kmodes
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
%matplotlib inline

# Boolean label: True for any cancer severity, False for Severity == 3.
# NOTE: this adds a column to the shared `df` in place, so cells that are
# re-run afterwards will see the extra column.
df['Cancer Status'] = df['Severity'] != 3

features = ['Age', 'Alcohol Usage', 'Obesity', 'Smoking', 'Chest Pain', 'Coughing of Blood']
# Convert the selected features from DataFrame to a NumPy array for easier processing with sklearn
data = df[features].to_numpy()

# k-modes with two clusters.
# random_state is fixed because init="random" otherwise yields different
# clusters (and different cluster numbering) on every run, which makes the
# proportions reported below non-reproducible.
kmode = KModes(n_clusters=2, init="random", verbose=1, random_state=42)
clusters = kmode.fit_predict(data)

def _cluster_share(cluster_label, cancer_status):
    """Fraction of all patients with `cancer_status` that fell into `cluster_label`.

    Reads the notebook-level `df['Cancer Status']` column and the `clusters`
    label array; the denominator is the number of patients with that status.
    """
    has_status = df['Cancer Status'] == cancer_status
    in_cluster = clusters == cluster_label
    return int((has_status & in_cluster).sum()) / int(has_status.sum())


# "cluster 1" in the printed text is KModes label 0, "cluster 2" is label 1.
p1 = _cluster_share(0, True)    # cancer patients in cluster 1
p2 = _cluster_share(0, False)   # healthy patients in cluster 1
p3 = _cluster_share(1, True)    # cancer patients in cluster 2
p4 = _cluster_share(1, False)   # healthy patients in cluster 2

print(
    f"{p1*100}% cancer patients and {p2*100}% healthy patients are in cluster 1. "
    f"{p3*100}% cancer patients and {p4*100}% healthy patients are in cluster 2 ."
)

In [None]:
# Build a dataset with a comparable number of healthy and cancer patients:
# all healthy rows are kept and the cancer rows are down-sampled.
# NOTE(review): 0.465 is presumably the healthy/cancer row ratio - confirm.
df_healthy = df[df['Cancer Status'] == False]
df_cancer = df[df['Cancer Status'] == True].sample(frac=0.465, random_state=42)
balanced_df = pd.concat([df_cancer, df_healthy])
     

In [None]:
# Same clustering as before, but on the balanced dataset and without 'Age'.
features = ['Alcohol Usage', 'Obesity', 'Smoking', 'Chest Pain', 'Coughing of Blood']

balanced_data = balanced_df[features].to_numpy()

# random_state is fixed because init="random" otherwise yields different
# clusters (and different cluster numbering) on every run.
kmode = KModes(n_clusters=2, init="random", verbose=1, random_state=42)
cluster = kmode.fit_predict(balanced_data)

# Rows assigned to each cluster ("cluster 1" = label 0, "cluster 2" = label 1).
cluster_1 = balanced_df[cluster == 0]
cluster_2 = balanced_df[cluster == 1]

# Denominators: total cancer / healthy patients in the balanced dataset.
n_cancer = (balanced_df['Cancer Status'] == True).sum()
n_healthy = (balanced_df['Cancer Status'] == False).sum()

# Share of each patient group that landed in each cluster.
p1 = (cluster_1['Cancer Status'] == True).sum() / n_cancer
p2 = (cluster_1['Cancer Status'] == False).sum() / n_healthy
p3 = (cluster_2['Cancer Status'] == True).sum() / n_cancer
p4 = (cluster_2['Cancer Status'] == False).sum() / n_healthy

print(f"{p1*100}% cancer patients and {p2*100}% healthy patients are in cluster 1. \
{p3*100}% cancer patients and {p4*100}% healthy patients are in cluster 2 .")
     

In [14]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis on whatever train/test split is currently in
# the kernel (X_train, X_test, y_train, y_test are not defined in this cell).
# LinearDiscriminantAnalysis() is a drop-in alternative for comparison.
clf = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate the model: confusion matrix first, then the per-class report
for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)

[[55  0  0]
 [ 0 61  2]
 [ 0  0 82]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       1.00      0.97      0.98        63
           2       0.98      1.00      0.99        82

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



In [None]:
"""

Appendix:

RQ1:
Variables Included: Easily obtainable data, mostly distinct datapoints
Alcohol Usage
Obesity
Smoking
Genetic Risk
Already flags patient for being at risk, however patients won’t be tested simply because they have family history

Variables Excluded: Irrelevance, redundancy, difficult to obtain
Gender: Not relevant
Air Pollution: Not typically known by patients
Age: Performed poorly in the initial correlation heatmap
Lung Disease: Already flags patients for being at risk and testing
Passive Smoker: Not always known or recorded
Chest Pain: Already flags patient for being at risk, can also be attributed to many other things, might not be disclosed, etc.
Coughing of Blood: Already flags patients for testing

RQ2:
Variables Included: High Correlation in heatmap and distinctness of data
Exclusion: Irrelevance and poor correlation
