# Importing libraries and modules (Data Collection)


In [None]:
"""
Importing the necessary modules to begin creating our model.

1. Numpy (for array conversion)
2. Pandas (for reading the csv and other operations)
3. Matplotlib (for graph representation)
4. Seaborn (for graph representation)
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
"""
We then load the dataset into a variable called "data_frame" so it can be used in later operations.
"""
# Load the raw dataset from 'Dataset_7.csv' (relative path, resolved against the working directory).
data_frame = pd.read_csv('Dataset_7.csv')
# Bare expression: the notebook renders it as the cell's output.
data_frame

# Data Preprocessing


In [2]:
"""
The code cell here takes care of handling missing/empty values

1. The number of missing/empty values is first obtained.
2. An if statement checks whether there are any missing values.
  - If there are no missing values, then we print a message to the console saying there is no missing data.
  - If there are missing values, then we drop the rows containing them from the dataset in place.
"""

# 1. Handling missing data in the dataset.

# Count the empty/missing cells per column, then total them across columns.
missing_per_column = data_frame.isnull().sum()
missing_values = missing_per_column.sum()

if missing_values != 0:
    # At least one cell is missing: remove every row that contains a gap.
    data_frame.dropna(inplace=True)
else:
    print("There is no missing data in the dataset.")

NameError: name 'data_frame' is not defined

In [None]:
"""
The module "re" is then imported for the use of regular expressions

Function: clean_symptoms
Argument: symptoms
Return Value: A cleaned list of symptoms, with unwanted symbols and trailing numeric suffixes removed.

This is applied to each row's list of symptoms in the next cell.
"""

import re

# Function to clean symptom names dynamically (regular expressions)
def clean_symptoms(symptoms):
    """Return a cleaned copy of each symptom name in *symptoms*.

    For every name, surrounding whitespace is trimmed, every character that
    is neither a word character nor whitespace is removed, and finally any
    run of digits at the very end of the name is dropped.

    Args:
        symptoms: Iterable of symptom name strings.

    Returns:
        A list of cleaned names, in the same order as the input.
    """
    cleaned = []
    for raw_name in symptoms:
        # Symbols go first so that a suffix such as ".1" becomes a bare "1"
        # that the trailing-digit pattern can then strip.
        without_symbols = re.sub(r'[^\w\s]', '', raw_name.strip())
        cleaned.append(re.sub(r'\d+$', '', without_symbols))
    return cleaned

In [None]:
""" 
The multi label binarizer module is then imported in order to incorporate one-hot encoding 
on multiple labels

These labels will be the unique symptoms listed in the data entries, where they will be converted to
features. This means that each symptom will be its own feature, with a binary value indicating whether it is present or not.
"""

from sklearn.preprocessing import MultiLabelBinarizer  # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

"""
# 2. Applying MultiLabelBinarizer (version of multi-labeled one-hot encoding)
   - Used for encoding multi-labeled categorical data (e.g., multiple symptoms).
"""
# Initialize MultiLabelBinarizer for multi-labeled one-hot encoding
mlb = MultiLabelBinarizer()

# Split each comma-separated 'Symptom' string into a list of names, then pass
# every list through clean_symptoms (strips symbols and trailing digits).
query_symptom_column = data_frame['Symptom'].str.split(',').apply(clean_symptoms)

# Fit on the cleaned lists and build the binary indicator matrix
# (one row per record, one column per unique symptom).
query_encoded = mlb.fit_transform(query_symptom_column)

# Column names for the indicator matrix, in the order the binarizer uses
encoded_columns = mlb.classes_

# Wrap the matrix in a DataFrame that carries data_frame's own index.
# pd.concat(axis=1) aligns rows by index label, and data_frame's index has
# gaps once dropna() has removed rows; a fresh 0..n-1 index here would pair
# symptoms with the wrong records and introduce NaN rows.
query_encoded_df = pd.DataFrame(
    query_encoded,
    columns=encoded_columns,
    index=data_frame.index,
)

# Replace the original 'Symptom' column with the one-hot encoded features
data_frame = pd.concat([data_frame.drop(columns=['Symptom']), query_encoded_df], axis=1)

In [None]:
"""
At this point, the layout of the dataset consists of the label followed by the features.

The code below reverses this, where the label will be last in the data_frame, while the features will be displayed first

On top of this, the column "Unnamed: 0" is dropped, as it adds no significant value and just contains row indexing, which the
DataFrame index already provides.
"""

# Ensuring the 'Disease' column is the last column (as it is the label)
columns = [col for col in data_frame.columns if col != 'Disease'] + ['Disease']

# Reorder, then drop the leftover index column in one non-mutating chain.
# errors='ignore' keeps the cell re-runnable: on a second run (or if the CSV
# carried no such column) 'Unnamed: 0' is already gone and must not raise.
data_frame = data_frame[columns].drop(columns=['Unnamed: 0'], errors='ignore')
data_frame

In [None]:
# Converting the data frame into a numpy array.
# NOTE(review): numpy_array is not referenced by any later cell visible here — confirm it is still needed.

numpy_array = data_frame.to_numpy()
# Bare expression: the notebook renders the array as the cell's output.
numpy_array

In [5]:
""" 
The train_test_split module is then imported in to split the dataset into training and testing sets.

This is a 70/30 split, with the following parameters set:
    - stratify: Keeps the class proportions the same in the training and testing sets.
    - random_state: Fixes the random seed so that the split is reproducible.

"""

# Splitting the dataset into training set and testing set (70/30 split)
from sklearn.model_selection import train_test_split  # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# Separate the label column from the feature columns.
y = data_frame['Disease']  # Label
X = data_frame.drop(columns=['Disease'])  # Features

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.3, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Per-class row counts, used to check that stratification preserved the
# class proportions across the two sets.
train_distribution, test_distribution = y_train.value_counts(), y_test.value_counts()

print("Training Set Distribution:\n", train_distribution)
print("\nTesting Set Distribution:\n", test_distribution)

ModuleNotFoundError: No module named 'sklearn'

# Model Selection

In [None]:
# Importing metrics for prediction results
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
# References:
# - Classification Report: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
# - Confusion Matrix: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# - Accuracy Score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
# - F1 Score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

## First Model (Decision Trees)

In [7]:
"""
DecisionTreeClassifier is imported to use as the first machine learning model.
Metric functions from the sklearn.metrics package are imported to evaluate the model.
"""

from sklearn.tree import DecisionTreeClassifier  # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
# Reference for metrics: https://scikit-learn.org/stable/modules/model_evaluation.html

# Initialising the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)  # Default hyperparameters, fixed seed

# Training the model
dt_model.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric below
train_predictions = dt_model.predict(X_train)
dt_predictions = dt_model.predict(X_test)
test_predictions = dt_predictions  # Alias kept for code that still uses this name

# Evaluate on training data
train_accuracy = accuracy_score(y_train, train_predictions)
print("Training Accuracy:", train_accuracy)

# Evaluate on testing data
test_accuracy = accuracy_score(y_test, dt_predictions)
print("Testing Accuracy:", test_accuracy)

# Compare results
# Overfitting check: A large difference between training and testing accuracy indicates overfitting.
if train_accuracy - test_accuracy > 0.1:  # Example threshold
    print("Potentially Overfitted")
else:
    print("Model appears to generalize well.")

# Evaluating the model
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, dt_predictions))  # Scikit-learn documentation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, dt_predictions))  # Scikit-learn documentation

# Macro F1 must come from the true test labels and this model's predictions;
# scoring fixed placeholder label lists would report the same number for any model.
f1_macro = f1_score(y_test, dt_predictions, average='macro')  # Scikit-learn documentation
print(f"F1-Score (Macro): {f1_macro:.4f}")

ModuleNotFoundError: No module named 'sklearn'

# Second Model (Support Vector Machines)

In [None]:
"""
SVC (Support Vector Classifier) is imported to use as the second machine learning model.
Metric functions from the sklearn.metrics package are imported to evaluate the model.
"""

from sklearn.svm import SVC  # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
# Reference for metrics: https://scikit-learn.org/stable/modules/model_evaluation.html

# Initializing the Support Vector Machine Classifier
# RBF kernel with C=1.0 and gamma='scale'. Per the scikit-learn docs,
# random_state only affects probability estimates, which are not enabled here.
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Training the model
svm_model.fit(X_train, y_train)

# Making predictions on the test set
svm_predictions = svm_model.predict(X_test)

# Evaluating on training data
svm_train_predictions = svm_model.predict(X_train)
svm_train_accuracy = accuracy_score(y_train, svm_train_predictions)
print("Training Accuracy:", svm_train_accuracy)

# Evaluating on testing data
svm_test_accuracy = accuracy_score(y_test, svm_predictions)
print("Testing Accuracy:", svm_test_accuracy)

# Checking for potential overfitting
# Overfitting check: A large difference between training and testing accuracy indicates overfitting.
if svm_train_accuracy - svm_test_accuracy > 0.1:
    print("Potentially Overfitted")
else:
    print("Model appears to generalize well.")

# Evaluate overall model performance
print("\nAccuracy:", svm_test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, svm_predictions))  # Scikit-learn documentation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, svm_predictions))  # Scikit-learn documentation

# Macro F1 must come from the true test labels and this model's predictions;
# scoring fixed placeholder label lists would report the same number for any model.
f1_macro = f1_score(y_test, svm_predictions, average='macro')  # Scikit-learn documentation
print(f"F1-Score (Macro): {f1_macro:.4f}")

# Third Model (K-Nearest Neighbours)

In [None]:
"""
KNeighborsClassifier is imported to use as the third machine learning model, with the number of neighbours set to 5.
Metric functions from the sklearn.metrics package are imported to evaluate the model.
"""

from sklearn.neighbors import KNeighborsClassifier  # Reference: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
# Reference for metrics: https://scikit-learn.org/stable/modules/model_evaluation.html

# Initializing the K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # Uses the default distance metric (Minkowski)

# Training the model
knn_model.fit(X_train, y_train)

# Making predictions on the test set
knn_predictions = knn_model.predict(X_test)

# Evaluating on training data
knn_train_predictions = knn_model.predict(X_train)
knn_train_accuracy = accuracy_score(y_train, knn_train_predictions)
print("Training Accuracy:", knn_train_accuracy)

# Evaluating on testing data
knn_test_accuracy = accuracy_score(y_test, knn_predictions)
print("Testing Accuracy:", knn_test_accuracy)

# Checking for potential overfitting
# Overfitting check: A large difference between training and testing accuracy indicates overfitting.
if knn_train_accuracy - knn_test_accuracy > 0.1:
    print("Potentially Overfitted")
else:
    print("Model appears to generalize well.")

# Evaluate overall model performance
print("\nAccuracy:", knn_test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, knn_predictions))  # Scikit-learn documentation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, knn_predictions))  # Scikit-learn documentation

# Macro F1 must come from the true test labels and this model's predictions;
# scoring fixed placeholder label lists would report the same number for any model.
f1_macro = f1_score(y_test, knn_predictions, average='macro')  # Scikit-learn documentation
print(f"F1-Score (Macro): {f1_macro:.4f}")