## BE
- [Kaggle](https://www.kaggle.com/datasets/walterconway/covid-flu-cold-symptoms/data)

1. Data Preprocessing

In [None]:
# Import all the necessary libraries for data cleaning and machine learning algorithms (decision tree)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import joblib as joblib

Get rid of unnecessary rows and columns

In [None]:
df = pd.read_csv('large_data.csv')
# Normalize the header before the first column lookup, so that a name padded
# with whitespace in the CSV (e.g. ' TYPE') does not raise a KeyError below.
df.columns = df.columns.str.strip()
# Number of non-null TYPE entries before any filtering
print(df['TYPE'].count())
# Keep only the rows whose TYPE is FLU; rows of every other type are dropped
df = df[df['TYPE'] == 'FLU']
# Renumber the remaining rows from 0 after the filter
df = df.reset_index(drop=True)
df.head()

Trim whitespace from the column names, if there is any

In [None]:
# Remove leading and trailing whitespace from every column name
stripped_names = df.columns.str.strip()
df.columns = stripped_names
# Summary of the frame (columns, non-null counts, dtypes) after the rename
df.info()

checking missing values

In [None]:
# Number of missing entries per column (isnull is the pandas alias of isna)
df.isnull().sum()

counting & cleaning feature values

In [None]:
# Column labels as they are before anything is dropped below
cols = df.columns

# Show the value distribution of every column
for col in cols:
    print(df[col].value_counts())
    print("")

# Drop constant columns, i.e. columns holding one single value on every row
# (for this data set: the columns that are only zeros).
constant_cols = []
for col in cols:
    counts = df[col].value_counts()
    # value_counts() is sorted by frequency, so position 0 holds the count of
    # the most frequent value. It must be read with .iloc: a plain [0] is a
    # label lookup on integer-valued columns (the count of the value 0, and a
    # KeyError when the column contains no 0 at all).
    if not counts.empty and counts.iloc[0] == len(df):
        constant_cols.append(col)
df.drop(columns=constant_cols, inplace=True)
df.info()

duplicates

In [None]:
# Report how many rows are exact duplicates of an earlier row. The count is
# printed explicitly because a notebook only renders the last expression of a
# cell, so a bare expression in the middle of the cell is silently discarded.
print(f"Duplicated rows: {df.duplicated().sum()}")
df.info()

# Write every column header to a text file, one name per line
with open('column_headers.txt', 'w') as f:
    f.writelines(f"{col}\n" for col in df.columns)

create new column to fill in severity data

In [None]:
def classify_symptoms(row) -> int:
    """Return 1 when either breathing symptom is flagged in the row, else 0.

    `row` is looked up by column name; the 'SHORTNESS_OF_BREATH' entry is
    compared against 1 first, and 'DIFFICULTY_BREATHING' is only consulted
    when the first one is not 1.
    """
    breathing_columns = ('SHORTNESS_OF_BREATH', 'DIFFICULTY_BREATHING')
    return 1 if any(row[name] == 1 for name in breathing_columns) else 0

# Compute the label row by row (axis=1 hands each row to classify_symptoms)
# and store the result in a new 'SEVERITY' column of df
severity_labels = df.apply(classify_symptoms, axis=1)
df['SEVERITY'] = severity_labels
df.info()


In [None]:
# Class balance of the SEVERITY label. It is printed explicitly because only
# the last expression of a cell is rendered by the notebook.
print(df['SEVERITY'].value_counts())
df.head()

save dataframe to csv

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file
df.to_csv(path_or_buf='cleaned_data.csv', index=False)

correlation

In [None]:
df.info()
# Pairwise correlation restricted to the numeric columns, so a text column
# such as TYPE is ignored instead of being passed to the computation.
df_corr = df.corr(numeric_only=True)
# Kept as the last expression so the notebook actually renders the matrix
df_corr

Splitting and training the data (Decision Tree)

In [None]:
# Every column except the label is used as a feature
symptoms = [i for i in df.columns if i != 'SEVERITY']
severity = ['SEVERITY']
# NOTE(review): SEVERITY is computed directly from SHORTNESS_OF_BREATH and
# DIFFICULTY_BREATHING, and both columns are still part of `symptoms`, so a
# classifier can reproduce the labelling rule exactly. The scores below
# probably reflect that leakage rather than predictive skill - confirm
# whether these two columns should be excluded from the features.
# Split the data into training and testing sets
# 20% of the data will be used for testing (test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(df[symptoms], df[severity], test_size=0.2, random_state=42)

# Create a Decision Tree Classifier object, seeded like the split so that the
# fitted tree (and the model file saved from it) is reproducible across runs
dt = DecisionTreeClassifier(random_state=42)

# Train the model on the training set
dt.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = dt.predict(X_test)

# Shared helper used to evaluate the predictions of each model
def evaluate_model(y_true, y_pred):
    """Plot the confusion matrix of the predictions and print the classification report.

    y_true holds the reference labels and y_pred the predicted labels; both
    are forwarded unchanged to the scikit-learn metric functions.
    """
    # Confusion matrix rendered as a plot
    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.show()
    # Text report produced by classification_report
    print(classification_report(y_true, y_pred))
# Evaluate the decision tree predictions against the held-out labels
evaluate_model(y_test, y_pred)

# Persist the fitted decision tree so later cells can reload it from disk
joblib.dump(value=dt, filename='decision_tree_model.pkl')

Training a Random Forest classifier on the data

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split; the label frame is flattened to
# a 1-D array before it is handed to fit(), which returns the fitted estimator
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train.values.ravel())

# Predict the test split and evaluate it with the shared helper
y_pred = rf.predict(X_test)
evaluate_model(y_test, y_pred)

# Mean accuracy on the test split
rf_accuracy = rf.score(X_test, y_test)
print(f"Accuracy: {rf_accuracy}")

Training a Support Vector Machine (SVM) classifier on the data

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier on the training split; the label frame is
# flattened to a 1-D array before it is handed to fit(), which returns the
# fitted estimator
svm = SVC(random_state=42).fit(X_train, y_train.values.ravel())

# Predict the test split and evaluate it with the shared helper
y_pred = svm.predict(X_test)
evaluate_model(y_test, y_pred)

# Mean accuracy on the test split
svm_accuracy = svm.score(X_test, y_test)
print(f"Accuracy: {svm_accuracy}")

In [None]:
from sklearn import metrics

# One hand-written sample: one value per feature column, in the order of `symptoms`
sym = [1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]
my_input = pd.DataFrame(data=[sym], columns=symptoms)
print(my_input)

# Reload the decision tree that was saved to disk earlier in the notebook
model = joblib.load('decision_tree_model.pkl')

# Predict the label of the single sample and show it
y_pred = model.predict(my_input)
print("Predicted label:", y_pred[0])

Cross-fold validation

In [None]:
from sklearn.model_selection import cross_val_score

# Number of cross-validation folds
cv_amt = 10
# Perform cv_amt-fold (10-fold) cross-validation for each of the three models
# on the full data set. The label is flattened to a 1-D array once and passed
# in the same form to all three estimators.
cv_target = df[severity].values.ravel()
dt_scores = cross_val_score(dt, df[symptoms], cv_target, cv=cv_amt, n_jobs=-1)
rf_scores = cross_val_score(rf, df[symptoms], cv_target, cv=cv_amt, n_jobs=-1)
svm_scores = cross_val_score(svm, df[symptoms], cv_target, cv=cv_amt, n_jobs=-1)

# Print the average score and standard deviation per model, best score first
scores = pd.DataFrame({
    'ML Techniques': ['Decision Tree', 'Random Forest', 'SVM'],
    'Score': [dt_scores.mean(), rf_scores.mean(), svm_scores.mean()],
    'Standard Deviation': [dt_scores.std(), rf_scores.std(), svm_scores.std()],
})
print(scores.sort_values(by='Score', ascending=False))

Export the decision tree to an image

In [None]:
from io import StringIO

import pydotplus
from sklearn.tree import export_graphviz

# Reload the decision tree that was saved to disk earlier in the notebook
dt = joblib.load('decision_tree_model.pkl')

# Render the tree as DOT source into an in-memory text buffer
dot_data = StringIO()
graphviz_options = dict(
    feature_names=symptoms,
    class_names=['0', '1'],
    special_characters=True,
    rounded=True,
    proportion=False,
    filled=True,
)
export_graphviz(dt, out_file=dot_data, **graphviz_options)

# Convert the DOT source to a graph and write it out as a PNG image
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision_tree.png')

# Class balance of the label, shown next to the exported tree
print(df['SEVERITY'].value_counts())