## BE
- [Kaggle](https://www.kaggle.com/datasets/walterconway/covid-flu-cold-symptoms/data)

1. Data Preprocessing

In [None]:
# Import all the necessary libraries for data cleaning and machine learning algorithms (decision tree)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import joblib as joblib

In [None]:
# Load the symptoms dataset (alternative input kept for reference below).
# df = pd.read_csv('large_flu_data.csv')
df = pd.read_csv('large_data.csv')

# Keep only the rows whose TYPE is FLU.
# An explicit copy is taken so that the later in-place edits of `df` (dropping
# columns, adding SEVERITY) act on an independent frame rather than on a slice
# of the raw data, which would raise SettingWithCopyWarning.
df = df[df['TYPE'] == 'FLU'].copy()
df.info()

Check for missing values in each column

In [None]:
# Number of missing (NaN) entries in each column
missing_per_column = df.isna().sum()
missing_per_column

Count the values of each feature and drop columns that hold a single constant value

In [None]:
cols = df.columns

# Show the value distribution of every column
for col in cols:
	print(df[col].value_counts())
	print("")

# Drop constant columns (the same value in every row, e.g. only zeros): they
# carry no information for the classifier.
# nunique() is used rather than value_counts()[0] because [0] on an
# integer-indexed Series is a *label* lookup (the count of the value 0), which
# raises KeyError for a column that contains no zeros at all.
constant_cols = [col for col in cols if df[col].nunique(dropna=False) <= 1]
df = df.drop(columns=constant_cols)
df.info()

Check for duplicate rows and save the remaining column names

In [None]:
# Report duplicate rows. The results are printed explicitly because a bare
# expression is only rendered when it is the last statement of a cell.
duplicate_mask = df.duplicated()
print("Duplicate rows:", duplicate_mask.sum())
print(duplicate_mask.value_counts())
df.info()

# Write all the column headers to a txt file, one header per line
with open('column_headers.txt', 'w') as f:
	f.writelines(f"{col}\n" for col in df.columns)

Create a new `SEVERITY` column derived from the breathing-related symptoms

In [None]:
def classify_symptoms(row) -> int:
    """Return the severity label for a single record.

    The label is 1 when the record's 'SHORTNESS_OF_BREATH' or
    'DIFFICULTY_BREATHING' field equals 1, and 0 otherwise.
    """
    has_breathing_issue = row['SHORTNESS_OF_BREATH'] == 1 or row['DIFFICULTY_BREATHING'] == 1
    return 1 if has_breathing_issue else 0

# Label every record by running classify_symptoms over it row by row, then
# store the labels in a new 'SEVERITY' column of 'df'
severity_labels = df.apply(classify_symptoms, axis='columns')
df['SEVERITY'] = severity_labels
df.info()


In [None]:
# Column summary first; the class balance goes last so the notebook renders it
# (only the final expression of a cell is displayed).
df.info()
df['SEVERITY'].value_counts()

Save the cleaned dataframe to a CSV file

In [None]:
# Persist the cleaned dataframe as CSV, without writing the index
df.to_csv(path_or_buf='cleaned_data.csv', index=False)

Correlation between the features

In [None]:
# Correlate numeric/boolean columns only, so that a leftover text column such
# as TYPE is ignored instead of making corr() fail on non-numeric data.
# The matrix is the last expression of the cell so that the notebook renders it.
df.info()
df_corr = df.select_dtypes(include=['number', 'bool']).corr()
df_corr

Splitting and training the data

In [None]:
# Features are every column except the label; the label is kept as a one-column frame
symptoms = [i for i in df.columns if i != 'SEVERITY']
severity = ['SEVERITY']
# NOTE(review): SEVERITY is computed from SHORTNESS_OF_BREATH / DIFFICULTY_BREATHING.
# If those columns are still among `symptoms`, the tree can reproduce the labelling
# rule directly and near-perfect scores are expected — confirm this is intended.

# Split the data into training and testing sets
# 20% of the data will be used for testing
X_train, X_test, y_train, y_test = train_test_split(df[symptoms], df[severity], test_size=0.2, random_state=42)

# Create a Decision Tree Classifier object.
# The seed makes tie-breaking between equally good splits reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)

# Train the model on the training set
clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Helper that reports how well a set of predictions matches the true labels
def evaluate_model(y_true, y_pred):
	"""Plot the confusion matrix and print the classification report.

	y_true -- ground-truth labels
	y_pred -- labels predicted by the model
	"""
	# Confusion matrix, rendered as a plot
	cm = confusion_matrix(y_true, y_pred)
	ConfusionMatrixDisplay(confusion_matrix=cm).plot()
	plt.show()
	# Text summary from classification_report
	print(classification_report(y_true, y_pred))
# Score the test-set predictions, then persist the fitted tree to disk
evaluate_model(y_true=y_test, y_pred=y_pred)

joblib.dump(value=clf, filename='decision_tree_model.pkl')

In [None]:
# Disabled example: score one hand-written symptom vector with the saved model.
# # Prepare your input data
# my_input = pd.DataFrame([[1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]], columns=symptoms)

# # Load the trained model
# model = joblib.load('decision_tree_model.pkl')

# # Make predictions on the input data
# y_pred = model.predict(my_input)

from sklearn import metrics

# Show the first prediction, then the accuracy of y_pred against y_test
first_prediction = y_pred[0]
print("Predicted label:", first_prediction)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", test_accuracy)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier over the whole dataframe
scores = cross_val_score(clf, df[symptoms], df[severity], cv=10)

# Report the mean score together with a two-standard-deviation spread
mean_score = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% as the test set, then 25% of the remainder as the validation set
# (0.25 of the remaining 80% is 20% of the data: a 60/20/20 split)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    df[symptoms], df[severity], test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

# Refit the classifier on the training portion only
clf.fit(X_train, y_train)

# Score the model on the validation portion and on the test portion
val_score = clf.score(X_val, y_val)
test_score = clf.score(X_test, y_test)

# Report both scores
print("Validation score: %0.2f" % val_score)
print("Test score: %0.2f" % test_score)

In [None]:
from sklearn.tree import export_graphviz
import pydotplus
from io import StringIO

# Reload the tree from 'decision_tree_model.pkl'
clf = joblib.load('decision_tree_model.pkl')

# Render the tree as DOT text into an in-memory buffer
dot_data = StringIO()
export_options = dict(
    feature_names=symptoms,
    class_names=['0','1'],
    filled=True,
    rounded=True,
    special_characters=True,
    proportion=False,
)
export_graphviz(clf, out_file=dot_data, **export_options)

# Build a graph from the DOT text and write it out as a png file
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision_tree.png')
df.head()
