# Data Cleaning and Prep

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
url = 'https://raw.githubusercontent.com/alexdlilly/DS6050_Project/main/predicted%20data.csv'
data = pd.read_csv(url)

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,index,filename,covid,score,uuid,datetime,cough_detected,SNR,latitude,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,0,0,001d8e33-a4af-4edb-98ba-b03f891d9a6c,1,0.986853,001d8e33-a4af-4edb-98ba-b03f891d9a6c,2020-05-13T01:27:42.552773+00:00,0.0306,12.71348,13.8,...,,,,,,,,,,
1,1,1,00357712-dd5a-4c0a-90a4-39f1f4b9d5fd,1,0.005177,00357712-dd5a-4c0a-90a4-39f1f4b9d5fd,2020-04-10T12:48:17.554497+00:00,0.0576,0.0,46.8,...,,,,,,,,,,
2,2,2,00e0a33c-6561-406e-b543-4c9f07b860f7,1,-0.124921,00e0a33c-6561-406e-b543-4c9f07b860f7,2020-04-15T07:01:51.945775+00:00,0.2798,12.695962,,...,,,,,,,,,,
3,3,3,01424527-9c3b-4b6e-96f1-9eea3150819b,1,-0.02227,01424527-9c3b-4b6e-96f1-9eea3150819b,2020-10-18T15:51:28.858079+00:00,0.0,16.047069,48.9,...,no_cough,,False,False,False,False,False,False,,
4,4,4,015576c8-2b03-4dcb-b251-78a984fe86fe,1,-0.116273,015576c8-2b03-4dcb-b251-78a984fe86fe,2020-06-22T00:21:42.091345+00:00,1.0,12.265701,,...,good,unknown,False,False,False,False,False,True,obstructive_disease,mild


In [4]:
data.columns

Index(['Unnamed: 0', 'index', 'filename', 'covid', 'score', 'uuid', 'datetime',
       'cough_detected', 'SNR', 'latitude', 'longitude', 'age', 'gender',
       'respiratory_condition', 'fever_muscle_pain', 'status', 'quality_1',
       'cough_type_1', 'dyspnea_1', 'wheezing_1', 'stridor_1', 'choking_1',
       'congestion_1', 'nothing_1', 'diagnosis_1', 'severity_1', 'quality_2',
       'cough_type_2', 'dyspnea_2', 'wheezing_2', 'stridor_2', 'choking_2',
       'congestion_2', 'nothing_2', 'diagnosis_2', 'severity_2', 'quality_3',
       'cough_type_3', 'dyspnea_3', 'wheezing_3', 'stridor_3', 'choking_3',
       'congestion_3', 'nothing_3', 'diagnosis_3', 'severity_3', 'quality_4',
       'cough_type_4', 'dyspnea_4', 'wheezing_4', 'stridor_4', 'choking_4',
       'congestion_4', 'nothing_4', 'diagnosis_4', 'severity_4'],
      dtype='object')

In [5]:
cleaned = data.drop(['Unnamed: 0', 'index', 'filename', 'covid', 'uuid', 'datetime'], axis = 1)
## drop the columns that will not be used in prediction
## dropping covid column -- keeping status column and score column

In [6]:
cleaned.head()

Unnamed: 0,score,cough_detected,SNR,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,status,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,0.986853,0.0306,12.71348,13.8,-89.6,,female,False,True,COVID-19,...,,,,,,,,,,
1,0.005177,0.0576,0.0,46.8,6.6,,male,True,False,COVID-19,...,,,,,,,,,,
2,-0.124921,0.2798,12.695962,,,,other,False,False,COVID-19,...,,,,,,,,,,
3,-0.02227,0.0,16.047069,48.9,2.5,39.0,female,False,False,COVID-19,...,no_cough,,False,False,False,False,False,False,,
4,-0.116273,1.0,12.265701,,,27.0,male,False,False,COVID-19,...,good,unknown,False,False,False,False,False,True,obstructive_disease,mild


In [7]:
cleaned.status.unique()

array(['COVID-19', 'healthy'], dtype=object)

In [8]:
cleaned = cleaned.fillna(method = 'bfill')

Remove nans with backfill

In [9]:
cleaned.head()

Unnamed: 0,score,cough_detected,SNR,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,status,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,0.986853,0.0306,12.71348,13.8,-89.6,39.0,female,False,True,COVID-19,...,no_cough,unknown,False,False,False,False,False,False,obstructive_disease,mild
1,0.005177,0.0576,0.0,46.8,6.6,39.0,male,True,False,COVID-19,...,no_cough,unknown,False,False,False,False,False,False,obstructive_disease,mild
2,-0.124921,0.2798,12.695962,48.9,2.5,39.0,other,False,False,COVID-19,...,no_cough,unknown,False,False,False,False,False,False,obstructive_disease,mild
3,-0.02227,0.0,16.047069,48.9,2.5,39.0,female,False,False,COVID-19,...,no_cough,unknown,False,False,False,False,False,False,obstructive_disease,mild
4,-0.116273,1.0,12.265701,41.6,2.0,27.0,male,False,False,COVID-19,...,good,unknown,False,False,False,False,False,True,obstructive_disease,mild


In [10]:
cleaneddummied = pd.get_dummies(cleaned)

In [11]:
cleaneddummied.columns

Index(['score', 'cough_detected', 'SNR', 'latitude', 'longitude', 'age',
       'respiratory_condition', 'fever_muscle_pain', 'gender_female',
       'gender_male',
       ...
       'nothing_4_True', 'diagnosis_4_COVID-19', 'diagnosis_4_healthy_cough',
       'diagnosis_4_lower_infection', 'diagnosis_4_obstructive_disease',
       'diagnosis_4_upper_infection', 'severity_4_mild',
       'severity_4_pseudocough', 'severity_4_severe', 'severity_4_unknown'],
      dtype='object', length=119)

In [12]:
cleaneddummied.head()

Unnamed: 0,score,cough_detected,SNR,latitude,longitude,age,respiratory_condition,fever_muscle_pain,gender_female,gender_male,...,nothing_4_True,diagnosis_4_COVID-19,diagnosis_4_healthy_cough,diagnosis_4_lower_infection,diagnosis_4_obstructive_disease,diagnosis_4_upper_infection,severity_4_mild,severity_4_pseudocough,severity_4_severe,severity_4_unknown
0,0.986853,0.0306,12.71348,13.8,-89.6,39.0,False,True,True,False,...,False,False,False,False,True,False,True,False,False,False
1,0.005177,0.0576,0.0,46.8,6.6,39.0,True,False,False,True,...,False,False,False,False,True,False,True,False,False,False
2,-0.124921,0.2798,12.695962,48.9,2.5,39.0,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
3,-0.02227,0.0,16.047069,48.9,2.5,39.0,False,False,True,False,...,False,False,False,False,True,False,True,False,False,False
4,-0.116273,1.0,12.265701,41.6,2.0,27.0,False,False,False,True,...,True,False,False,False,True,False,True,False,False,False


In [13]:
cleaneddummied.rename(columns = {'status_COVID-19':'statusCOVID19'}, inplace=True)

In [14]:
cleaneddummied.columns.tolist()

['score',
 'cough_detected',
 'SNR',
 'latitude',
 'longitude',
 'age',
 'respiratory_condition',
 'fever_muscle_pain',
 'gender_female',
 'gender_male',
 'gender_other',
 'statusCOVID19',
 'status_healthy',
 'quality_1_good',
 'quality_1_no_cough',
 'quality_1_ok',
 'quality_1_poor',
 'cough_type_1_dry',
 'cough_type_1_unknown',
 'cough_type_1_wet',
 'dyspnea_1_False',
 'dyspnea_1_True',
 'wheezing_1_False',
 'wheezing_1_True',
 'stridor_1_False',
 'choking_1_False',
 'congestion_1_False',
 'congestion_1_True',
 'nothing_1_False',
 'nothing_1_True',
 'diagnosis_1_COVID-19',
 'diagnosis_1_healthy_cough',
 'diagnosis_1_lower_infection',
 'diagnosis_1_obstructive_disease',
 'diagnosis_1_upper_infection',
 'severity_1_mild',
 'severity_1_pseudocough',
 'severity_1_severe',
 'severity_1_unknown',
 'quality_2_good',
 'quality_2_no_cough',
 'quality_2_ok',
 'quality_2_poor',
 'cough_type_2_dry',
 'cough_type_2_unknown',
 'cough_type_2_wet',
 'dyspnea_2_False',
 'dyspnea_2_True',
 'wheezing_2

## What is the AUC if you use only the `score` data from the spectrograms to predict COVID status? 

In [24]:
score = roc_auc_score(cleaneddummied.statusCOVID19, cleaneddummied.score)
print(f"ROC AUC: {score:.4f}")

ROC AUC: 0.6886


## Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
#cleaneddummied = cleaneddummied[['score','SNR','age','respiratory_condition','fever_muscle_pain','gender_female','gender_male','gender_other','statusCOVID19','status_healthy']]

In [26]:
y = cleaneddummied.statusCOVID19

In [27]:
x = cleaneddummied.drop(['statusCOVID19', 'status_healthy'], axis=1)

In [28]:
np.all(np.isfinite(x))

False

In [29]:
np.isinf(x).values.sum()

5

In [30]:
x.columns.to_series()[np.isinf(x).any()]

SNR    SNR
dtype: object

Set all infinity values to the max real value

In [31]:
x.SNR.quantile(q=.999)

83.53848954151096

In [32]:
x.SNR[5447] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.SNR[5447] = 100


In [33]:
x.SNR[6252] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.SNR[6252] = 100


In [34]:
x.SNR[7972] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.SNR[7972] = 100


In [35]:
x.SNR[9124] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.SNR[9124] = 100


In [36]:
x.SNR[11716] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.SNR[11716] = 100


In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=777
)

In [38]:
model = LogisticRegression(solver='liblinear', random_state=777)

In [39]:
model.fit(X_train,y_train)

In [40]:
model.score(X_test,y_test)

0.9267485822306238

In [41]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")



In [42]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.9267485822306238
F1 Score: 0.9399901963623717


In [43]:
confusion_matrix(y_test, model.predict(X_test))

array([[3816,   54],
       [ 256,  106]], dtype=int64)

In [44]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

       False       0.94      0.99      0.96      3870
        True       0.66      0.29      0.41       362

    accuracy                           0.93      4232
   macro avg       0.80      0.64      0.68      4232
weighted avg       0.91      0.93      0.91      4232



In [45]:
y_proba = model.predict_proba(X_test)

In [46]:
score = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC AUC: {score:.4f}")

ROC AUC: 0.8457


Huge improvement from 0.68!

Increasing regularization

In [39]:
model = LogisticRegression(solver='liblinear', C=10.0, random_state=777)
model.fit(X_train, y_train)

In [40]:
model.score(X_test,y_test)

0.9286389413988658

In [41]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.9286389413988658
F1 Score: 0.9408269110053435


In [42]:
confusion_matrix(y_test, model.predict(X_test))

array([[3816,   54],
       [ 248,  114]], dtype=int64)

In [43]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

       False       0.94      0.99      0.96      3870
        True       0.68      0.31      0.43       362

    accuracy                           0.93      4232
   macro avg       0.81      0.65      0.70      4232
weighted avg       0.92      0.93      0.92      4232



In [44]:
y_proba = model.predict_proba(X_test)

In [45]:
score = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC AUC: {score:.4f}")

ROC AUC: 0.8466


## Naive Bayes Classification

In [46]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, y_train)

In [47]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.47235349716446123
F1 Score: 0.37641921770442294


In [48]:
confusion_matrix(y_test, model.predict(X_test))

array([[1679, 2191],
       [  42,  320]], dtype=int64)

In [49]:
y_proba = model.predict_proba(X_test)

In [50]:
score = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC AUC: {score:.4f}")

ROC AUC: 0.7681


## LDA

In [51]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [52]:
model = LinearDiscriminantAnalysis(shrinkage=None)
model.fit(X_train, y_train)

In [53]:
model.score(X_test,y_test)

0.9224952741020794

In [54]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.9224952741020794
F1 Score: 0.9304861927144534


In [55]:
confusion_matrix(y_test, model.predict(X_test))

array([[3772,   98],
       [ 230,  132]], dtype=int64)

In [56]:
y_proba = model.predict_proba(X_test)

In [57]:
score = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC AUC: {score:.4f}")

ROC AUC: 0.8369


## QDA

In [58]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [59]:
model = QuadraticDiscriminantAnalysis()
model.fit(X_train, y_train)



In [60]:
model.score(X_test,y_test)

0.389413988657845

In [61]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.389413988657845
F1 Score: 0.2987239895029807


In [62]:
confusion_matrix(y_test, model.predict(X_test))

array([[1325, 2545],
       [  39,  323]], dtype=int64)

In [63]:
y_proba = model.predict_proba(X_test)

In [64]:
score = roc_auc_score(y_test, y_proba[:, 1])
print(f"ROC AUC: {score:.4f}")

ROC AUC: 0.6749
