In [0]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

! wget --quiet https://github.com/virufy/covid/tree/master/data/labels.csv

# Explore the data

In [0]:
labels_df = pd.read_csv('labels.csv')
labels_df = labels_df.drop(columns=["date", "cough_filename"])

In [3]:
labels_df.head(10)

Unnamed: 0,corona_test,age,gender,medical_history,smoker,patient_reported_symptoms
0,negative,53,male,"none,",yes,"none,"
1,positive,50,male,"Congestive heart failure,",no,"Shortness of breath,"
2,negative,43,male,"none,",no,"Sore throat,"
3,positive,65,male,"Asthma or chronic lung disease,",no,"Shortness of breath,New or worsening cough,"
4,positive,40,female,"none,",no,"Sore throat,Loss of taste,Loss of smell,"
5,negative,66,female,"Diabetes with complications,",no,"none,"
6,negative,20,female,"none,",no,"none,"
7,negative,17,female,"none,",no,"Shortness of breath,Sore throat,Body aches,"
8,negative,47,male,"none,",no,"New or worsening cough,"
9,positive,53,male,"none,",no,"Fever, chills, or sweating,Shortness of breath..."


In [4]:
labels_df.dtypes

corona_test                  object
age                           int64
gender                       object
medical_history              object
smoker                       object
patient_reported_symptoms    object
dtype: object

In [5]:
labels_df.isna().sum()

corona_test                  0
age                          0
gender                       0
medical_history              0
smoker                       0
patient_reported_symptoms    0
dtype: int64

In [6]:
corona_positive = len(labels_df[labels_df['corona_test'] == 'negative'])
corona_negative = len(labels_df[labels_df['corona_test'] == 'positive'])
print('Negative:', corona_negative)
print('Positive:', corona_positive)

Negative: 7
Positive: 9


In [7]:
labels_df['medical_history'].unique()

array(['none,', 'Congestive heart failure,',
       'Asthma or chronic lung disease,', 'Diabetes with complications,'],
      dtype=object)

In [8]:
labels_df['patient_reported_symptoms'].unique()

array(['none,', 'Shortness of breath,', 'Sore throat,',
       'Shortness of breath,New or worsening cough,',
       'Sore throat,Loss of taste,Loss of smell,',
       'Shortness of breath,Sore throat,Body aches,',
       'New or worsening cough,',
       'Fever, chills, or sweating,Shortness of breath,New or worsening cough,Sore throat,Loss of taste,Loss of smell,',
       'Fever, chills, or sweating,New or worsening cough,Sore throat,'],
      dtype=object)

# Text Model

In [0]:
# The values of the medical history feature
med_hisotry = ['none', 'Congestive heart failure',
       'Asthma or chronic lung disease', 'Diabetes with complications']
# The values of the symptom feature
symptoms = ['Fever, chills, or sweating', 'Shortness of breath', 'New or worsening cough',
       'Sore throat', 'Body aches', 'Loss of taste', 'Loss of smell', 'none']

In [0]:
class Text:
    def __init__(self, df):
        """
        @param df  Format "date, corona_test, age, gender, medical_history, 
        smoker, patient_reported_symptoms"
        """
        self.df_features = df.drop(columns = ["corona_test"])
        self.targets = df["corona_test"] 

    def preprocess(self):
        """
        @return labels_df in one-hot-encoded format
        """
        # Change the value type of targets to int
        self.targets.replace(to_replace ="negative", value =0, inplace=True)
        self.targets.replace(to_replace ="positive", value =1, inplace=True) 
        # Encode the information of medical history
        for m in med_hisotry:
            self.df_features[m] = self.df_features.medical_history.str.contains(m).astype(int)
        self.df_features.rename(columns={'none':'no_med_history'}, inplace=True)
        # Encode the information of symptoms
        for s in symptoms:
            self.df_features[s] = self.df_features.patient_reported_symptoms.str.contains(s).astype(int)
        self.df_features.rename(columns={'none':'no_symptoms'}, inplace=True)
        self.df_features = self.df_features.drop(columns = ["medical_history", "patient_reported_symptoms"])
        # Encode the gender and smoker information
        self.df_features['gender'] = LabelEncoder().fit_transform(self.df_features['gender'])
        self.df_features['smoker'] = LabelEncoder().fit_transform(self.df_features['smoker'])
   
    def split_train_test(self, size=0.25, rand=22, shuffle=True):
        """
        @return the X_train, X_test, y_train, y_test values
        """
        return train_test_split(self.df_features, self.targets, test_size=size, random_state=rand, shuffle=shuffle)

In [0]:
model = Text(labels_df.copy())
model.preprocess()
X_train, X_test, y_train, y_test = model.split_train_test()