In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from bs4 import BeautifulSoup

import os, datetime, json, re, requests

%matplotlib inline

# Working directory of the kernel; later cells build their Data folder paths from it.
current_directory = os.getcwd()

In [None]:
# https://datarepository.wolframcloud.com/resources/Patient-Medical-Data-for-Novel-Coronavirus-COVID-19
# Wolfram obtains this data from https://github.com/beoutbreakprepared/nCoV2019/blob/master/latest_data/latestdata.tar.gz
# Column explanations are in the "Data" section: https://www.nature.com/articles/s41597-020-0448-0#Sec4

# Using web parsing to isolate the download link for the dataset file:
URL_wolf = 'https://datarepository.wolframcloud.com/resources/Patient-Medical-Data-for-Novel-Coronavirus-COVID-19'
URL_wolf_get = requests.get(URL_wolf, timeout=60)
URL_wolf_get.raise_for_status()  # fail loudly instead of parsing an error page
wolf_soup = BeautifulSoup(URL_wolf_get.text, 'html.parser')

# `string=` is the current name of the (deprecated) `text=` filter in BeautifulSoup.
tsv_link = wolf_soup.find("a", string='TSV')
if tsv_link is None:
    raise RuntimeError('Could not find the TSV download link on the Wolfram resource page.')

URL_wolf_tsv = tsv_link['href']  # Isolating the download link from the HTML
URL_wolf_tsv_get = requests.get(URL_wolf_tsv, timeout=60)
URL_wolf_tsv_get.raise_for_status()

# Creating a directory to store data:
## Because there is not much data to manipulate, all data will be stored in this folder.
data_directory = os.path.join(current_directory, 'Data')
os.makedirs(data_directory, exist_ok=True)

# The download is written to disk first and then loaded from that local copy.
# An explicit encoding (and untouched newlines) keeps the write and the read consistent on every platform.
wolf_tsv_path = os.path.join(data_directory, 'Patient-Medical-Data-for-Novel-Coronavirus-COVID-19.tsv')
with open(wolf_tsv_path, 'w', encoding='utf-8', newline='') as writer:
    writer.write(URL_wolf_tsv_get.text)

data_df = pd.read_csv(wolf_tsv_path, sep='\t', encoding='utf-8')

# The resource page states when the data was "made computable"; fall back gracefully if the wording changes.
data_date_match = re.search(r'made computable on (.*?)\.\)', wolf_soup.text)
data_date = data_date_match.group(1) if data_date_match else 'unknown'
print('Date of the data extracted from Wolfram: ', data_date)
data_df.head()

In [None]:
# Loading in a CSV that ties countries to continents:
# https://datahub.io/JohnSnowLabs/country-and-continent-codes-list#resource-country-and-continent-codes-list-csv
# https://www.worldometers.info/geography/how-many-countries-are-there-in-the-world/  <<<<< This says that there are 195 countries but the length of the df is 262
# os.path.join keeps the path valid on non-Windows systems as well.
country_continent_df = pd.read_csv(os.path.join(current_directory, 'Data', 'country-and-continent-codes-list-csv.csv'))

## Adding a few missing countries/differently labelled countries reported in data_df:
missing_countries_dict = {'Country_Name': ['Democratic Republic Congo', 'Ivory Coast', 'Kosovo', 'Republic Congo', 'South Korea'],
                          'Continent_Name': ['Africa', 'Africa', 'Europe', 'Africa', 'Asia']}
missing_countries_df = pd.DataFrame(missing_countries_dict)

# ignore_index avoids duplicate row labels (0-4 would otherwise appear twice after the concat).
country_continent_df = pd.concat([country_continent_df, missing_countries_df], ignore_index=True)

In [None]:
# Columns that are not used in this analysis. City is redundant because Country is kept.
unused_columns = [
    'City',
    'AdministrativeDivision',
    'GeoPosition',
    'LivesInWuhan',
    'LivesInWuhanComment',
    'TravelHistoryDates',
    'TravelHistoryLocation',
    'ReportedMarketExposure',
    'ReportedMarketExposureComment',
    'SequenceAvailable',
]
data_df = data_df.drop(columns=unused_columns)

In [None]:
# Cleaning data_df & adding labels.
#
# The raw TSV stores Wolfram expressions as text (e.g. 'Missing[...]', 'Entity["Country", "..."]',
# 'DateObject[{y, m, d}, "Day", "Gregorian", -5.]'). Each column is converted with a small helper and
# the derived columns (CONTINENT, DAYS_*, LABEL) are added afterwards.
#
# LABEL definition:
#   Discharged patients =  1
#   Death               = -1
#   Unknown             =  0

# This cell consumes the raw text columns, so running it twice on the same frame is an error.
if 'LABEL' in data_df.columns:
    raise RuntimeError('data_df has already been cleaned; re-run the loading cells before re-running this cell.')

# Patterns are compiled once (raw strings avoid invalid-escape warnings).
_country_pattern = re.compile(r'Entity\["Country", "(.*?)"\]')
_joined_words_pattern = re.compile(r"(\w)([A-Z])")
_date_pattern = re.compile(r'DateObject\[{(.*?)}, "Day", "Gregorian", -5.\]')
_quantity_after_comma_pattern = re.compile(r', Quantity\[(.*?)\]')
_quantity_before_comma_pattern = re.compile(r'Quantity\[(.*?)\], ')


def _clean_age(value):
    """Replace age intervals and missing markers with NaN; leave every other value untouched."""
    if isinstance(value, str) and ('Interval' in value or 'Missing' in value):
        return np.nan
    return value


def _encode_sex(value):
    """Encode sex as 0 (male), 1 (female) or -1 (missing/unrecognised)."""
    if isinstance(value, str):
        if 'Male' in value:
            return 0
        if 'Female' in value:
            return 1
    return -1


def _clean_country(value):
    """Extract the country name from an Entity[...] string, or NaN when unavailable."""
    if not isinstance(value, str) or 'Missing' in value:
        return np.nan
    found = _country_pattern.search(value)
    if found is None:
        return np.nan
    # Countries with a space are originally one word (e.g. 'SouthKorea'), so split before capitals.
    return _joined_words_pattern.sub(r"\1 \2", found.group(1))


def _parse_wolfram_date(value, skip_intervals=True):
    """Convert a DateObject[...] string to a datetime.date, or NaN when missing.

    With skip_intervals=True any value containing 'DateInterval' is treated as missing.
    With skip_intervals=False the first DateObject found in the text is used.
    """
    if not isinstance(value, str) or 'Missing' in value:
        return np.nan
    if skip_intervals and 'DateInterval' in value:
        return np.nan
    found = _date_pattern.search(value)
    if found is None:
        return np.nan
    return datetime.datetime.strptime(found.group(1), '%Y, %m, %d').date()


def _strip_list_markup(text):
    """Remove the quotes and braces of a Wolfram list, leaving 'a, b, c'."""
    return text.replace('"', '').replace('{', '').replace('}', '')


def _clean_symptoms(value):
    """Return the symptoms as comma-separated text without Quantity[...] entries, or NaN when missing."""
    if not isinstance(value, str) or 'Missing' in value:
        return np.nan
    if 'Quantity' in value:
        # The Quantity[...] entry may follow or precede a comma; try both positions in that order.
        found = _quantity_after_comma_pattern.search(value) or _quantity_before_comma_pattern.search(value)
        if found is not None:
            value = value.replace(found.group(0), '')
    return _strip_list_markup(value)


def _clean_chronic_diseases(value):
    """Return the chronic diseases as comma-separated text, or NaN when missing."""
    if not isinstance(value, str) or 'Missing' in value:
        return np.nan
    return _strip_list_markup(value)


def _flag_reported(value):
    """Return 0 when the raw value is a Missing[...] marker (or empty) and 1 otherwise."""
    if isinstance(value, str):
        return 0 if 'Missing' in value else 1
    return 0 if pd.isna(value) else int(bool(value))


def _days_between(start_dates, end_dates):
    """Return a float Series with the whole days from start to end; NaN when either date is missing."""
    day_counts = [
        np.nan if (pd.isna(start) or pd.isna(end)) else (end - start).days
        for start, end in zip(start_dates, end_dates)
    ]
    return pd.Series(day_counts, index=start_dates.index, dtype='float64')


# The labels are based on the raw DischargedQ/DeathQ text, so keep it before the columns are recoded.
discharged_raw = data_df['DischargedQ'].copy()
death_raw = data_df['DeathQ'].copy()

## Age, Sex, Country
data_df['Age'] = data_df['Age'].map(_clean_age)
data_df['Sex'] = data_df['Sex'].map(_encode_sex)
data_df['Country'] = data_df['Country'].map(_clean_country)

## DateOfOnsetSymptoms, DateOfAdmissionHospital, DateOfConfirmation (date intervals are discarded)
for date_column in ['DateOfOnsetSymptoms', 'DateOfAdmissionHospital', 'DateOfConfirmation']:
    data_df[date_column] = data_df[date_column].map(_parse_wolfram_date)

## DateOfDeath, DateOfDischarge (only 'Missing' is discarded, as before)
for date_column in ['DateOfDeath', 'DateOfDischarge']:
    data_df[date_column] = data_df[date_column].map(lambda value: _parse_wolfram_date(value, skip_intervals=False))

## Symptoms, ChronicDiseases
data_df['Symptoms'] = data_df['Symptoms'].map(_clean_symptoms)
data_df['ChronicDiseases'] = data_df['ChronicDiseases'].map(_clean_chronic_diseases)

## DischargedQ, DeathQ
data_df['DischargedQ'] = data_df['DischargedQ'].map(_flag_reported)
data_df['DeathQ'] = data_df['DeathQ'].map(_flag_reported)

## Associating Continents with the Country:
# The continent table uses long names, so the first row whose Country_Name contains the country
# is used (substring match). Each distinct country is resolved once instead of once per patient.
_continent_rows = list(zip(country_continent_df['Country_Name'], country_continent_df['Continent_Name']))


def _find_continent(country):
    """Return the continent of the first table row whose name contains `country`, else NaN."""
    for country_name, continent_name in _continent_rows:
        if isinstance(country_name, str) and country in country_name:
            return continent_name
    return np.nan


continent_by_country = {country: _find_continent(country) for country in data_df['Country'].dropna().unique()}
data_df['CONTINENT'] = data_df['Country'].map(continent_by_country)

## Counting the days until the patient dies, DateOfDeath - DateOfOnsetSymptoms:
data_df['DAYS_DEATH'] = _days_between(data_df['DateOfOnsetSymptoms'], data_df['DateOfDeath'])

## Counting the days until the patient recovers, DateOfDischarge - DateOfOnsetSymptoms:
data_df['DAYS_DISCHARGE'] = _days_between(data_df['DateOfOnsetSymptoms'], data_df['DateOfDischarge'])

## Counting the days from confirmation until death, DateOfDeath - DateOfConfirmation:
data_df['DAYS_DEATH_CONFIRM'] = _days_between(data_df['DateOfConfirmation'], data_df['DateOfDeath'])

## Counting the days from confirmation until discharge, DateOfDischarge - DateOfConfirmation:
data_df['DAYS_DISCHARGE_CONFIRM'] = _days_between(data_df['DateOfConfirmation'], data_df['DateOfDischarge'])

## Creating Labels:
# A discharge takes priority over a death when both are reported (same order as the original checks).
is_discharged = (discharged_raw == 'True') | data_df['DateOfDischarge'].notna()
is_deceased = (death_raw == 'True') | data_df['DateOfDeath'].notna()
data_df['LABEL'] = pd.Series(
    np.select([is_discharged, is_deceased], [1, -1], default=0),
    index=data_df.index,
    dtype='float64',  # float keeps the column type produced by the previous row-by-row version
)

In [None]:
# Age intervals (mostly wide ranges) and missing ages were turned into NaN by the cleaning cell;
# those rows are removed here, and the numeric columns are then stored as 32-bit integers.
integer_columns = ['Age', 'Sex', 'DischargedQ', 'DeathQ']
data_df = (
    data_df
    .dropna(subset=['Age'])
    .astype(dict.fromkeys(integer_columns, 'int32'))
)

In [None]:
# Saving the transformed DataFrame:
# os.path.join keeps the path portable; the folder is created if an earlier cell has not done so.
transformed_directory = os.path.join(current_directory, 'Data')
os.makedirs(transformed_directory, exist_ok=True)
data_df.to_csv(os.path.join(transformed_directory, 'data_transformed.csv'))

In [None]:
# Examining the reported deaths and recoveries:
deaths = int((data_df.LABEL == -1).sum())
recovered = int((data_df.LABEL == 1).sum())
total = deaths + recovered

# Both rates exclude patients who have not been reported as recovered or deceased.
# They are fractions in [0, 1]; NaN is reported when no outcome is known at all.
# Global death rate of those infected:
global_death_rate = deaths / total if total else np.nan

# Global survival rate of those infected:
global_survival_rate = recovered / total if total else np.nan

print('Total Cases Reported of Deaths and Recovered Patients: ', total)
print('Reported Deaths: ', deaths)
print('Percentage of Deaths: ', global_death_rate)
print('Reported Recoverd: ', recovered)
print('Percentage of Recoverd: ', global_survival_rate)

In [None]:
# Reference figures from Euronews for comparison with the patient-level data.
# Source: https://www.euronews.com/2020/05/25/covid-19-coronavirus-breakdown-of-deaths-and-infections-worldwide
# The numbers are those published in the article on 5-25-2020.
euronews_death, euronews_recovered = 349290, 2403857
euronews_active, euronews_all = 2616319, 5369466
euronews_death_recovered = sum((euronews_death, euronews_recovered))

# Shares are computed among resolved cases only (deaths + recoveries).
euronews_report = (
    ('Total Cases Reported of Deaths and Recovered Patients: ', euronews_death_recovered),
    ('Reported Deaths: ', euronews_death),
    ('Percentage of Deaths: ', euronews_death / euronews_death_recovered),
    ('Reported Recoverd: ', euronews_recovered),
    ('Percentage of Recoverd: ', euronews_recovered / euronews_death_recovered),
)
for caption, figure in euronews_report:
    print(caption, figure)

In [None]:
# Comparing with Johns Hopkins:
# https://github.com/CSSEGISandData/COVID-19
## Downloading file direct from the GitHub:
### Getting the newest updated data:
def search_dates(date_subtract= 0):
    """Return the zero-padded month and day of a date relative to now.

    Parameters
    ----------
    date_subtract : number of days to go back from the current local time
        (0 means today).

    Returns
    -------
    (month, day) as two-character strings, e.g. ('05', '09').
    """
    target_date = datetime.datetime.now()
    if date_subtract != 0:
        target_date = target_date - datetime.timedelta(days= date_subtract)

    # '%m' and '%d' are always two digits, which is the padding the file names need.
    return target_date.strftime('%m'), target_date.strftime('%d')
    
def URL_download(month, day, max_lookback= 30):
    """Download the most recent available JHU CSSE daily report.

    The report for `month`-`day` is requested first. If that request does not
    return HTTP 200, earlier days are tried one at a time (yesterday, two days
    ago, ...) until a report is found.

    Parameters
    ----------
    month, day : two-character strings as returned by search_dates().
    max_lookback : maximum number of earlier days to try before giving up.

    Returns
    -------
    (URL_request, month, day) : the successful response and the month/day it belongs to.

    Raises
    ------
    RuntimeError if no report is found within `max_lookback` days.

    Notes
    -----
    The year in the file name is fixed to 2020, so only 2020 reports are requested.
    """
    base_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

    URL = base_URL + month + "-" + day + "-2020.csv"
    URL_request = requests.get(URL, timeout= 60)

    counter = 1
    while URL_request.status_code != 200:
        if counter > max_lookback:
            raise RuntimeError('No daily report found within the last ' + str(max_lookback) + ' days.')

        month, day = search_dates(date_subtract = counter)

        URL = base_URL + month + "-" + day + "-2020.csv"
        URL_request = requests.get(URL, timeout= 60)

        # Step one more day back on the next attempt; without this the same day is requested forever.
        counter += 1

    print('Month: ', month, '; Day: ', day)

    return URL_request, month, day


month, day = search_dates()
URL_request, month, day = URL_download(month, day)

# The report is stored next to the other data files; the path is built portably and the folder
# is created if needed. An explicit encoding keeps the write and the read consistent.
john_directory = os.path.join(current_directory, 'Data')
os.makedirs(john_directory, exist_ok=True)
john_path = os.path.join(john_directory, 'csse_covid_19_daily_reports - ' + month + '-' + day + '-2020.csv')

with open(john_path, "w", encoding='utf-8', newline='') as file:
    file.write(URL_request.text)

john_df = pd.read_csv(john_path, encoding='utf-8')

# Column totals. Series.sum() skips blank cells, whereas adding row by row would turn the
# whole total into NaN as soon as one region has no value.
john_death = john_df['Deaths'].sum()
john_recovered = john_df['Recovered'].sum()
john_active = john_df['Active'].sum()
john_all = john_df['Confirmed'].sum()

john_death_recovered = john_death + john_recovered

# Shares are computed among resolved cases only (deaths + recoveries).
print('Total Cases Reported of Deaths and Recovered Patients: ', john_death_recovered)
print('Reported Deaths: ', john_death)
print('Percentage of Deaths: ', john_death / john_death_recovered)
print('Reported Recoverd: ', john_recovered)
print('Percentage of Recoverd: ', john_recovered / john_death_recovered)
print('Total Confirmed Cases: ', john_all)

In [None]:
# Age distribution of the patients with a known outcome (deceased or recovered).
# age_df (deceased rows first, then recovered rows) is reused by the modelling cells below.
deceased_rows = data_df.loc[data_df.LABEL == -1]
recovered_rows = data_df.loc[data_df.LABEL == 1]
age_df = pd.concat([deceased_rows, recovered_rows])

# Transparent boxes so the individual patients drawn by the swarm plot stay visible.
age_ax = sns.boxplot(data= age_df, x= 'LABEL', y= 'Age', boxprops={'facecolor':'None'})
sns.swarmplot(data= age_df, x= 'LABEL', y= 'Age', ax= age_ax)
age_ax.set_ylim([0,100])
age_ax.set_xticks([0, 1])
age_ax.set_xticklabels(['Deceased', 'Recovered'])
age_ax.set_xlabel('Patient Status')
plt.show()

In [None]:
# Length of infection, counted from the first day of symptoms until death (left panel)
# or until discharge (right panel).
death_df = data_df[data_df.LABEL == -1]
recovered_df = data_df[data_df.LABEL == 1]

# One panel per outcome: (subplot position, patients, day-count column, tick label).
infection_panels = (
    (1, death_df, 'DAYS_DEATH', 'Deceased'),
    (2, recovered_df, 'DAYS_DISCHARGE', 'Recovered'),
)

for panel_position, outcome_df, days_column, status_name in infection_panels:
    plt.subplot(1, 2, panel_position)

    # Only patients with a known day count can be drawn.
    known_days_df = outcome_df[outcome_df[days_column].notna()]
    sns.boxplot(x= 'LABEL', y= days_column, data= known_days_df, boxprops={'facecolor':'None'})
    sns.swarmplot(x= 'LABEL', y= days_column, data= known_days_df)

    plt.ylim([0,50])
    plt.xticks([0], [status_name])
    plt.ylabel('Length of Infection')
    plt.xlabel('Patient Status')

plt.tight_layout()
plt.show()

In [None]:
# Determining the statistics:
# Series.min()/.max() are used instead of the built-in min()/max(): the built-ins give
# order-dependent results when a Series contains NaN, which the day counts do.
deceased_ages = data_df.Age[data_df.LABEL == -1]
recovered_ages = data_df.Age[data_df.LABEL == 1]
deceased_days = data_df.DAYS_DEATH[data_df.LABEL == -1]
recovered_days = data_df.DAYS_DISCHARGE[data_df.LABEL == 1]

## Age
death_min_age = deceased_ages.min()
death_max_age = deceased_ages.max()
death_mean_age = np.mean(deceased_ages)
death_mean_std = np.std(deceased_ages)

recovered_min_age = recovered_ages.min()
recovered_max_age = recovered_ages.max()
recovered_mean_age = np.mean(recovered_ages)

## Infection Length
death_min_days = deceased_days.min()
death_max_days = deceased_days.max()
death_mean_days = np.mean(deceased_days)

recovered_min_days = recovered_days.min()
recovered_max_days = recovered_days.max()
recovered_mean_days = np.mean(recovered_days)

## Of the patients who are between death_mean_age - death_mean_std and death_mean_age + death_mean_std,
## what is the death rate:
start_range = death_mean_age - death_mean_std
end_range = death_mean_age + death_mean_std

# between() includes both end points, matching `start_range <= age <= end_range`.
counter_death = int(deceased_ages.between(start_range, end_range).sum())
counter_recover = int(recovered_ages.between(start_range, end_range).sum())

outcomes_in_range = counter_death + counter_recover
death_rate_age_range = counter_death / outcomes_in_range if outcomes_in_range else np.nan

print('Minimun Recorded Age for Deceased Patients: ', death_min_age)
print('Maximum Recorded Age for Deceased Patients: ', death_max_age)
print('Average Recorded Age for Deceased Patients: ', death_mean_age)
print('Minimun Recorded Age for Recovered Patients: ', recovered_min_age)
print('Maximum Recorded Age for Recovered Patients: ', recovered_max_age)
print('Average Recorded Age for Recovered Patients: ', recovered_mean_age)

print('Minimun Recorded Length of Infection Until Deceased: ', death_min_days)
print('Maximum Recorded Length of Infection Until Deceased: ', death_max_days)
print('Average Recorded Length of Infection Until Deceased: ', death_mean_days)
print('Minimun Recorded Length of Infection Until Recovery: ', recovered_min_days)
print('Maximum Recorded Length of Infection Until Recovery: ', recovered_max_days)
print('Average Recorded Length of Infection Until Recovery: ', recovered_mean_days)

print('Death Rate for Patients who are Between Ages ', int(start_range), ' and ', int(end_range), ': ', death_rate_age_range)

In [None]:
# Using Random Forest to predict survival outcome only using Age, Sex, ChronicDiseaseQ, and DAYS_TO_LABEL.
# NOTE(review): DAYS_TO_LABEL is taken from the outcome-specific day count and its gaps are filled with
# the outcome-specific mean, so this feature carries information about the label. Also, n_estimators is
# chosen on the test split, so the reported score is optimistic.
random_seed = 42  # fixed so the split and the forests are reproducible

ml_df = age_df.copy() # This only considers patients who are deceased or recovered

# Combining the DAYS_DEATH & DAYS_DISCHARGE columns:
# take the day count that belongs to the patient's outcome and fall back to that outcome's mean
# whenever it is unknown (even if the other outcome's count happens to be present).
is_deceased = ml_df['LABEL'] == -1
outcome_days = ml_df['DAYS_DEATH'].where(is_deceased, ml_df['DAYS_DISCHARGE'])
outcome_mean_days = pd.Series(np.where(is_deceased, death_mean_days, recovered_mean_days), index= ml_df.index)
ml_df['DAYS_TO_LABEL'] = outcome_days.fillna(outcome_mean_days)

# Dropping: Country, DateOfOnsetSymptoms, DateOfAdmissionHospital, DateOfConfirmation, Symptoms, ChronicDiseases,
# DischargedQ, DeathQ, DateOfDeath, DateOfDischarge, CONTINENT, DAYS_DEATH, DAYS_DISCHARGE, DAYS_DEATH_CONFIRM,
# DAYS_DISCHARGE_CONFIRM
ml_df = ml_df.drop(['Country', 'DateOfOnsetSymptoms', 'DateOfAdmissionHospital', 'DateOfConfirmation', 'Symptoms', 'ChronicDiseases', 'DischargedQ', 'DeathQ', 'DateOfDeath', 'DateOfDischarge', 'CONTINENT', 'DAYS_DEATH', 'DAYS_DISCHARGE', 'DAYS_DEATH_CONFIRM', 'DAYS_DISCHARGE_CONFIRM'],
                   axis= 1)

ml_variables = ml_df.loc[:, ['Age', 'Sex', 'ChronicDiseaseQ', 'DAYS_TO_LABEL']]
ml_labels = ml_df['LABEL']

X_train, X_test, Y_train, Y_test = train_test_split(ml_variables, ml_labels, test_size= .25, random_state= random_seed)

# Try every forest size and keep the best fitted model, so that the printed score, the
# classification report and the feature importances all describe the same model.
best_tree = 0
best_score = -1.0
rf_model = None

for trees in range(64, 129):
    candidate_model = RandomForestClassifier(n_estimators= trees, max_features= 'sqrt', random_state= random_seed)
    candidate_model.fit(X_train, Y_train)
    rf_score = candidate_model.score(X_test, Y_test)

    if rf_score > best_score:
        best_tree = trees
        best_score = rf_score
        rf_model = candidate_model

rf_predict = rf_model.predict(X_test)

feature_importance = list(rf_model.feature_importances_)

# One row holding the importance of every feature, in column order.
feature_df = pd.DataFrame([feature_importance], columns= list(ml_variables.columns.values))

print('Best Accuracy Score (Classification): ', best_score)
print(classification_report(Y_test, rf_predict))
feature_df

In [None]:
# Using Random Forest to predict survival outcome based on the Age, Sex, and symptoms.
# NOTE(review): n_estimators is chosen on the test split, so the reported score is optimistic.
random_seed = 42  # fixed so the split and the forests are reproducible

ml_df = age_df.copy() # This only considers patients who are deceased or recovered

# Turning Symptoms values into 0/1 columns:
# get_dummies splits each 'a, b, c' entry and matches whole symptom names, so 'cough' is not
# switched on by 'dry cough'. Patients without symptom information get 0 everywhere.
symptom_dummies = ml_df['Symptoms'].str.get_dummies(sep= ', ')
symptom_list = list(symptom_dummies.columns)
ml_df = pd.concat([ml_df, symptom_dummies], axis= 1)

# Dropping: Country, DateOfOnsetSymptoms, DateOfAdmissionHospital, DateOfConfirmation, Symptoms, ChronicDiseases,
# DischargedQ, DeathQ, DateOfDeath, DateOfDischarge, CONTINENT, ChronicDiseaseQ, DAYS_DEATH, DAYS_DISCHARGE,
# DAYS_DEATH_CONFIRM, DAYS_DISCHARGE_CONFIRM
ml_df = ml_df.drop(['Country', 'ChronicDiseaseQ', 'DateOfOnsetSymptoms', 'DateOfAdmissionHospital', 'DateOfConfirmation', 'Symptoms', 'ChronicDiseases', 'DischargedQ', 'DeathQ', 'DateOfDeath', 'DateOfDischarge', 'CONTINENT', 'DAYS_DEATH', 'DAYS_DISCHARGE', 'DAYS_DEATH_CONFIRM', 'DAYS_DISCHARGE_CONFIRM'],
                   axis= 1)

# Every remaining column except the target is a feature (selected by name, not by position).
variables_list = [column for column in ml_df.columns.values if column != 'LABEL']

ml_variables = ml_df.loc[:, variables_list]
ml_labels = ml_df['LABEL']

X_train, X_test, Y_train, Y_test = train_test_split(ml_variables, ml_labels, test_size= .25, random_state= random_seed)

# Try every forest size and keep the best fitted model, so that the printed score, the
# classification report and the feature importances all describe the same model.
best_tree = 0
best_score = -1.0
rf_model = None

for trees in range(64, 129):
    candidate_model = RandomForestClassifier(n_estimators= trees, max_features= 'sqrt', random_state= random_seed)
    candidate_model.fit(X_train, Y_train)
    rf_score = candidate_model.score(X_test, Y_test)

    if rf_score > best_score:
        best_tree = trees
        best_score = rf_score
        rf_model = candidate_model

rf_predict = rf_model.predict(X_test)

feature_importance = list(rf_model.feature_importances_)

# One row holding the importance of every feature, in column order.
feature_df = pd.DataFrame([feature_importance], columns= variables_list)

print('Best Accuracy Score (Classification): ', best_score)
print(classification_report(Y_test, rf_predict))
feature_df

In [None]:
# Using Random Forest to predict survival outcome based on the Age, Sex, and prior chronic illnesses.
# NOTE(review): n_estimators is chosen on the test split, so the reported score is optimistic.
random_seed = 42  # fixed so the split and the forests are reproducible

ml_df = age_df.copy() # This only considers patients who are deceased or recovered

# Turning ChronicDiseases values into 0/1 columns:
# get_dummies splits each 'a, b, c' entry and matches whole disease names, so a short name is not
# switched on by a longer name that contains it. Patients without information get 0 everywhere.
chronic_dummies = ml_df['ChronicDiseases'].str.get_dummies(sep= ', ')
chronic_list = list(chronic_dummies.columns)
ml_df = pd.concat([ml_df, chronic_dummies], axis= 1)

# Dropping: Country, DateOfOnsetSymptoms, DateOfAdmissionHospital, DateOfConfirmation, Symptoms, ChronicDiseases,
# DischargedQ, DeathQ, DateOfDeath, DateOfDischarge, CONTINENT, ChronicDiseaseQ, DAYS_DEATH, DAYS_DISCHARGE,
# DAYS_DEATH_CONFIRM, DAYS_DISCHARGE_CONFIRM
ml_df = ml_df.drop(['Country', 'ChronicDiseaseQ', 'DateOfOnsetSymptoms', 'DateOfAdmissionHospital', 'DateOfConfirmation', 'Symptoms', 'ChronicDiseases', 'DischargedQ', 'DeathQ', 'DateOfDeath', 'DateOfDischarge', 'CONTINENT', 'DAYS_DEATH', 'DAYS_DISCHARGE', 'DAYS_DEATH_CONFIRM', 'DAYS_DISCHARGE_CONFIRM'],
                   axis= 1)

# Every remaining column except the target is a feature (selected by name, not by position).
variables_list = [column for column in ml_df.columns.values if column != 'LABEL']

ml_variables = ml_df.loc[:, variables_list]
ml_labels = ml_df['LABEL']

X_train, X_test, Y_train, Y_test = train_test_split(ml_variables, ml_labels, test_size= .25, random_state= random_seed)

# Try every forest size and keep the best fitted model, so that the printed score, the
# classification report and the feature importances all describe the same model.
best_tree = 0
best_score = -1.0
rf_model = None

for trees in range(64, 129):
    candidate_model = RandomForestClassifier(n_estimators= trees, max_features= 'sqrt', random_state= random_seed)
    candidate_model.fit(X_train, Y_train)
    rf_score = candidate_model.score(X_test, Y_test)

    if rf_score > best_score:
        best_tree = trees
        best_score = rf_score
        rf_model = candidate_model

rf_predict = rf_model.predict(X_test)

feature_importance = list(rf_model.feature_importances_)

# One row holding the importance of every feature, in column order.
feature_df = pd.DataFrame([feature_importance], columns= variables_list)

print('Best Accuracy Score (Classification): ', best_score)
print(classification_report(Y_test, rf_predict))
feature_df

In [None]:
# Using Random Forest to predict survival outcome based on the Age, Sex, symptoms, and prior chronic illnesses.
# NOTE(review): n_estimators is chosen on the test split, so the reported score is optimistic.
random_seed = 42  # fixed so the split and the forests are reproducible

ml_df = age_df.copy() # This only considers patients who are deceased or recovered

# Turning Symptoms and ChronicDiseases values into 0/1 columns:
# get_dummies splits each 'a, b, c' entry and matches whole names, so 'cough' is not switched on
# by 'dry cough'. Patients without information get 0 everywhere.
symptom_dummies = ml_df['Symptoms'].str.get_dummies(sep= ', ')
chronic_dummies = ml_df['ChronicDiseases'].str.get_dummies(sep= ', ')
symptom_list = list(symptom_dummies.columns)
chronic_list = list(chronic_dummies.columns)

# A name that occurs both as a symptom and as a chronic disease becomes a single column that is 1
# when either source reports it.
indicator_df = symptom_dummies.add(chronic_dummies, fill_value= 0).clip(upper= 1).astype(int)
ml_df = pd.concat([ml_df, indicator_df], axis= 1)

# Dropping: Country, DateOfOnsetSymptoms, DateOfAdmissionHospital, DateOfConfirmation, Symptoms, ChronicDiseases,
# DischargedQ, DeathQ, DateOfDeath, DateOfDischarge, CONTINENT, ChronicDiseaseQ, DAYS_DEATH, DAYS_DISCHARGE,
# DAYS_DEATH_CONFIRM, DAYS_DISCHARGE_CONFIRM
ml_df = ml_df.drop(['Country', 'ChronicDiseaseQ', 'DateOfOnsetSymptoms', 'DateOfAdmissionHospital', 'DateOfConfirmation', 'Symptoms', 'ChronicDiseases', 'DischargedQ', 'DeathQ', 'DateOfDeath', 'DateOfDischarge', 'CONTINENT', 'DAYS_DEATH', 'DAYS_DISCHARGE', 'DAYS_DEATH_CONFIRM', 'DAYS_DISCHARGE_CONFIRM'],
                   axis= 1)

# Every remaining column except the target is a feature (selected by name, not by position).
variables_list = [column for column in ml_df.columns.values if column != 'LABEL']

ml_variables = ml_df.loc[:, variables_list]
ml_labels = ml_df['LABEL']

X_train, X_test, Y_train, Y_test = train_test_split(ml_variables, ml_labels, test_size= .25, random_state= random_seed)

# Try every forest size and keep the best fitted model, so that the printed score, the
# classification report and the feature importances all describe the same model.
best_tree = 0
best_score = -1.0
rf_model = None

for trees in range(64, 129):
    candidate_model = RandomForestClassifier(n_estimators= trees, max_features= 'sqrt', random_state= random_seed)
    candidate_model.fit(X_train, Y_train)
    rf_score = candidate_model.score(X_test, Y_test)

    if rf_score > best_score:
        best_tree = trees
        best_score = rf_score
        rf_model = candidate_model

rf_predict = rf_model.predict(X_test)

feature_importance = list(rf_model.feature_importances_)

# One row holding the importance of every feature, in column order.
feature_df = pd.DataFrame([feature_importance], columns= variables_list)

print('Best Accuracy Score (Classification): ', best_score)
print(classification_report(Y_test, rf_predict))
feature_df

In [None]:
# Using Random Forest to predict survival outcome based on all the variables used for predictions in the
# previous cells: Age, Sex, symptoms, prior chronic illnesses, and DAYS_TO_LABEL.
# NOTE(review): DAYS_TO_LABEL is taken from the outcome-specific day count and its gaps are filled with
# the outcome-specific mean, so this feature carries information about the label. Also, n_estimators is
# chosen on the test split, so the reported score is optimistic.
random_seed = 42  # fixed so the split and the forests are reproducible

ml_df = age_df.copy() # This only considers patients who are deceased or recovered

# Turning Symptoms and ChronicDiseases values into 0/1 columns:
# get_dummies splits each 'a, b, c' entry and matches whole names, so 'cough' is not switched on
# by 'dry cough'. Patients without information get 0 everywhere.
symptom_dummies = ml_df['Symptoms'].str.get_dummies(sep= ', ')
chronic_dummies = ml_df['ChronicDiseases'].str.get_dummies(sep= ', ')
symptom_list = list(symptom_dummies.columns)
chronic_list = list(chronic_dummies.columns)

# A name that occurs both as a symptom and as a chronic disease becomes a single column that is 1
# when either source reports it.
indicator_df = symptom_dummies.add(chronic_dummies, fill_value= 0).clip(upper= 1).astype(int)
ml_df = pd.concat([ml_df, indicator_df], axis= 1)

# Combining the DAYS_DEATH & DAYS_DISCHARGE columns:
# take the day count that belongs to the patient's outcome and fall back to that outcome's mean
# whenever it is unknown (even if the other outcome's count happens to be present).
is_deceased = ml_df['LABEL'] == -1
outcome_days = ml_df['DAYS_DEATH'].where(is_deceased, ml_df['DAYS_DISCHARGE'])
outcome_mean_days = pd.Series(np.where(is_deceased, death_mean_days, recovered_mean_days), index= ml_df.index)
ml_df['DAYS_TO_LABEL'] = outcome_days.fillna(outcome_mean_days)

# Dropping: Country, DateOfOnsetSymptoms, DateOfAdmissionHospital, DateOfConfirmation, Symptoms, ChronicDiseases,
# DischargedQ, DeathQ, DateOfDeath, DateOfDischarge, CONTINENT, ChronicDiseaseQ, DAYS_DEATH, DAYS_DISCHARGE,
# DAYS_DEATH_CONFIRM, DAYS_DISCHARGE_CONFIRM
ml_df = ml_df.drop(['Country', 'ChronicDiseaseQ', 'DateOfOnsetSymptoms', 'DateOfAdmissionHospital', 'DateOfConfirmation', 'Symptoms', 'ChronicDiseases', 'DischargedQ', 'DeathQ', 'DateOfDeath', 'DateOfDischarge', 'CONTINENT', 'DAYS_DEATH', 'DAYS_DISCHARGE', 'DAYS_DEATH_CONFIRM', 'DAYS_DISCHARGE_CONFIRM'],
                   axis= 1)

# Every remaining column except the target is a feature (selected by name, not by position).
variables_list = [column for column in ml_df.columns.values if column != 'LABEL']

ml_variables = ml_df.loc[:, variables_list]
ml_labels = ml_df['LABEL']

X_train, X_test, Y_train, Y_test = train_test_split(ml_variables, ml_labels, test_size= .25, random_state= random_seed)

# Try every forest size and keep the best fitted model, so that the printed score, the
# classification report and the feature importances all describe the same model.
best_tree = 0
best_score = -1.0
rf_model = None

for trees in range(64, 129):
    candidate_model = RandomForestClassifier(n_estimators= trees, max_features= 'sqrt', random_state= random_seed)
    candidate_model.fit(X_train, Y_train)
    rf_score = candidate_model.score(X_test, Y_test)

    if rf_score > best_score:
        best_tree = trees
        best_score = rf_score
        rf_model = candidate_model

rf_predict = rf_model.predict(X_test)

feature_importance = list(rf_model.feature_importances_)

# One row holding the importance of every feature, in column order.
feature_df = pd.DataFrame([feature_importance], columns= variables_list)

print('Best Accuracy Score (Classification): ', best_score)
print(classification_report(Y_test, rf_predict))
feature_df