In [15]:
"""import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install graphviz"""
from sklearn import tree
import graphviz
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import datetime
from pprint import pprint
import pandas as pd
import numpy as np
import re
import math

possible_symptoms = ['fever', 'cough', 'pneumonia', 'pulmonary inflammation', 'sore throat', 'body ache', 'headache', 'nausea', 'pneumonitis', 'fatigue', 'diarrhea', 'dizziness', 'chills', 'chest pain', 'chest tightness', 'runny nose']
def format_age(age):
    age = str(age).replace(" ", "")
    if (re.match(r"^\d+$", age)): # one number
        return float(age)
    elif(re.match(r"^\d+\.\d+$", age)): # decimal
        return float(age)
    elif (re.match(r"^\d+-\d+$", age)): # "number-number"
        ages = age.split('-')
        float(ages[0])+float(ages[1])/2.0
    elif (re.match(r"^\d+-$", age)): # "number-"
        return float(age[:-1])
    else:
        return np.NaN
    
def format_symptoms(symptoms_string):
    symptoms_string = symptoms_string.lower()
    retval = []
    for symptom in possible_symptoms:
        if symptom in symptoms_string:
            retval.append(symptom)
    return tuple(retval)

# features + date_admission_hospital (target)
columns = ['age', 'sex', 'date_admission_hospital', 'symptoms', 'chronic_disease_binary', 'country', 'date_onset_symptoms', 'date_confirmation']
df = pd.read_csv('latestdata.csv', usecols=columns)

# require age, sex and symptoms
df = df[df['age'].notnull() & df['sex'].notnull() & df['symptoms'].notnull() & df['date_onset_symptoms'].notnull() & df['date_confirmation'].notnull()]

# format age and symptoms
df['age'] = df['age'].apply(format_age)
df['symptoms'] = df['symptoms'].apply(format_symptoms)
df = df[df['age'].notnull()]
df['age'] = df['age'].apply(lambda x: int(math.ceil(x / 10.0)) * 10)

# format chronic disease binary
df['chronic_disease_binary'] = df['chronic_disease_binary'].apply(lambda x: 1 if (x is True) else 0)

# get dummy columns for male and female
df = pd.get_dummies(df, columns=['sex'])

# rename target variable to 'hospitalized'
# ***assume lack of hospital date means no hospitalization
df.rename(columns={'date_admission_hospital': 'hospitalized'}, inplace=True)
df['hospitalized'] = df['hospitalized'].notnull().astype('int')

# binarize symptoms
multilabelbinarizer = MultiLabelBinarizer()
multilabel_encoded_results = multilabelbinarizer.fit_transform(df['symptoms'])
df_multilabel_data = pd.DataFrame(multilabel_encoded_results, columns=multilabelbinarizer.classes_)
df.reset_index(drop=True, inplace=True)
df = df.assign(**df_multilabel_data)
del df['symptoms']

# map countries to integers
country_id_map = {}
for country in df['country']:
    if (country not in country_id_map):
        country_id_map[country] = len(country_id_map)
df['country'] = df['country'].apply(lambda x: country_id_map[x])
print(df['date_onset_symptoms'])

# compute days between symptoms and confirmation
onset_to_confirmation_time = []
for i in range(0, len(df['date_onset_symptoms'])):
    onset = datetime.strptime(df['date_onset_symptoms'][i], "%d.%m.%Y")
    confirmation = datetime.strptime(df['date_confirmation'][i], "%d.%m.%Y")
    onset_to_confirmation_time.append((confirmation-onset).days)
df['onset_to_confirmation_time'] = onset_to_confirmation_time
del df['date_onset_symptoms']
del df['date_confirmation']

print(set(df['outcome']))
        



  interactivity=interactivity, compiler=compiler, result=result)


0      03.02.2020
1      28.01.2020
2      21.01.2020
3      03.02.2020
4      12.02.2020
          ...    
641    05.03.2020
642    07.03.2020
643    07.03.2020
644    03.03.2020
645    16.01.2020
Name: date_onset_symptoms, Length: 646, dtype: object


KeyError: 'outcome'

In [18]:
feature_labels = list(df.columns)
feature_labels.remove('hospitalized')
X = pd.DataFrame(df, columns=feature_labels)
y = pd.DataFrame(df, columns=['hospitalized'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf)
y_pred = clf.predict(X_test)
print(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
dot_data = tree.export_graphviz(clf, out_file=None,
    feature_names=X.columns,
    class_names=['Not Hospitalized', 'Hospitalized'],
    filled=True, rounded=True,  
    special_characters=True)  
graph = graphviz.Source(dot_data) 
graph.render("tree")
pprint(dict(zip(X.columns, clf.feature_importances_)))

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
     age  country  chronic_disease_binary  sex_female  sex_male  body ache  \
363   60        5                       0           0         1          0   
444   50        1                       0           1         0          0   
501   40        1                       0           0         1          0   
491   40        1                       0           0         1          0   
520   70        1                       0           1         0          0   
..   ...      ...                     ...         ...       ...        ...   
307   70        7          