# Motivation: deadly COVID-19 outcomes


## 1. Custom Decision Tree induction algorithm

Here is a copy of our Decision Tree implementation.

In [None]:
class DecisionNode:
    """One node of a binary decision tree.

    A node that splits the data carries the split test (``col`` and
    ``value``) together with its two children (``tb`` and ``fb``).
    A node without children carries ``results`` instead.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split test: index of the attribute and the value it is compared with.
        self.col, self.value = col, value
        # Class labels with their counts - stored when the node has no children.
        self.results = results
        # Children: tb is the True branch, fb is the False branch.
        self.tb, self.fb = tb, fb
        
def split(rows, column, value):
    """Partition ``rows`` into two lists using a test on a single column.

    The test depends on the type of ``value``:

    * ``int`` or ``float`` - a row passes when ``row[column] >= value``;
    * anything else       - a row passes when ``row[column] == value``.

    Args:
        rows: iterable of indexable rows.
        column: index of the attribute to test.
        value: the value to compare each row's attribute with.

    Returns:
        A tuple ``(passed, failed)`` of two lists; the relative order of the
        rows inside each list follows their order in ``rows``.
    """
    # Decide once which comparison applies instead of re-checking per row.
    numeric = isinstance(value, (int, float))

    passed, failed = [], []
    # Single pass: every row is tested exactly once and routed to one side.
    for row in rows:
        cell = row[column]
        satisfies = cell >= value if numeric else cell == value
        if satisfies:
            passed.append(row)
        else:
            failed.append(row)
    return (passed, failed)

def count_labels(rows):
    """Tally how many rows carry each class label.

    The class label is taken from the last column of every row.

    Returns:
        A dict mapping each label to the number of rows that have it.
    """
    tally = {}
    for record in rows:
        target = record[-1]
        tally[target] = tally.get(target, 0) + 1
    return tally

from math import log

def gini_impurity(rows):
    """Return the Gini impurity of the class labels in ``rows``.

    The impurity is ``1 - sum(p_i ** 2)`` where ``p_i`` is the fraction of
    rows that carry label ``i`` (labels are counted by ``count_labels``).

    Args:
        rows: list of rows with the class label in the last column.

    Returns:
        The impurity as a number; ``0.0`` for an empty list, so that an
        empty subset is treated as pure, like ``variance`` and ``entropy`` do.
    """
    total = len(rows)
    if total == 0:
        # Without this guard the sum below is empty and the result would be 1.
        return 0.0

    sum_of_squares = 0
    for count in count_labels(rows).values():
        p = count / total
        sum_of_squares += p * p

    return 1 - sum_of_squares

def entropy(rows):
    """Return the base-2 entropy of the class labels in ``rows``.

    Computed as ``-sum(p_i * log2(p_i))`` where ``p_i`` is the fraction of
    rows that carry label ``i`` (labels are counted by ``count_labels``).
    """
    n_rows = len(rows)
    bits = 0.0
    for occurrences in count_labels(rows).values():
        share = occurrences / n_rows
        bits -= share * log(share, 2)
    return bits


def variance(rows):
    """Return the population variance of the last column of ``rows``.

    Every label is converted with ``float`` first. An empty input yields 0.
    """
    if not len(rows):
        return 0

    targets = [float(record[-1]) for record in rows]
    n_targets = len(targets)
    center = sum(targets) / n_targets
    squared_deviations = ((t - center) ** 2 for t in targets)
    return sum(squared_deviations) / n_targets

def buildtree(rows, score_func=entropy, min_improvement=0, min_samples=0, max_depth=None, depth=0):
    """Recursively induce a binary decision tree from ``rows``.

    Every unique value of every feature column is tried as a split point
    (see ``split``); the split with the lowest weighted score of its two
    subsets is kept, and the procedure recurses into both subsets.

    Args:
        rows: list of rows; the last column holds the class (target).
        score_func: impurity measure called on a list of rows; lower is better.
        min_improvement: a split is accepted only if it lowers the score
            by more than this amount.
        min_samples: a split is accepted only if both subsets contain
            more than this many rows.
        max_depth: nodes at this depth are not split any further;
            ``None`` means no limit.
        depth: depth of the node being built (used by the recursion).

    Returns:
        A ``DecisionNode``: an internal node with ``col``, ``value``, ``tb``
        and ``fb`` set when an acceptable split exists, otherwise a leaf
        whose ``results`` holds the label counts of ``rows``. Empty input
        gives a leaf with empty ``results``.
    """
    # Nothing to split (no rows) or not allowed to split (depth limit reached):
    # return a leaf right away and skip the expensive search for a split.
    if len(rows) == 0 or (max_depth is not None and depth >= max_depth):
        return DecisionNode(results=count_labels(rows))

    # Compute overall score for the entire rows dataset
    current_score = score_func(rows)

    # Set up accumulator variables to track the best split criteria
    best_score = current_score
    best_criteria = None
    best_sets = None

    # Total number of features - except the last column where we store the class (target)
    column_count = len(rows[0]) - 1
    for col in range(column_count):
        # Try splitting the rows on each unique value in this column
        for value in {row[col] for row in rows}:
            set1, set2 = split(rows, col, value)

            # Both subsets must be strictly larger than min_samples;
            # rejected candidates do not need to be scored at all.
            if len(set1) <= min_samples or len(set2) <= min_samples:
                continue

            # Evaluate the quality of the split:
            # p is the proportion of subset set1
            p = len(set1) / len(rows)
            split_score = p * score_func(set1) + (1 - p) * score_func(set2)

            if split_score < best_score and (current_score - split_score) > min_improvement:
                best_score = split_score
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # No acceptable split was found - summarize class labels in a leaf node.
    # Testing best_criteria (rather than the score difference) guarantees
    # that best_sets is never dereferenced while it is still None.
    if best_criteria is None:
        return DecisionNode(results=count_labels(rows))

    # Create the sub branches
    true_branch = buildtree(best_sets[0], score_func, min_improvement, min_samples, max_depth, depth + 1)
    false_branch = buildtree(best_sets[1], score_func, min_improvement, min_samples, max_depth, depth + 1)
    return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                        tb=true_branch, fb=false_branch)

def prediction(leaf_labels):
    """Convert the label counts of a leaf into percentage strings.

    Args:
        leaf_labels: dict mapping each class label to its count.

    Returns:
        A new dict mapping each label to its share of the total count,
        rounded down to a whole percent and formatted like ``"75%"``.
        An empty input gives an empty dict.
    """
    total = sum(leaf_labels.values())
    # Multiply before dividing and use floor division: going through a float
    # fraction first can lose a percent to rounding (29/100*100 -> 28.99...).
    return {
        label: str(int(count * 100 // total)) + "%"
        for label, count in leaf_labels.items()
    }

def print_tree(tree, current_branch, attributes=None, indent='', leaf_funct=prediction):
    """Print a decision tree as indented text, one node per line.

    Args:
        tree: the ``DecisionNode`` to print (with everything below it).
        current_branch: text printed in front of this node; the children
            are printed with ``'T->'`` and ``'F->'``.
        attributes: optional sequence of attribute names indexed by column;
            when omitted the column index is printed instead.
        indent: whitespace prefix for this node; grows by two spaces per level.
        leaf_funct: function applied to the ``results`` of every leaf before
            printing.
    """
    # Is this a leaf node?
    if tree.results is not None:
        print(indent + current_branch + str(leaf_funct(tree.results)))
        return

    # Print the split question
    if attributes is None:
        split_col = str(tree.col)
    else:
        # str() keeps the concatenation below working for non-string names.
        split_col = str(attributes[tree.col])

    split_val = str(tree.value)
    # Same type test as in split(): numeric values are compared with >=.
    if isinstance(tree.value, (int, float)):
        split_val = ">=" + split_val
    print(indent + current_branch + split_col + ': ' + split_val + '? ')

    # Print the branches, passing leaf_funct on so that a custom formatter
    # is applied to the leaves at every level, not only at the root.
    child_indent = indent + '  '
    print_tree(tree.tb, 'T->', attributes, child_indent, leaf_funct)
    print_tree(tree.fb, 'F->', attributes, child_indent, leaf_funct)

## 2. Coronavirus risk factors

As discussed in the lecture, decision trees can be used not only for classification/prediction, but also to find out which attributes are most important in classifying a record into a specific class. In this part we want to find out which symptoms/chronic conditions contribute most to a deadly outcome of COVID-19.

This Mexican dataset, which contains information from the Statistical Yearbooks of Morbidity 2015-2017 (as well as information regarding cases associated with COVID-19), was found on [kaggle](https://www.kaggle.com/tanmoyx/covid19-patient-precondition-dataset).

Download the preprocessed dataset, which contains only data about patients who tested positive for COVID-19, with the symptom attributes converted to categorical: [link](https://drive.google.com/file/d/1uVd09ekR1ArLrA8qN-Xtu4l-FFbmetVy/view?usp=sharing).

In this dataset we have the following attributes:
1. sex: 1 - woman, 2 - man
2. age: numeric
3. diabetes: yes/no
4. copd (chronic obstructive pulmonary disease): yes/no
5. asthma: yes/no
6. imm_supr (suppressed immune system): yes/no
7. hypertension: yes/no
8. cardiovascular: yes/no
9. renal_chronic: yes/no
10. tobacco: yes/no	
11. outcome: alive/dead

In [None]:
# Relative path to the COVID-19 CSV file; adjust it to where the downloaded file was saved.
data_file = "../data_sets/covid_categorical_good.csv"

In [None]:
import pandas as pd
# Load the CSV file into a DataFrame.
data = pd.read_csv(data_file)
# Drop every row that has at least one missing value.
data = data.dropna(how="any")
# Last expression of the cell: displays the column names.
data.columns

In [None]:
# Convert the DataFrame into a plain list of row lists - the input format of buildtree.
data_rows = data.to_numpy().tolist()
# Last expression of the cell: displays the number of rows.
len(data_rows)

In [None]:
# Column names as a plain list; passed to print_tree below as the attribute names.
columns_list = data.columns.to_numpy().tolist()
print(columns_list)

Build decision tree using our custom algorithm:

In [None]:
# Entropy-based tree: a split is kept only if both sides have more than 30 rows,
# and nodes at depth 7 are not split any further.
tree = buildtree(data_rows, score_func=entropy, min_improvement=0, min_samples=30, max_depth=7)

In [None]:
# Print the tree from the root (empty branch prefix), labelling splits with the column names.
print_tree(tree, '', columns_list)

What are the most important comorbidity factors? Hard to tell. 
We will try to discover them more efficiently in this project using classification rules.

Copyright &copy; 2022 Marina Barsky. All rights reserved.