# Identifying high flyers

In [2]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier
# Seed every RNG the notebook relies on so a fresh "Restart & Run All" repeats
# the same numbers: `random` drives the train/test shuffles, and NumPy's global
# RNG is what scikit-learn estimators fall back on when no random_state is set.
random.seed(365)
np.random.seed(365)

In [3]:
# Globals
PCT_TRAIN = 0.6  # fraction of rows train_test_eval assigns to the training split
N_ITERS = 10  # number of shuffle / fit / evaluate repetitions train_test_eval runs

In [4]:
# Function to repeatedly fit and evaluate a classification tree on random train/test splits

def train_test_eval(features, labels):
    """Fit a decision tree on random train/test splits and print each error rate.

    The row positions are shuffled ``N_ITERS`` times. On each pass the first
    ``round(PCT_TRAIN * n_rows)`` shuffled positions form the training set and
    the rest form the test set. A fresh ``DecisionTreeClassifier`` is fitted on
    the training rows and its misclassification rate (MCR) on the test rows is
    printed.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per observation, one column per predictor.
    labels : pandas.Series or array-like
        Class labels, aligned row-for-row (by position) with ``features``.

    Returns
    -------
    None
        Results are reported through ``print`` only.

    Raises
    ------
    ValueError
        If the split would leave the training set or the test set empty.
    """
    n_rows = features.shape[0]
    n_rows_training = round(PCT_TRAIN * n_rows)
    n_rows_test = n_rows - n_rows_training
    if n_rows_training < 1 or n_rows_test < 1:
        raise ValueError(
            "Cannot split {} rows into non-empty training and test sets "
            "with PCT_TRAIN = {}".format(n_rows, PCT_TRAIN)
        )

    # The shuffled values below are row *positions*, so every selection must be
    # positional. Converting the labels to a plain array (and using .iloc on the
    # features) keeps that true even when the pandas index is not 0..n-1.
    label_values = np.asarray(labels)
    random_index = list(range(n_rows))

    # Fit the tree several times
    for i in range(N_ITERS):

        random.shuffle(random_index)

        # Get training data
        train_index = random_index[:n_rows_training]
        train_features = features.iloc[train_index]
        train_labels = label_values[train_index]

        # Get test data
        test_index = random_index[n_rows_training:]
        test_features = features.iloc[test_index]
        test_labels = label_values[test_index]

        # Create a tree using the training data. A fixed per-trial seed makes
        # the tree's internal random choices repeatable without consuming any
        # draws from `random`, so the shuffles above are unaffected.
        tree = DecisionTreeClassifier(random_state=i)
        tree.fit(train_features, train_labels)

        # Predict on the test data
        predicted_labels = tree.predict(test_features)

        # Calculate how well it performed
        n_wrong = int((test_labels != predicted_labels).sum())
        mcr = n_wrong / n_rows_test

        # Print out the results
        print("Trial #{}: Misclassified {} of {} points | MCR = {}".format(i + 1, n_wrong, n_rows_test, mcr))

    return

## `baseline.csv` data

In [5]:
# Data cleaning

# Load baseline.csv, discard the two columns flagged as having too many NAs,
# then discard every row that still contains a missing value and renumber
# the surviving rows from zero.
baseline = (
    pd.read_csv("data/baseline.csv")
    .drop(columns=["rec_attended_school", "dist_food_market"])
    .dropna(axis="index", how="any")
    .reset_index()
    .drop(columns=["index"])
)

# Recode the two text answers as "1" / "0" wherever they occur in the frame,
# then store imp_dich as a 32-bit integer.
baseline = (
    baseline
    .replace(to_replace="Willing to wait at least once", value="1")
    .replace(to_replace="Never willing to wait", value="0")
    .astype({"imp_dich": "int32"})
)

### `HF2` as label

In [6]:
# Baseline data, HF2 as the label: train_test_eval prints the
# misclassification rate of a classification tree for each random split.

features = baseline.loc[:, [
    "hhsize",
    "able",
    "rec_age",
    "imp_dich",
    "future1yr",
    "asst_idx",
    "lvstk_idx",
    "prod_idx",
    "any_savings",
]]
labels = baseline.loc[:, "HF2"]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 42 of 296 points | MCR = 0.14189189189189189
Trial #2: Misclassified 48 of 296 points | MCR = 0.16216216216216217
Trial #3: Misclassified 48 of 296 points | MCR = 0.16216216216216217
Trial #4: Misclassified 49 of 296 points | MCR = 0.16554054054054054
Trial #5: Misclassified 44 of 296 points | MCR = 0.14864864864864866
Trial #6: Misclassified 53 of 296 points | MCR = 0.17905405405405406
Trial #7: Misclassified 45 of 296 points | MCR = 0.15202702702702703
Trial #8: Misclassified 53 of 296 points | MCR = 0.17905405405405406
Trial #9: Misclassified 57 of 296 points | MCR = 0.19256756756756757
Trial #10: Misclassified 41 of 296 points | MCR = 0.13851351351351351


### `HF3` as label

In [7]:
# Baseline data, HF3 as the label: same predictors as the HF2 run above,
# evaluated with train_test_eval over repeated random splits.

features = baseline.loc[:, [
    "hhsize",
    "able",
    "rec_age",
    "imp_dich",
    "future1yr",
    "asst_idx",
    "lvstk_idx",
    "prod_idx",
    "any_savings",
]]
labels = baseline.loc[:, "HF3"]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 108 of 296 points | MCR = 0.36486486486486486
Trial #2: Misclassified 122 of 296 points | MCR = 0.41216216216216217
Trial #3: Misclassified 115 of 296 points | MCR = 0.3885135135135135
Trial #4: Misclassified 107 of 296 points | MCR = 0.3614864864864865
Trial #5: Misclassified 118 of 296 points | MCR = 0.39864864864864863
Trial #6: Misclassified 122 of 296 points | MCR = 0.41216216216216217
Trial #7: Misclassified 107 of 296 points | MCR = 0.3614864864864865
Trial #8: Misclassified 105 of 296 points | MCR = 0.3547297297297297
Trial #9: Misclassified 104 of 296 points | MCR = 0.35135135135135137
Trial #10: Misclassified 111 of 296 points | MCR = 0.375


## `denver.csv` data

In [8]:
# Data cleaning

# Load denver.csv, keep only the rows with no missing values, and renumber
# the surviving rows from zero.
denver = (
    pd.read_csv("data/denver.csv")
    .dropna(axis="index", how="any")
    .reset_index()
    .drop(columns=["index"])
)

### `HF2` as label

In [9]:
# Denver data, HF2 as the label: train_test_eval prints the
# misclassification rate of a classification tree for each random split.

features = denver.loc[:, [
    "asst_idx_z",
    "lvstk_idx_z",
    "prod_idx_z",
    "hrv_value_z",
    "LGhrv_value_z",
    "exp_total_ap_z",
    "LGnfe_rev_z",
    "nfe_z",
    "inc_rev_idxSD_z",
    "fin_debt_idxSD_z",
]]
labels = denver.loc[:, "HF2"]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 48 of 304 points | MCR = 0.15789473684210525
Trial #2: Misclassified 49 of 304 points | MCR = 0.1611842105263158
Trial #3: Misclassified 44 of 304 points | MCR = 0.14473684210526316
Trial #4: Misclassified 51 of 304 points | MCR = 0.16776315789473684
Trial #5: Misclassified 60 of 304 points | MCR = 0.19736842105263158
Trial #6: Misclassified 43 of 304 points | MCR = 0.14144736842105263
Trial #7: Misclassified 53 of 304 points | MCR = 0.17434210526315788
Trial #8: Misclassified 39 of 304 points | MCR = 0.12828947368421054
Trial #9: Misclassified 46 of 304 points | MCR = 0.1513157894736842
Trial #10: Misclassified 50 of 304 points | MCR = 0.16447368421052633


### `HF3` as label

In [10]:
# Denver data, HF3 as the label: same predictors as the HF2 run above,
# evaluated with train_test_eval over repeated random splits.

features = denver.loc[:, [
    "asst_idx_z",
    "lvstk_idx_z",
    "prod_idx_z",
    "hrv_value_z",
    "LGhrv_value_z",
    "exp_total_ap_z",
    "LGnfe_rev_z",
    "nfe_z",
    "inc_rev_idxSD_z",
    "fin_debt_idxSD_z",
]]
labels = denver.loc[:, "HF3"]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 95 of 304 points | MCR = 0.3125
Trial #2: Misclassified 113 of 304 points | MCR = 0.3717105263157895
Trial #3: Misclassified 118 of 304 points | MCR = 0.3881578947368421
Trial #4: Misclassified 106 of 304 points | MCR = 0.34868421052631576
Trial #5: Misclassified 109 of 304 points | MCR = 0.35855263157894735
Trial #6: Misclassified 91 of 304 points | MCR = 0.2993421052631579
Trial #7: Misclassified 109 of 304 points | MCR = 0.35855263157894735
Trial #8: Misclassified 100 of 304 points | MCR = 0.32894736842105265
Trial #9: Misclassified 111 of 304 points | MCR = 0.3651315789473684
Trial #10: Misclassified 96 of 304 points | MCR = 0.3157894736842105
