# Identifying high flyers

In [34]:
# Imports
import math
import numpy as np
import pandas as pd
import random
from sklearn.tree import DecisionTreeRegressor
random.seed(365)

In [35]:
# Globals
# Fraction of the rows placed in the training set for each random split;
# the remaining rows are held out as the test set.
PCT_TRAIN = 0.6
# Number of random train/test splits evaluated per call to train_test_eval.
N_ITERS = 10

In [83]:
# Function to fit a regression tree to data

def train_test_eval(features, labels):
    """Repeatedly fit a decision tree on a random split and print its test error.

    For each of ``N_ITERS`` trials the row positions are reshuffled, the first
    ``round(PCT_TRAIN * n_rows)`` shuffled rows become the training set and the
    rest become the test set. A ``DecisionTreeRegressor`` is fit on the
    training rows, and the number and share of test rows whose prediction is
    not exactly equal to the label (the misclassification rate, MCR) is printed.

    Rows are always selected by position, so the result does not depend on
    what index ``features`` or ``labels`` carry.

    Splits are drawn with the standard-library ``random`` module; call
    ``random.seed(...)`` beforehand for repeatable splits.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per observation, one column per predictor.
    labels : pandas.Series or array-like
        Target values, aligned row-for-row (by position) with ``features``.

    Returns
    -------
    None
        Results are printed, one line per trial.

    Raises
    ------
    ValueError
        If ``features`` and ``labels`` differ in length, or if the split
        implied by ``PCT_TRAIN`` leaves the training or test set empty.
    """
    n_rows = features.shape[0]

    # Work on a plain array so that labels are indexed by position, exactly
    # like ``features.iloc``; indexing a Series with a list of ints is
    # label-based and only coincides with position for a 0..n-1 index.
    label_values = np.asarray(labels)
    if len(label_values) != n_rows:
        raise ValueError(
            "features has {} rows but labels has {} entries".format(
                n_rows, len(label_values)))

    n_rows_training = round(PCT_TRAIN * n_rows)
    n_rows_test = n_rows - n_rows_training
    if n_rows_training < 1 or n_rows_test < 1:
        raise ValueError(
            "PCT_TRAIN = {} splits {} rows into {} training and {} test rows; "
            "both sets must be non-empty".format(
                PCT_TRAIN, n_rows, n_rows_training, n_rows_test))

    random_index = list(range(n_rows))

    # Fit the tree several times, each time on a fresh random split
    for i in range(N_ITERS):

        random.shuffle(random_index)
        train_index = random_index[:n_rows_training]
        test_index = random_index[n_rows_training:]

        # Positional selection for both the training and the test rows
        train_features = features.iloc[train_index]
        train_labels = label_values[train_index]
        test_features = features.iloc[test_index]
        test_labels = label_values[test_index]

        # Create a tree using the training data
        # NOTE(review): this is a regression tree scored with an exact-match
        # error rate; a leaf that averages differing labels yields a
        # fractional prediction that always counts as wrong. Confirm whether
        # a classification tree was intended.
        # NOTE(review): no random_state is passed, so any tie-breaking inside
        # the tree is not controlled by random.seed() - confirm if exact
        # reproducibility matters.
        tree = DecisionTreeRegressor()
        tree.fit(train_features, train_labels)

        # Predict on the test data
        predicted_labels = tree.predict(test_features)

        # Calculate how well it performed
        n_wrong = int(np.sum(test_labels != predicted_labels))
        mcr = n_wrong / n_rows_test

        # Print out the results
        print("Trial #{}: Misclassified {} of {} points | MCR = {}".format(i + 1, n_wrong, n_rows_test, mcr))

## `baseline.csv` data

In [84]:
# Data cleaning

# Load the baseline data, remove the two columns with too many NAs, discard
# every row that still contains an NA, then renumber the rows from 0.
baseline = (
    pd.read_csv("data/baseline.csv")
    .drop(axis="columns", labels=["rec_attended_school", "dist_food_market"])
    .dropna(axis="index", how="any")
    .reset_index()
    .drop(axis="columns", labels=["index"])
)

# Recode the two response strings as "1" / "0", then store imp_dich as int32
baseline = (
    baseline
    .replace("Willing to wait at least once", "1")
    .replace("Never willing to wait", "0")
    .astype({"imp_dich": "int32"})
)

### `HF2` as label

In [85]:
# Evaluate a decision tree on the baseline data, with HF2 as the label

labels = baseline.loc[:, "HF2"]

features = baseline.loc[:, [
    "hhsize",
    "able",
    "rec_age",
    "imp_dich",
    "future1yr",
    "asst_idx",
    "lvstk_idx",
    "prod_idx",
    "any_savings",
]]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 43 of 296 points | MCR = 0.14527027027027026
Trial #2: Misclassified 46 of 296 points | MCR = 0.1554054054054054
Trial #3: Misclassified 52 of 296 points | MCR = 0.17567567567567569
Trial #4: Misclassified 45 of 296 points | MCR = 0.15202702702702703
Trial #5: Misclassified 50 of 296 points | MCR = 0.16891891891891891
Trial #6: Misclassified 39 of 296 points | MCR = 0.13175675675675674
Trial #7: Misclassified 59 of 296 points | MCR = 0.19932432432432431
Trial #8: Misclassified 39 of 296 points | MCR = 0.13175675675675674
Trial #9: Misclassified 48 of 296 points | MCR = 0.16216216216216217
Trial #10: Misclassified 44 of 296 points | MCR = 0.14864864864864866


### `HF3` as label

In [62]:
# Evaluate a decision tree on the baseline data, with HF3 as the label

labels = baseline.loc[:, "HF3"]

features = baseline.loc[:, [
    "hhsize",
    "able",
    "rec_age",
    "imp_dich",
    "future1yr",
    "asst_idx",
    "lvstk_idx",
    "prod_idx",
    "any_savings",
]]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 112 of 296 points | MCR = 0.3783783783783784
Trial #2: Misclassified 107 of 296 points | MCR = 0.3614864864864865
Trial #3: Misclassified 111 of 296 points | MCR = 0.375
Trial #4: Misclassified 93 of 296 points | MCR = 0.3141891891891892
Trial #5: Misclassified 114 of 296 points | MCR = 0.38513513513513514
Trial #6: Misclassified 97 of 296 points | MCR = 0.3277027027027027
Trial #7: Misclassified 109 of 296 points | MCR = 0.36824324324324326
Trial #8: Misclassified 116 of 296 points | MCR = 0.3918918918918919
Trial #9: Misclassified 114 of 296 points | MCR = 0.38513513513513514
Trial #10: Misclassified 118 of 296 points | MCR = 0.39864864864864863


## `denver.csv` data

In [80]:
# Data cleaning

# Load the denver data, discard every row containing an NA, then renumber
# the remaining rows from 0.
denver = (
    pd.read_csv("data/denver.csv")
    .dropna(axis="index", how="any")
    .reset_index()
    .drop(axis="columns", labels=["index"])
)

['asst_idx_z', 'lvstk_idx_z', 'prod_idx_z', 'hrv_value_z', 'LGhrv_value_z', 'exp_total_ap_z', 'LGnfe_rev_z', 'nfe_z', 'inc_rev_idxSD_z', 'fin_debt_idxSD_z', 'qsn', 'HF1', 'HF2', 'HF3']


### `HF2` as label

In [81]:
# Evaluate a decision tree on the denver data, with HF2 as the label

labels = denver.loc[:, "HF2"]

features = denver.loc[:, [
    "asst_idx_z",
    "lvstk_idx_z",
    "prod_idx_z",
    "hrv_value_z",
    "LGhrv_value_z",
    "exp_total_ap_z",
    "LGnfe_rev_z",
    "nfe_z",
    "inc_rev_idxSD_z",
    "fin_debt_idxSD_z",
]]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 61 of 304 points | MCR = 0.20065789473684212
Trial #2: Misclassified 56 of 304 points | MCR = 0.18421052631578946
Trial #3: Misclassified 51 of 304 points | MCR = 0.16776315789473684
Trial #4: Misclassified 49 of 304 points | MCR = 0.1611842105263158
Trial #5: Misclassified 54 of 304 points | MCR = 0.17763157894736842
Trial #6: Misclassified 53 of 304 points | MCR = 0.17434210526315788
Trial #7: Misclassified 49 of 304 points | MCR = 0.1611842105263158
Trial #8: Misclassified 67 of 304 points | MCR = 0.22039473684210525
Trial #9: Misclassified 49 of 304 points | MCR = 0.1611842105263158
Trial #10: Misclassified 53 of 304 points | MCR = 0.17434210526315788


### `HF3` as label

In [82]:
# Evaluate a decision tree on the denver data, with HF3 as the label

labels = denver.loc[:, "HF3"]

features = denver.loc[:, [
    "asst_idx_z",
    "lvstk_idx_z",
    "prod_idx_z",
    "hrv_value_z",
    "LGhrv_value_z",
    "exp_total_ap_z",
    "LGnfe_rev_z",
    "nfe_z",
    "inc_rev_idxSD_z",
    "fin_debt_idxSD_z",
]]

train_test_eval(features=features, labels=labels)

Trial #1: Misclassified 112 of 304 points | MCR = 0.3684210526315789
Trial #2: Misclassified 102 of 304 points | MCR = 0.3355263157894737
Trial #3: Misclassified 114 of 304 points | MCR = 0.375
Trial #4: Misclassified 117 of 304 points | MCR = 0.3848684210526316
Trial #5: Misclassified 97 of 304 points | MCR = 0.3190789473684211
Trial #6: Misclassified 90 of 304 points | MCR = 0.29605263157894735
Trial #7: Misclassified 107 of 304 points | MCR = 0.3519736842105263
Trial #8: Misclassified 114 of 304 points | MCR = 0.375
Trial #9: Misclassified 113 of 304 points | MCR = 0.3717105263157895
Trial #10: Misclassified 96 of 304 points | MCR = 0.3157894736842105
