In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib import pyplot as plt
import pandas as pd
import datetime

In [None]:
def featurize(df, label):
  """Split a frame into a target series and a feature matrix.

  Args:
    df: Source DataFrame; it is copied first, so the caller's frame is not modified.
    label: Name of the column to use as the target.

  Returns:
    A ``(y, X)`` tuple: ``y`` is the ``label`` column and ``X`` is every
    other column plus a constant column ``const`` equal to 1 (an intercept
    term for models that want one).

  Note:
    Non-numeric columns are NOT removed here; pass an already-numeric frame
    if the downstream estimator requires one.
  """
  frame = df.copy()
  target = frame[label]
  # Add the intercept column first, then remove the target from the features.
  features = frame.assign(const=1).drop(columns=[label])
  return target, features

def non_numeric_dtypes(df):
  """Print and return the names of the columns whose dtype is not numeric.

  Args:
    df: DataFrame to inspect.

  Returns:
    List of unique column names that ``select_dtypes('number')`` does not
    select, in the order they appear in ``df`` (a deterministic order,
    unlike a plain set difference).
  """
  numeric_cols = set(df.select_dtypes('number').columns)
  # dict.fromkeys keeps first-seen order while dropping duplicate names.
  non_numeric = list(dict.fromkeys(col for col in df.columns if col not in numeric_cols))
  print(non_numeric)
  return non_numeric

def group_by_counts(df, labels):
  """Count the rows in each group of ``df``.

  Args:
    df: DataFrame to aggregate; it is not modified.
    labels: Column name or list of column names to group by.

  Returns:
    A new DataFrame with one row per distinct combination of ``labels``:
    the grouping columns followed by a ``count`` column holding the group size.
  """
  # size must be *called*; naming the result column here replaces the
  # reset_index() + rename({0: 'count'}) two-step.
  return df.groupby(labels).size().reset_index(name='count')

In [None]:
# Read the cleaned dataset from a CSV file in the current working directory.
df = pd.read_csv("noReplies-clean.csv")
# Last expression of the cell: displays the first rows as a quick sanity check.
df.head()

In [None]:
# Split off the label, hold out 30% of the rows for testing, then fit a
# decision tree on the rest and predict the held-out labels.
SEED = 12345  # shared by the split and the tree so a fresh run reproduces the same results

y, X = featurize(df, "IN_SCHOOL_FLAG")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# The tree permutes features when searching for splits, so it needs its own
# random_state to be deterministic; seeding only the split is not enough.
clf = DecisionTreeClassifier(random_state=SEED).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Actual vs. predicted labels side by side, indexed like y_test.
results = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})

In [None]:
# Score the predictions on the held-out rows, treating label value 1 as the positive class.
POSITIVE_CLASS = 1

# One-hot frames of the actual / predicted labels. They are no longer used for
# the metrics below; retained in case later cells reference them.
y_test_dummies = pd.get_dummies(results['Actual'])
y_pred_dummies = pd.get_dummies(results['Predicted'])

# Boolean positive-class indicators. Unlike indexing the one-hot frames with [1],
# these exist even when class 1 never appears among the predictions (or the
# test labels), which previously raised a KeyError.
actual_positive = results['Actual'] == POSITIVE_CLASS
predicted_positive = results['Predicted'] == POSITIVE_CLASS

# Accuracy  = (true positives + true negatives) / (total cases); ranges from 0 (worst) to 1 (best)
print(f"Accuracy:\t{metrics.accuracy_score(y_test, y_pred)}")

# The scorers below use the default average='binary', where a `labels` argument
# is ignored, so none is passed.

# Precision = (true positives / (true positives + false positives))
print(f"Precision:\t{metrics.precision_score(actual_positive, predicted_positive)}")

# Recall    = (true positives / (true positives + false negatives))
print(f"Recall:\t\t{metrics.recall_score(actual_positive, predicted_positive)}")

# F1        = (2 * (precision * recall) / (precision + recall))
print(f"F1:\t\t{metrics.f1_score(actual_positive, predicted_positive)}")