Author: Yanyu Long  
Updated: May 16, 2021

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # ROC curve

def expand_grid(data_dict):
    """Return a data frame with one row per combination of the given values.

    Parameters
    ----------
    data_dict : dict
        Maps a column name to an iterable of candidate values for that column.

    Returns
    -------
    pandas.DataFrame
        The Cartesian product of all value lists, with one column per key.
        Rows follow ``itertools.product`` order (the last key varies fastest).
    """
    from itertools import product
    import pandas as pd

    column_names = list(data_dict.keys())
    combinations = product(*data_dict.values())
    return pd.DataFrame.from_records(combinations, columns = column_names)

def evaluate(ypred_prob, ytrue, thres = 0.5):
    """Summarise binary-classification performance at a score threshold.

    Parameters
    ----------
    ypred_prob : array-like of float
        Predicted score/probability of the positive class (label 1).
    ytrue : array-like
        True labels coded 0 (negative) / 1 (positive), in the same order and
        of the same length as ``ypred_prob``.
    thres : float, default 0.5
        A sample is predicted positive when its score is strictly greater
        than ``thres``.

    Returns
    -------
    dict
        ``Acc``  -- fraction of samples whose thresholded prediction equals
        the true label.
        ``TP``, ``FP``, ``FN``, ``TN``  -- confusion-matrix counts.
        ``BER``  -- balanced error rate, the mean of FP/(FP+TN) and
        FN/(FN+TP); ``nan`` when either class is absent from ``ytrue``.
        ``AUC``  -- area under the ROC curve.
        ``fpr``, ``tpr``  -- ROC coordinates from ``sklearn.metrics.roc_curve``.
    """
    import numpy as np
    from sklearn import metrics

    # Compare positionally on plain arrays: pandas Series would otherwise be
    # matched by index label, which breaks (or silently misaligns) when the
    # predictions and the labels do not carry the same index.
    scores = np.asarray(ypred_prob)
    labels = np.asarray(ytrue)
    ypred = scores > thres

    TP = int(np.sum(ypred & (labels == 1)))
    FP = int(np.sum(ypred & (labels == 0)))
    FN = int(np.sum(~ypred & (labels == 1)))
    TN = int(np.sum(~ypred & (labels == 0)))

    # A per-class error rate is undefined when that class has no samples.
    fp_rate = FP / (FP + TN) if (FP + TN) else float("nan")
    fn_rate = FN / (FN + TP) if (FN + TP) else float("nan")

    fpr, tpr, thresholds = metrics.roc_curve(labels, scores)
    return dict(Acc = np.mean(ypred == labels),  # accuracy = share of correct predictions
                TP = TP, FP = FP, FN = FN, TN = TN,
                BER = 0.5 * (fp_rate + fn_rate),
                AUC = metrics.auc(fpr, tpr),
                fpr = fpr, tpr = tpr
                )

One-hot encoding categorical variables [(ref)](https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd)  

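* `pd.get_dummies` — encodes one data frame at a time; columns are created only for the categories present in that frame, so the train and test sets can end up with different columns.
* `sklearn.preprocessing.OneHotEncoder` — fit on the training set and reuse the fitted encoder on the test set, so both sets get the same columns: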

```python
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
pd.DataFrame(encoder.fit_transform(df_train[["state"]]).toarray())
pd.DataFrame(encoder.transform(df_test[["state"]]).toarray())
```

In [105]:
# Columns removed from both splits before modelling.
# NOTE(review): presumably identifiers / fields not wanted as predictors -- confirm.
DROP_COLS = ["ym", "city", "county", "num_episodes"]

def _load_split(path):
    """Read one split from CSV, drop DROP_COLS and any row with a missing value."""
    return pd.read_csv(path).drop(DROP_COLS, axis = 1).dropna().reset_index(drop = True)

def _encode_features(df):
    """Return the predictors of `df`: `outcome` removed, `state` one-hot encoded."""
    return pd.get_dummies(df.drop("outcome", axis = 1), columns = ["state"], prefix = ["state"])

df_train = _load_split("data/merged_train.csv")
df_test = _load_split("data/merged_test.csv")

x_train = _encode_features(df_train)
y_train = df_train["outcome"]

# get_dummies only creates a column for each state present in the frame it is
# given, so the test split is forced onto the training columns: states missing
# from the test split become all-zero columns, states unseen in training are
# dropped, and the column order matches what the models are fitted on.
x_test = _encode_features(df_test).reindex(columns = x_train.columns, fill_value = 0)
y_test = df_test["outcome"]

x_train.shape
x_test.shape

(1254, 32)

(417, 32)

In [116]:
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with default settings, fitted and then
# scored on the training split.
clf = LogisticRegression()
clf.fit(x_train, y_train)
eval_train = evaluate(
    # predict_proba returns one column per entry of clf.classes_; column 1 is
    # used as the positive-class score (assumes the classes are [0, 1] -- TODO confirm).
    # Wrapped in a Series with a fresh 0..n-1 index, as y_train has after reset_index.
    ypred_prob = pd.Series(clf.predict_proba(x_train)[:, 1]),
    ytrue = y_train,
    thres = 0.5,
)
# plt.plot(eval_train["fpr"], eval_train["tpr"], marker='.'); plt.show()



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Seed shared by every estimator that uses randomness, so that re-running the
# notebook reproduces the same fitted models.
CLF_SEED = 0

# Candidate models to compare, each with fixed hyper-parameters.
classifiers = [
    LogisticRegression(),
    GaussianNB(),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5, random_state=CLF_SEED),
    AdaBoostClassifier(random_state=CLF_SEED),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=CLF_SEED),
    # NOTE(review): SVC only provides predict_proba when constructed with
    # probability=True; if these models are scored via predict_proba,
    # this entry needs that flag (or decision_function) -- confirm downstream use.
    SVC(gamma=2, C=1),
    MLPClassifier(alpha=1, max_iter=1000, random_state=CLF_SEED)
]

LabelEncoder()

array([16, 12, 11, ...,  1,  7,  8])