In [1]:
import csv
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel, LabeledPoint, RandomForest

app_name = "w261_michelle_training"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [3]:
rawRDD = spark.read.csv("data/sample.txt", header=False, sep="\t").rdd
dataRDD = rawRDD.map(lambda row: ([None if el is None else int(el) for el in row[1:14]] + list(row[14:]), int(row[0])))

In [4]:
# load the data we saved from the EDA. This helps us engineer the features and configure the model

frequent_feats = {}

with open("data/freq_category_counts.csv") as csvfile:
    for row in csv.DictReader(csvfile):
        total = int(row["total"])
        if total >= 10:
            key = "{}-{}".format(row["col_name"], row["category"])
            frequent_feats[key] = int(row["category_id"])

with open("data/num_significant_categories.csv") as csvfile:
    num_significant_categories = { row["field"]: int(row["count"]) for row in csv.DictReader(csvfile) }

## Ensembles of Trees

The numerical columns are pretty much used directly. Note that NULLs are encoded with the value `-10` (also tried 0 and imputing with the medians). Categorical values are kept if they are in the common categories from the EDA. They were assigned an integer ID, which is in the `frequent_feats` dict. This is used to encode each value. All rare values are converted to a special ID. NULLs are converted to yet another special ID. Note that this adds 2 additional categories from the ones we picked from the EDA. 

In [5]:
def to_labeled(pair):
    """transform input data into the features"""
    row, label = pair
    # collect the converted values here
    vector = []
    
    for i, val in enumerate(row):
        # if this is an numerical column
        if i < 13:
            if val is None:
                val = -10
            vector.append(val)
    return LabeledPoint(label, vector)

def resample(pair):
    """sample the positive examples twice to increase their importance"""
    if pair.label == 1:
        return [pair, pair]
    else:
        return [pair]

labeledRDD = dataRDD.map(to_labeled)

trainingData, validationData = labeledRDD.randomSplit([0.9, 0.1])
# re-samples the positive class
#trainingData = trainingData.flatMap(resample)

labels = validationData.map(lambda lp: lp.label).collect()

In [13]:
trainingData.take(5)

[LabeledPoint(1.0, [1.0,0.0,66.0,7.0,230.0,37.0,11.0,39.0,58.0,1.0,2.0,-10.0,28.0]),
 LabeledPoint(0.0, [8.0,2.0,8.0,4.0,0.0,0.0,25.0,9.0,175.0,2.0,5.0,-10.0,0.0]),
 LabeledPoint(0.0, [0.0,0.0,1.0,1.0,4829.0,47.0,2.0,5.0,38.0,0.0,1.0,-10.0,1.0]),
 LabeledPoint(1.0, [2.0,4.0,-10.0,1.0,40.0,1.0,2.0,1.0,1.0,1.0,1.0,-10.0,1.0]),
 LabeledPoint(1.0, [-10.0,1.0,-10.0,1.0,6916.0,28.0,4.0,4.0,31.0,-10.0,2.0,-10.0,1.0])]

In [17]:
model_gbdt = GradientBoostedTrees.trainClassifier(trainingData, maxBins=13,
                                                 categoricalFeaturesInfo={}, maxDepth=6, numIterations=10)

# Evaluate model on test instances and compute validation accuracy
predictions_gbdt = model_gbdt.predict(validationData.map(lambda x: x.features))
#labelsAndPredictions = validationData.map(lambda lp: lp.label).zip(predictions)
#testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(validationData.count())

preds_gbdt = predictions_gbdt.collect()
accuracy_gbdt = np.mean(np.array(labels) == np.array(preds_gbdt))
print('accuracy = ' + str(accuracy_gbdt))

model_name = "models/gbdt-model"
# save the model
!rm -rf /media/notebooks/{model_name}
model_gbdt.save(sc, model_name)

# how to load the model again, tho not necessary in this file
#sameModel = GradientBoostedTreesModel.load(sc, model_name)
#print(sameModel.toDebugString())

accuracy = 0.7562296858071506


In [None]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn.grid_search import GridSearchCV

gb_grid_params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              #'max_features': [1.0, 0.3, 0.1] 
              }
print(gb_grid_params)

gb_gs = GradientBoostingClassifier(n_estimators = 600)

clf = grid_search.GridSearchCV(gb_gs,
                               gb_grid_params,
                               cv=2,
                               scoring='roc_auc',
                               verbose = 3, 
                               n_jobs=10);
clf.fit(train_gs_X, train_gs_Y);

In [18]:
print(classification_report(labels, preds_gbdt))
print(confusion_matrix(labels, preds_gbdt))

             precision    recall  f1-score   support

        0.0       0.78      0.95      0.85      7589
        1.0       0.55      0.19      0.28      2564

avg / total       0.72      0.76      0.71     10153

[[7196  393]
 [2082  482]]


Some config I tried so far:

|max depth|num iterations|top n categories|re-sampling ratio neg:pos|acc|1-recall|F1 score|note|
|---------|--------------|----------------|-------------------------|---|--------|--|----|
|3|5|200000|1:1|-|-|-|the model fails to train due to out-of-memory error|
|3|5|50|1:1|75%|9%|67%|basically predicts all 0's|
|6|10|50|1:1|76%|21%|71%|Model getting much bigger though. Takes about 2.5 hours to train full data on my laptop|
|6|10|50|1:2|72%|50%|73%|trades off accuracy for recall|
|6|15|50|1:2|72%|50%|73%|no improvement|
|12|10|50|1:2|70%|48%|71%|actually gets worse on all metrics|
|6|10|75|1:2|72%|50%|73%|no improvement|
|4|10|75|1:2|71%|49%|72%|slight decrease on all metrics|

In [8]:
model_rf = RandomForest.trainClassifier(trainingData,
                                        categoricalFeaturesInfo=categoricalFeaturesInfo,
                                        maxBins=maxBins,
                                        numClasses=2,
                                        maxDepth=15,
                                        numTrees=10)

# Evaluate model on test instances and compute validation accuracy
predictions_rf = model_rf.predict(validationData.map(lambda x: x.features))

preds_rf = predictions_rf.collect()
accuracy_rf = np.mean(np.array(labels) == np.array(preds_rf))
print('accuracy = ' + str(accuracy_rf))

model_name = "models/rf-model"
# save the model
!rm -rf /media/notebooks/{model_name}
model_rf.save(sc, model_name)

accuracy = 0.7517037037037037


In [9]:
print(classification_report(labels, preds_rf))
print(confusion_matrix(labels, preds_rf))

             precision    recall  f1-score   support

        0.0       0.79      0.91      0.84      7511
        1.0       0.54      0.29      0.38      2614

avg / total       0.72      0.75      0.72     10125

[[6852  659]
 [1855  759]]


Some config I tried so far:

|max depth|num trees|top n categories|re-sampling ratio neg:pos|acc|1-recall|F1 score|note|
|---------|---------|----------------|-------------------------|---|--------|--|----|
|6|10|50|1:1|75%|19%|70%|almost as good as GBDT|
|10|10|50|1:1|75%|23%|71%|benefits from a little more depth|
|10|15|50|1:1|75%|19%|70%|more trees is worse|
|15|15|50|1:1|75%|27%|72%|but more depth continues to benefit|
|15|10|50|1:1|75%|31%|73%|and dialing back # of trees is even better|
|15|5|50|1:1|75%|29%|72%|but too few trees is bad|
|30|5|50|1:1|72%|42%|71%|doubling the depth gets very good recall|
|30|15|50|1:1|74%|34%|72%|adding more trees shifts the trade-off|
|15|10|50|1:2|68%|62%|70%|resampling gets way better recall|
|15|15|50|1:2|70%|60%|71%||
|20|15|50|1:2|69%|59%|70%||
|10|10|50|1:2|70%|55%|71%||

## Start desperately throwing models at this

In [10]:
def to_encoded(pair):
    """transform input data into the features, but use one-hot encoding for the categorical variables"""
    row, label = pair
    # collect the converted values here
    vector = []
    
    for i, val in enumerate(row):
        # if this is an numerical column
        if i < 13:
            if val is None:
                val = 0
            val = np.max([0, val])
            vector.append(val)
        # if this is categorical
        else:
            feat_len = num_significant_categories["C" + str(i - 13)]
            one_hot = np.zeros(feat_len + 2)
            if val is not None:
                key = "C{}-{}".format(i - 13, val)
                # if its one of our "common" values
                if key in frequent_feats:
                    # look up its ID
                    one_hot[frequent_feats[key]] = 1
                else:
                    # give it the special value for RARE
                    one_hot[feat_len] = 1
            else:
                # give it the special value for NULL
                one_hot[feat_len + 1] = 1
            vector.extend(one_hot)
    return LabeledPoint(label, vector)

def resample(pair):
    """sample the positive examples twice to increase their importance"""
    if pair.label == 1:
        return [pair, pair]
    else:
        return [pair]

encodedRDD = dataRDD.map(to_encoded)
trainingData, validationData = encodedRDD.randomSplit([0.9, 0.1])
labels = validationData.map(lambda lp: lp.label).collect()

In [11]:
from pyspark.mllib.classification import NaiveBayes

# Train a naive Bayes model.
model_nb = NaiveBayes.train(trainingData, lambda_=1.0)
predictions_nb = model_nb.predict(validationData.map(lambda x: x.features))

preds_nb = predictions_nb.collect()
accuracy_nb = np.mean(np.array(labels) == np.array(preds_nb))
print("accuracy = " + str(accuracy_nb))

model_name = "models/nb-model"
# save the model
!rm -rf /media/notebooks/{model_name}
model_nb.save(sc, model_name)

accuracy = 0.46170527353376045


In [12]:
print(classification_report(labels, preds_nb))
print(confusion_matrix(labels, preds_nb))

             precision    recall  f1-score   support

        0.0       0.81      0.36      0.50      7572
        1.0       0.29      0.75      0.42      2573

avg / total       0.68      0.46      0.48     10145

[[2746 4826]
 [ 635 1938]]


In [13]:
from pyspark.mllib.classification import LogisticRegressionWithSGD

model_lr = LogisticRegressionWithSGD.train(trainingData)
predictions_lr = model_lr.predict(validationData.map(lambda x: x.features))

preds_lr = predictions_lr.collect()
accuracy_lr = np.mean(np.array(labels) == np.array(preds_lr))
print("accuracy = " + str(accuracy_lr))

model_name = "models/lr-model"
!rm -rf /media/notebooks/{model_name}
model_lr.save(sc, model_name)

accuracy = 0.7235091177920158


In [14]:
print(classification_report(labels, preds_lr))
print(confusion_matrix(labels, preds_lr))

             precision    recall  f1-score   support

        0.0       0.76      0.91      0.83      7572
        1.0       0.39      0.16      0.23      2573

avg / total       0.67      0.72      0.68     10145

[[6927  645]
 [2160  413]]


In [None]:
from pyspark.mllib.classification import SVMWithSGD

model_svm = SVMWithSGD.train(trainingData)
predictions_svm = model_svm.predict(validationData.map(lambda x: x.features))

preds_svm = predictions_svm.collect()
accuracy_svm = np.mean(np.array(labels) == np.array(preds_svm))
print("accuracy = " + str(accuracy_svm))

model_name = "models/svm-model"
!rm -rf /media/notebooks/{model_name}
model_svm.save(sc, model_name)

In [None]:
print(classification_report(labels, preds_svm))
print(confusion_matrix(labels, preds_svm))