In [1]:
import csv
import shutil

import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel, LabeledPoint, RandomForest

# Build (or reuse) a Spark session running locally ("local[*]" = one worker thread per core)
app_name = "w261_final_training"
master = "local[*]"
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
# the RDD-based MLlib API used below needs the underlying SparkContext
sc = spark.sparkContext

In [3]:
def _parse_row(row):
    """Turn one raw record into a ``(features, label)`` pair.

    Column 0 is the label. Columns 1-13 are cast to ``int`` (missing values
    stay ``None``); every remaining column is passed through unchanged.
    """
    numeric = [None if el is None else int(el) for el in row[1:14]]
    categorical = list(row[14:])
    features = numeric + categorical
    return (features, int(row[0]))


# tab-separated file without a header line
rawRDD = spark.read.csv("data/train.txt", header=False, sep="\t").rdd
dataRDD = rawRDD.map(_parse_row)

In [4]:
# Load the lookup tables we saved from the EDA. They drive both the feature
# engineering and the model configuration below.

# A category only gets its own ID when its `total` column reaches this value;
# everything else falls through to the RARE bucket in the encoders.
MIN_CATEGORY_TOTAL = 10

# "<col_name>-<category>" -> integer category ID
frequent_feats = {}

# newline="" lets the csv module do its own newline handling, as its docs require
with open("data/freq_category_counts.csv", newline="") as csvfile:
    for row in csv.DictReader(csvfile):
        if int(row["total"]) >= MIN_CATEGORY_TOTAL:
            key = "{}-{}".format(row["col_name"], row["category"])
            frequent_feats[key] = int(row["category_id"])

# field name (e.g. "C0") -> value of its `count` column
with open("data/num_significant_categories.csv", newline="") as csvfile:
    num_significant_categories = {
        row["field"]: int(row["count"]) for row in csv.DictReader(csvfile)
    }

## Ensembles of Trees

The numerical columns are used almost as-is; the only change is that NULLs are encoded with the value `-10` (I also tried `0` and imputing with the medians). Categorical values are kept if they are among the common categories found in the EDA. Each of those was assigned an integer ID, stored in the `frequent_feats` dict, and that ID is used to encode the value. All rare values are mapped to one special ID and NULLs to another. Note that this adds 2 categories per feature on top of the ones we picked in the EDA.

In [7]:
def to_labeled(pair):
    """Convert a ``(row, label)`` pair into a ``LabeledPoint`` for the tree models.

    The first 13 entries of ``row`` are numerical and are kept as they are,
    except that ``None`` becomes ``-10``. Every later entry is categorical and
    is replaced by an integer ID:

    * its ID from ``frequent_feats`` when the value is one of the common ones,
    * ``num_significant_categories[field]`` for any other (rare) value,
    * ``num_significant_categories[field] + 1`` when the value is ``None``.
    """
    row, label = pair
    features = []

    for idx, raw in enumerate(row):
        # numerical column: only the missing values need replacing
        if idx < 13:
            features.append(-10 if raw is None else raw)
            continue

        # categorical column
        if raw is None:
            # special ID reserved for NULL
            encoded = num_significant_categories["C" + str(idx - 13)] + 1
        else:
            key = "C{}-{}".format(idx - 13, raw)
            if key in frequent_feats:
                # one of the "common" values: use the ID assigned to it
                encoded = frequent_feats[key]
            else:
                # special ID reserved for RARE
                encoded = num_significant_categories["C" + str(idx - 13)]
        features.append(encoded)

    return LabeledPoint(label, features)

def resample(pair):
    """Emit positive examples (``label == 1``) twice and all others once.

    Returns a list so the function can be used with ``flatMap``.
    """
    return [pair, pair] if pair.label == 1 else [pair]

labeledRDD = dataRDD.map(to_labeled)

# set model params
# Categorical field "C<i>" sits at vector index i + 13. Its arity is the number of
# significant categories plus the two extra IDs (RARE and NULL) added by to_labeled.
categoricalFeaturesInfo = { int(feat[1:]) + 13: count + 2 for feat, count in num_significant_categories.items() }
# large enough to hold the biggest categorical feature
maxBins = max(num_significant_categories.values()) + 2

# Fixed seed so the 90/10 split, and therefore every metric reported below,
# is the same on each run.
SPLIT_SEED = 42
trainingData, validationData = labeledRDD.randomSplit([0.9, 0.1], seed=SPLIT_SEED)
# re-samples the positive class (uncomment for the 1:2 rows of the results tables)
#trainingData = trainingData.flatMap(resample)

# ground-truth labels of the validation split, compared against each model's predictions
labels = validationData.map(lambda lp: lp.label).collect()

In [6]:
# Train a gradient-boosted tree classifier on the label-encoded features.
model_gbdt = GradientBoostedTrees.trainClassifier(
    trainingData,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
    maxBins=maxBins,
    maxDepth=6,
    numIterations=10,  # how many trees
)

# Evaluate model on the validation instances and compute validation accuracy
predictions_gbdt = model_gbdt.predict(validationData.map(lambda x: x.features))

preds_gbdt = predictions_gbdt.collect()
accuracy_gbdt = np.mean(np.array(labels) == np.array(preds_gbdt))
print('accuracy = ' + str(accuracy_gbdt))

model_name = "models/gbdt-model"
# Save the model. Any previous copy is removed first (presumably because save()
# will not overwrite an existing directory -- TODO confirm). The path removed is
# the same relative path that is saved to, not a hard-coded absolute location.
shutil.rmtree(model_name, ignore_errors=True)
model_gbdt.save(sc, model_name)

# how to load the model again, though not necessary in this file
#sameModel = GradientBoostedTreesModel.load(sc, model_name)
#print(sameModel.toDebugString())

accuracy = 0.7239152371342079


In [7]:
# per-class precision / recall / F1 for the GBDT predictions, then the confusion matrix
print(classification_report(labels, preds_gbdt), confusion_matrix(labels, preds_gbdt), sep="\n")

             precision    recall  f1-score   support

        0.0       0.83      0.80      0.81      7402
        1.0       0.46      0.50      0.48      2508

avg / total       0.73      0.72      0.73      9910

[[5918 1484]
 [1252 1256]]


Some configurations I have tried so far:

|max depth|num iterations|top n categories|re-sampling ratio neg:pos|acc|recall (class 1)|F1 score (weighted avg)|note|
|---------|--------------|----------------|-------------------------|---|--------|--|----|
|3|5|200000|1:1|-|-|-|the model fails to train due to out-of-memory error|
|3|5|50|1:1|75%|9%|67%|basically predicts all 0's|
|6|10|50|1:1|76%|21%|71%|Model getting much bigger though. Takes about 2.5 hours to train full data on my laptop|
|6|10|50|1:2|72%|50%|73%|trades off accuracy for recall|
|6|15|50|1:2|72%|50%|73%|no improvement|
|12|10|50|1:2|70%|48%|71%|actually gets worse on all metrics|
|6|10|75|1:2|72%|50%|73%|no improvement|
|4|10|75|1:2|71%|49%|72%|slight decrease on all metrics|

In [8]:
# Train a random forest classifier on the same label-encoded features.
model_rf = RandomForest.trainClassifier(
    trainingData,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
    maxBins=maxBins,
    numClasses=2,
    maxDepth=15,
    numTrees=10,
)

# Evaluate model on the validation instances and compute validation accuracy
predictions_rf = model_rf.predict(validationData.map(lambda x: x.features))

preds_rf = predictions_rf.collect()
accuracy_rf = np.mean(np.array(labels) == np.array(preds_rf))
print('accuracy = ' + str(accuracy_rf))

model_name = "models/rf-model"
# Save the model. Any previous copy is removed first (presumably because save()
# will not overwrite an existing directory -- TODO confirm). The path removed is
# the same relative path that is saved to, not a hard-coded absolute location.
shutil.rmtree(model_name, ignore_errors=True)
model_rf.save(sc, model_name)

accuracy = 0.6859737638748739


In [9]:
# per-class precision / recall / F1 for the random forest predictions, then the confusion matrix
print(classification_report(labels, preds_rf), confusion_matrix(labels, preds_rf), sep="\n")

             precision    recall  f1-score   support

        0.0       0.85      0.71      0.77      7402
        1.0       0.42      0.62      0.50      2508

avg / total       0.74      0.69      0.70      9910

[[5251 2151]
 [ 961 1547]]


Some configurations I have tried so far:

|max depth|num trees|top n categories|re-sampling ratio neg:pos|acc|recall (class 1)|F1 score (weighted avg)|note|
|---------|---------|----------------|-------------------------|---|--------|--|----|
|6|10|50|1:1|75%|19%|70%|almost as good as GBDT|
|10|10|50|1:1|75%|23%|71%|benefits from a little more depth|
|10|15|50|1:1|75%|19%|70%|more trees is worse|
|15|15|50|1:1|75%|27%|72%|but more depth continues to benefit|
|15|10|50|1:1|75%|31%|73%|and dialing back # of trees is even better|
|15|5|50|1:1|75%|29%|72%|but too few trees is bad|
|30|5|50|1:1|72%|42%|71%|doubling the depth gets very good recall|
|30|15|50|1:1|74%|34%|72%|adding more trees shifts the trade-off|
|15|10|50|1:2|68%|62%|70%|resampling gets way better recall|
|15|15|50|1:2|70%|60%|71%||
|20|15|50|1:2|69%|59%|70%||
|10|10|50|1:2|70%|55%|71%||

## Start desperately throwing models at this

In [8]:
def to_encoded(pair):
    """Convert a ``(row, label)`` pair into a ``LabeledPoint`` with one-hot categoricals.

    The first 13 entries of ``row`` are numerical: ``None`` becomes 0 and
    negative values are clamped to 0. Each later entry is categorical and is
    expanded into a block of ``num_significant_categories[field] + 2`` slots
    with exactly one slot set to 1:

    * the slot at the value's ID in ``frequent_feats`` for common values,
    * the second-to-last slot for any other (rare) value,
    * the last slot when the value is ``None``.
    """
    row, label = pair
    features = []

    for idx, raw in enumerate(row):
        # numerical column: replace NULL with 0 and floor at 0
        if idx < 13:
            filled = 0 if raw is None else raw
            features.append(np.max([0, filled]))
            continue

        # categorical column: work out which slot is hot, then set it
        n_common = num_significant_categories["C" + str(idx - 13)]
        indicator = np.zeros(n_common + 2)
        if raw is None:
            # slot reserved for NULL
            hot = n_common + 1
        else:
            key = "C{}-{}".format(idx - 13, raw)
            # common values use their own ID, everything else shares the RARE slot
            hot = frequent_feats[key] if key in frequent_feats else n_common
        indicator[hot] = 1
        features.extend(indicator)

    return LabeledPoint(label, features)

def resample(pair):
    """Duplicate positive examples (``label == 1``) so they carry twice the weight.

    Always returns a list, which makes the function suitable for ``flatMap``.
    """
    copies = 2 if pair.label == 1 else 1
    return [pair] * copies

encodedRDD = dataRDD.map(to_encoded)

# Fixed seed so the 90/10 split, and therefore every metric reported below,
# is the same on each run.
# NOTE: this rebinds trainingData / validationData / labels, which until now
# referred to the label-encoded split used by the tree models.
SPLIT_SEED = 42
trainingData, validationData = encodedRDD.randomSplit([0.9, 0.1], seed=SPLIT_SEED)
# ground-truth labels of the validation split, compared against each model's predictions
labels = validationData.map(lambda lp: lp.label).collect()

In [9]:
from pyspark.mllib.classification import NaiveBayes

# Train a naive Bayes model on the one-hot encoded features (lambda_ is the smoothing parameter).
model_nb = NaiveBayes.train(trainingData, lambda_=1.0)
predictions_nb = model_nb.predict(validationData.map(lambda x: x.features))

# validation accuracy: share of predictions that equal the collected labels
preds_nb = predictions_nb.collect()
accuracy_nb = np.mean(np.array(labels) == np.array(preds_nb))
print("accuracy = " + str(accuracy_nb))

model_name = "models/nb-model"
# Save the model. Any previous copy is removed first (presumably because save()
# will not overwrite an existing directory -- TODO confirm). The path removed is
# the same relative path that is saved to, not a hard-coded absolute location.
shutil.rmtree(model_name, ignore_errors=True)
model_nb.save(sc, model_name)

accuracy = 0.4633343310386112


In [10]:
# per-class precision / recall / F1 for the naive Bayes predictions, then the confusion matrix
print(classification_report(labels, preds_nb), confusion_matrix(labels, preds_nb), sep="\n")

             precision    recall  f1-score   support

        0.0       0.80      0.36      0.50      7432
        1.0       0.29      0.75      0.42      2591

avg / total       0.67      0.46      0.48     10023

[[2710 4722]
 [ 657 1934]]


In [12]:
from pyspark.mllib.classification import LogisticRegressionWithSGD

# Train logistic regression (SGD) with the library defaults on the one-hot encoded features.
model_lr = LogisticRegressionWithSGD.train(trainingData)
predictions_lr = model_lr.predict(validationData.map(lambda x: x.features))

# validation accuracy: share of predictions that equal the collected labels
preds_lr = predictions_lr.collect()
accuracy_lr = np.mean(np.array(labels) == np.array(preds_lr))
print("accuracy = " + str(accuracy_lr))

model_name = "models/lr-model"
# Save the model. Any previous copy is removed first (presumably because save()
# will not overwrite an existing directory -- TODO confirm). The path removed is
# the same relative path that is saved to, not a hard-coded absolute location.
shutil.rmtree(model_name, ignore_errors=True)
model_lr.save(sc, model_name)

accuracy = 0.7249326548937444


In [13]:
# per-class precision / recall / F1 for the logistic regression predictions, then the confusion matrix
print(classification_report(labels, preds_lr), confusion_matrix(labels, preds_lr), sep="\n")

             precision    recall  f1-score   support

        0.0       0.76      0.92      0.83      7432
        1.0       0.42      0.16      0.23      2591

avg / total       0.67      0.72      0.68     10023

[[6851  581]
 [2176  415]]


In [22]:
from pyspark.mllib.classification import SVMWithSGD

# Train a linear SVM (SGD) with the library defaults on the one-hot encoded features.
model_svm = SVMWithSGD.train(trainingData)
predictions_svm = model_svm.predict(validationData.map(lambda x: x.features))

# validation accuracy: share of predictions that equal the collected labels
preds_svm = predictions_svm.collect()
accuracy_svm = np.mean(np.array(labels) == np.array(preds_svm))
print("accuracy = " + str(accuracy_svm))

model_name = "models/svm-model"
# Save the model. Any previous copy is removed first (presumably because save()
# will not overwrite an existing directory -- TODO confirm). The path removed is
# the same relative path that is saved to, not a hard-coded absolute location.
shutil.rmtree(model_name, ignore_errors=True)
model_svm.save(sc, model_name)

accuracy = 0.7247331138381722


In [23]:
# per-class precision / recall / F1 for the SVM predictions, then the confusion matrix
print(classification_report(labels, preds_svm), confusion_matrix(labels, preds_svm), sep="\n")

             precision    recall  f1-score   support

        0.0       0.76      0.92      0.83      7432
        1.0       0.42      0.16      0.23      2591

avg / total       0.67      0.72      0.68     10023

[[6849  583]
 [2176  415]]
