In [1]:
import math
import csv
import hashlib
import numpy as np

In [2]:
# start Spark Session
from pyspark.sql import SparkSession

app_name = "w261_final_training"
master = "local[*]"

# Build the session with the app name and master configured above
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master)
    .getOrCreate()
)
# handle to the underlying SparkContext, used later for model save/load
sc = spark.sparkContext

In [3]:
def _parse_row(row):
    """Turn one raw row into a (features, label) pair.

    Column 0 is cast to int and used as the label. Columns 1-13 are cast to
    int (NULLs stay as None); the remaining columns are passed through as-is.
    """
    numeric_part = [None if cell is None else int(cell) for cell in row[1:14]]
    remaining_part = list(row[14:])
    return (numeric_part + remaining_part, int(row[0]))


# tab-separated file without a header line
rawRDD = spark.read.csv("data/train.txt", header=False, sep="\t").rdd
dataRDD = rawRDD.map(_parse_row)

In [4]:
# load the data we saved from the EDA. This helps us engineer the features and configure the model

# maps "<col_name>-<category>" -> integer category id, for categories whose
# total count is at least 10
frequent_feats = {}

with open("data/freq_category_counts.csv") as csvfile:
    for record in csv.DictReader(csvfile):
        # skip categories that were seen fewer than 10 times
        if int(record["total"]) < 10:
            continue
        lookup = "{}-{}".format(record["col_name"], record["category"])
        frequent_feats[lookup] = int(record["category_id"])

# maps field name -> number of categories kept for that field
num_significant_categories = {}

with open("data/num_significant_categories.csv") as csvfile:
    for record in csv.DictReader(csvfile):
        num_significant_categories[record["field"]] = int(record["count"])

The numerical columns are used essentially as-is, except that NULLs are encoded with the value `-10`. Categorical values are kept if they are among the common categories from the EDA; each of those was assigned an integer ID, stored in the `frequent_feats` dict, which is used to encode the value. All rare values are converted to one special ID, and NULLs are converted to yet another special ID. Note that this adds 2 categories to the ones we picked from the EDA.

In [5]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel, LabeledPoint

def to_labeled(pair):
    """Convert a (row, label) pair into a LabeledPoint of encoded features.

    The first 13 positions are treated as numerical: values pass through
    unchanged and None becomes -10. Every later position is treated as
    categorical and is replaced by an integer id:
      * the id from `frequent_feats` when the value is a known common category,
      * `num_significant_categories[field]` for any other non-null value (RARE),
      * `num_significant_categories[field] + 1` for None (NULL).
    """
    raw_values, target = pair

    def encode(position, value):
        # numerical column: only NULLs need replacing
        if position < 13:
            return -10 if value is None else value

        # categorical column
        field = "C" + str(position - 13)
        if value is None:
            # special id reserved for NULL
            return num_significant_categories[field] + 1

        lookup = "C{}-{}".format(position - 13, value)
        if lookup in frequent_feats:
            # one of our "common" values: use its assigned id
            return frequent_feats[lookup]
        # special id reserved for RARE values
        return num_significant_categories[field]

    encoded = [encode(position, value) for position, value in enumerate(raw_values)]
    return LabeledPoint(target, encoded)

labeledRDD = dataRDD.map(to_labeled)

In [6]:
# set model params
categoricalFeaturesInfo = { int(feat[1:]) + 13: count + 2 for feat, count in num_significant_categories.items() }
maxBins = max(num_significant_categories.values()) + 2
trainingData, validationData = labeledRDD.randomSplit([0.9, 0.1])


model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo=categoricalFeaturesInfo,
                                             maxBins=maxBins,
                                             maxDepth=6,
                                             numIterations=10) # how many trees

# Evaluate model on test instances and compute validation accuracy
predictions = model.predict(validationData.map(lambda x: x.features))
labelsAndPredictions = validationData.map(lambda lp: lp.label).zip(predictions)
#testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(validationData.count())
labels = validationData.map(lambda lp: lp.label).collect()
preds = predictions.collect()
accuracy = np.mean(np.array(labels) == np.array(preds))
print('accuracy = ' + str(accuracy))

# save the model
!rm -rf /media/notebooks/tree-model/
model.save(sc, "tree-model")

accuracy = 0.765596219806837


In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# per-class precision / recall / F1 for the validation predictions
report_text = classification_report(labels, preds)
print(report_text)

# confusion matrix: rows are true labels, columns are predicted labels
matrix = confusion_matrix(labels, preds)
print(matrix)

             precision    recall  f1-score   support

        0.0       0.78      0.96      0.86   3409752
        1.0       0.62      0.21      0.32   1174564

avg / total       0.74      0.77      0.72   4584316

[[3258790  150962]
 [ 923619  250945]]


OUCH! The recall for the 1 class is very poor: only 0.21. The accuracy is not good either, seeing as we would get about 74% accuracy simply by always predicting a 0.

In [7]:
# how to load the model again, tho not necessary in this file
saved_model_dir = "tree-model"
sameModel = GradientBoostedTreesModel.load(sc, saved_model_dir)

In [8]:
# dump the full text description of the reloaded tree ensemble
ensemble_description = sameModel.toDebugString()
print(ensemble_description)

TreeEnsembleModel classifier with 10 trees

  Tree 0:
    If (feature 0 <= 0.5)
     If (feature 10 <= 1.5)
      If (feature 7 <= 3.5)
       If (feature 6 <= 0.5)
        If (feature 25 in {0.0,5.0,42.0,24.0,37.0,25.0,14.0,29.0,1.0,6.0,28.0,21.0,33.0,9.0,13.0,41.0,2.0,32.0,34.0,45.0,17.0,22.0,44.0,12.0,49.0,7.0,39.0,3.0,35.0,18.0,50.0,31.0,11.0,43.0,40.0,23.0,8.0,36.0,30.0,4.0,47.0,15.0})
         If (feature 24 in {5.0,10.0,24.0,1.0,6.0,13.0,32.0,34.0,22.0,44.0,27.0,12.0,49.0,7.0,3.0,35.0,18.0,16.0,11.0,43.0,40.0,23.0,30.0,19.0,4.0})
          Predict: -0.8272294975306648
         Else (feature 24 not in {5.0,10.0,24.0,1.0,6.0,13.0,32.0,34.0,22.0,44.0,27.0,12.0,49.0,7.0,3.0,35.0,18.0,16.0,11.0,43.0,40.0,23.0,30.0,19.0,4.0})
          Predict: -0.6622148497367225
        Else (feature 25 not in {0.0,5.0,42.0,24.0,37.0,25.0,14.0,29.0,1.0,6.0,28.0,21.0,33.0,9.0,13.0,41.0,2.0,32.0,34.0,45.0,17.0,22.0,44.0,12.0,49.0,7.0,39.0,3.0,35.0,18.0,50.0,31.0,11.0,43.0,40.0,23.0,8.0,36.0,30.0,4.0,4