In [1]:
import math
import csv
import hashlib
import numpy as np

In [2]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "w261_final_training"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [3]:
rawRDD = spark.read.csv("data/train.txt", header=False, sep="\t").rdd
dataRDD = rawRDD.map(lambda row: ([None if el is None else int(el) for el in row[1:14]] + list(row[14:]), int(row[0])))

In [4]:
frequent_feats = {}

with open("data/freq_category_counts.csv") as csvfile:
    for row in csv.DictReader(csvfile):
        total = int(row["total"])
        if total >= 10:
            key = "{}-{}".format(row["col_name"], row["category"])
            frequent_feats[key] = int(row["category_id"])

with open("data/num_significant_categories.csv") as csvfile:
    num_significant_categories = { row["field"]: int(row["count"]) for row in csv.DictReader(csvfile) }

In [23]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel, LabeledPoint

def to_labeled(pair):
    row, label = pair
    vector = []
    for i, val in enumerate(row):
        if i < 13:
            if val is None:
                val = -10
        else:
            if val is not None:
                key = "C{}-{}".format(i - 13, val)
                if key in frequent_feats:
                    val = frequent_feats[key]
                else:
                    val = num_significant_categories["C" + str(i - 13)]
            else:
                val = num_significant_categories["C" + str(i - 13)] + 1
        vector.append(val)
    return LabeledPoint(label, vector)

labeledRDD = dataRDD.map(to_labeled)

In [6]:
!rm -rf /media/notebooks/tree-model/

# Load and parse the data file.
categoricalFeaturesInfo = { int(feat[1:]) + 13: count + 2 for feat, count in num_significant_categories.items() }
trainingData, validationData = labeledRDD.randomSplit([0.9, 0.1])


model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo=categoricalFeaturesInfo,
                                             maxBins=52,
                                             numIterations=5)

# Evaluate model on test instances and compute test error
predictions = model.predict(validationData.map(lambda x: x.features))
labelsAndPredictions = validationData.map(lambda lp: lp.label).zip(predictions)
#testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(validationData.count())
labels = validationData.map(lambda lp: lp.label).collect()
preds = predictions.collect()
accuracy = np.mean(np.array(labels) == np.array(preds))
print('accuracy = ' + str(accuracy))

# Save and load model
model.save(sc, "tree-model")

accuracy = 0.7555379325575529


In [7]:
sameModel = GradientBoostedTreesModel.load(sc, "tree-model")

In [15]:
print(sameModel.toDebugString())

TreeEnsembleModel classifier with 5 trees

  Tree 0:
    If (feature 0 <= 0.5)
     If (feature 10 <= 1.5)
      If (feature 7 <= 3.5)
       Predict: -0.5923807621533136
      Else (feature 7 > 3.5)
       Predict: -0.7367491828201482
     Else (feature 10 > 1.5)
      If (feature 5 <= 31.5)
       Predict: -0.21648363718424168
      Else (feature 5 > 31.5)
       Predict: -0.5079386942371497
    Else (feature 0 > 0.5)
     If (feature 5 <= 6.5)
      If (feature 25 in {0.0,42.0,24.0,37.0,25.0,14.0,29.0,1.0,6.0,28.0,38.0,21.0,33.0,9.0,41.0,2.0,32.0,34.0,45.0,17.0,22.0,44.0,27.0,12.0,39.0,3.0,35.0,18.0,50.0,31.0,11.0,43.0,40.0,23.0,8.0,36.0,30.0,4.0,47.0,15.0})
       Predict: -0.05881714443232319
      Else (feature 25 not in {0.0,42.0,24.0,37.0,25.0,14.0,29.0,1.0,6.0,28.0,38.0,21.0,33.0,9.0,41.0,2.0,32.0,34.0,45.0,17.0,22.0,44.0,27.0,12.0,39.0,3.0,35.0,18.0,50.0,31.0,11.0,43.0,40.0,23.0,8.0,36.0,30.0,4.0,47.0,15.0})
       Predict: 0.4012951079874257
     Else (feature 5 > 6.5)
     

In [14]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(labels, preds))
print(confusion_matrix(labels, preds))

             precision    recall  f1-score   support

        0.0       0.76      0.98      0.86   3405999
        1.0       0.67      0.09      0.16   1176397

avg / total       0.74      0.76      0.68   4582396

[[3353771   52228]
 [1067994  108403]]
