In [11]:
import os
import pyspark.mllib
import numpy as np
import matplotlib.pyplot
import seaborn as sns
import pyspark

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from IPython.display import display, HTML

get_ipython().magic(u'pylab inline')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
### Initialize streaming context
conf = SparkConf()\
                .setMaster("local[2]")\
                .setAppName("AutoRegression")\
                .set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("spark play").getOrCreate()

In [3]:
path = "file:////Users/swaite/Stirling/CSIE-63/assignment-7/data/auto_mpg_original-1.csv"
df = sc.textFile(path, use_unicode=False) \
                    .map(lambda x: x.split(","))\
                    .filter(lambda x: x[0] > 0.0)\
                    .filter(lambda x: x[3] > 0.0) \
                    
print(df.take(5))

[['18', '8', '307', '130', '3504', '12', '70', '1', 'chevrolet'], ['15', '8', '350', '165', '3693', '11.5', '70', '1', 'buick'], ['18', '8', '318', '150', '3436', '11', '70', '1', 'plymouth'], ['16', '8', '304', '150', '3433', '12', '70', '1', 'amc'], ['17', '8', '302', '140', '3449', '10.5', '70', '1', 'ford']]


In [24]:
#     1. mpg:           continuous
#     2. cylinders:     multi-valued discrete
#     3. displacement:  continuous
#     4. horsepower:    continuous
#     5. weight:        continuous
#     6. acceleration:  continuous
#     7. model year:    multi-valued discrete
#     8. origin:        multi-valued discrete
#     9. car name:      string (unique for each instance)

## Move RDD Order to:

#     1. mpg:           continuous
#     2. displacement:  continuous
#     3. horsepower:    continuous
#     4. weight:        continuous
#     5. acceleration:  continuous
#     6. model year:    multi-valued discrete
#     7. origin:        multi-valued discrete
#     8. cylinders:     multi-valued discrete
#     9. car name:      string (unique for each instance)
        
df = df.map(lambda x: [
                        x[0].replace('NA', "0.0"), 
                        x[2].replace('NA', "0.0"), 
                        x[3].replace('NA', "0.0"), 
                        x[4].replace('NA', "0.0"), 
                        x[5].replace('NA', "0.0"), 
                        x[6].replace('NA', "0.0"), 
                        x[7].replace('NA', "0.0"), 
                        x[1].replace('NA', "0.0"), 
                        x[8].replace('NA', "0.0")
                    ]) 
print(df.take(5))

[['18', '130', '3504', '12', '70', '1', '8', '307', 'chevrolet'], ['15', '165', '3693', '11.5', '70', '1', '8', '350', 'buick'], ['18', '150', '3436', '11', '70', '1', '8', '318', 'plymouth'], ['16', '150', '3433', '12', '70', '1', '8', '304', 'amc'], ['17', '140', '3449', '10.5', '70', '1', '8', '302', 'ford']]


# Create feature mappings for categorical features

In [25]:
# function to get the categorical feature mapping for a given variable column
def get_mapping(rdd, idx):
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

In [26]:
# extract all the categorical mappings
mappings = [get_mapping(df, i) for i in range(1,4)]
cat_len = sum(map(len, mappings))
num_len = len(df.first()[5:8])
total_len = num_len + cat_len
print "Feature vector length for categorical features: %d" % cat_len 
print "Feature vector length for numerical features: %d" % num_len
print "Total feature vector length: %d" % total_len

Feature vector length for categorical features: 547
Feature vector length for numerical features: 3
Total feature vector length: 550


# Create Feature Vectors

In [27]:
# function to use the feature mappings to extract binary feature vectors, and concatenate with
# the numerical feature vectors
def extract_features(record):
    cat_vec = np.zeros(cat_len)
    i = 0
    step = 0
    for field in record[1:4]:
        m = mappings[i]
        idx = m[field]
        cat_vec[idx + step] = 1
        i = i + 1
        step = step + len(m)
    
    num_vec = np.array([float(field) for field in record[5:8]])
    return np.concatenate((cat_vec, num_vec))

# function to extract the label from the last column
def extract_label(record):
    return float(record[0])

In [28]:
data = df.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))
first_point = data.first()
print "Raw data: " + str(first_point)
print "Label: " + str(first_point.label)
print "Linear Model feature vector:\n" + str(first_point.features)
print "Linear Model feature vector length: " + str(len(first_point.features))

Raw data: (18.0,[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0

In [29]:
# we need a separate set of feature vectors for the decision tree
def extract_features_dt(record):
    return np.array(map(float, record[2:8]))
    
data_dt = df.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()
print "Decision Tree feature vector: " + str(first_point_dt.features)
print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

Decision Tree feature vector: [3504.0,12.0,70.0,1.0,8.0,307.0]
Decision Tree feature vector length: 6


# Train a Regression Model on the Car Dataset

In [30]:
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(5))

Linear Model predictions: [(18.0, -1.0187537112768575e+35), (15.0, -1.1613490090535328e+35), (18.0, -1.0552325260820744e+35), (16.0, -1.0088061513748944e+35), (17.0, -1.0021725267765866e+35)]


In [31]:
# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

Decision Tree predictions: [(18.0, 18.133333333333333), (15.0, 18.133333333333333), (18.0, 18.133333333333333), (16.0, 18.133333333333333), (17.0, 18.133333333333333)]
Decision Tree depth: 5
Decision Tree number of nodes: 63


# Perfomance Metrics

In [32]:
# set up performance metrics functions 

def squared_error(actual, pred):
    return (pred - actual)**2

def abs_error(actual, pred):
    return np.abs(pred - actual)

def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2

# Decision Tree Model

In [34]:
# compute performance metrics for decision tree model
mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
mae_dt = true_vs_predicted_dt.map(lambda (t, p): abs_error(t, p)).mean()
rmsle_dt = np.sqrt(true_vs_predicted_dt.map(lambda (t, p): squared_log_error(t, p)).mean())
print "Decision Tree - Mean Squared Error: %2.4f" % mse_dt
print "Decision Tree - Mean Absolute Error: %2.4f" % mae_dt
print "Decision Tree - Root Mean Squared Log Error: %2.4f" % rmsle_dt

Decision Tree - Mean Squared Error: 7.9139
Decision Tree - Mean Absolute Error: 1.8721
Decision Tree - Root Mean Squared Log Error: 0.1912
