In [54]:
import os
import pyspark.mllib
import numpy as np
import matplotlib.pyplot
import seaborn as sns
import pyspark

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from IPython.display import display, HTML

get_ipython().magic(u'pylab inline')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
### Initialize streaming context
conf = SparkConf()\
                .setMaster("local[2]")\
                .setAppName("AutoRegression")\
                .set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("spark play").getOrCreate()

In [27]:
# ## Extract the variables we want to keep
# 
# _All variables:_
# 
# `instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt`
# 
# _Variables to keep:_
# 
# `season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt`

path = "file:////Users/swaite/Stirling/CSIE-63/assignment-7/data/bike-sharing-hour.csv"
bike_rdd = sc.textFile(path, use_unicode=False) \
                    .map(lambda x: x.split(","))
header = bike_rdd.first()
print "header:"
print(header)
# x[2]-x[13], x[16]
bike_rdd_train = bike_rdd.filter(lambda x: x != header)\
                         #.map(lambda x: (x[2],x[3],x[4],x[5],x[6],x[7],x[8],x[9],x[10],x[11],x[12],x[13],x[16]))\
                         .cache()
print("\nTraining Data column length: " + str(len(bike_rdd_train.first())))
print(bike_rdd_train.take(5))


header:
['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']

Training Data column length: 13
[('1', '0', '1', '0', '0', '6', '0', '1', '0.24', '0.2879', '0.81', '0', '16'), ('1', '0', '1', '1', '0', '6', '0', '1', '0.22', '0.2727', '0.8', '0', '40'), ('1', '0', '1', '2', '0', '6', '0', '1', '0.22', '0.2727', '0.8', '0', '32'), ('1', '0', '1', '3', '0', '6', '0', '1', '0.24', '0.2879', '0.75', '0', '13'), ('1', '0', '1', '4', '0', '6', '0', '1', '0.24', '0.2879', '0.75', '0', '1')]


In [28]:
# Create feature mappings for categorical features

In [29]:
# function to get the categorical feature mapping for a given variable column
def get_mapping(rdd, idx):
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

In [33]:
# we want to extract the feature mappings for columns 2 - 9
# try it out on column 2 first
print "Mapping of second categorical feature column: %s" % get_mapping(bike_rdd_train, 2)

Mapping of second categorical feature column: {'11': 0, '10': 6, '12': 7, '1': 1, '3': 2, '2': 8, '5': 3, '4': 9, '7': 4, '6': 10, '9': 5, '8': 11}


In [41]:
# extract all the categorical mappings
mappings = [get_mapping(bike_rdd_train, i) for i in range(2,13)]
cat_len = sum(map(len, mappings))
num_len = len(bike_rdd_train.first())
total_len = num_len + cat_len
print "Feature vector length for categorical features: %d" % cat_len 
print "Feature vector length for numerical features: %d" % num_len
print "Total feature vector length: %d" % total_len

Feature vector length for categorical features: 1154
Feature vector length for numerical features: 13
Total feature vector length: 1167


In [42]:
# Create Feature Vectors

In [43]:
# function to use the feature mappings to extract binary feature vectors, and concatenate with
# the numerical feature vectors
def extract_features(record):
    cat_vec = np.zeros(cat_len)
    i = 0
    step = 0
    for field in record[2:9]:
        m = mappings[i]
        idx = m[field]
        cat_vec[idx + step] = 1
        i = i + 1
        step = step + len(m)
    
    num_vec = np.array([float(field) for field in record[10:14]])
    return np.concatenate((cat_vec, num_vec))

# function to extract the label from the last column
def extract_label(record):
    return float(record[-1])

In [45]:
data = bike_rdd_train.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))
first_point = data.first()
print "Raw data: " + str(first)
print "Label: " + str(first_point.label)
print "Linear Model feature vector:\n" + str(first_point.features)
print "Linear Model feature vector length: " + str(len(first_point.features))

Raw data: <function first at 0x10ab63de8>
Label: 16.0
Linear Model feature vector:
[0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,

In [50]:
# we need a separate set of feature vectors for the decision tree
def extract_features_dt(record):
    return np.array(map(float, record[2:14]))
    
data_dt = bike_rdd_train.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()
print "Decision Tree feature vector: " + str(first_point_dt.features)
print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

Decision Tree feature vector: [1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,16.0]
Decision Tree feature vector length: 11


In [51]:
# Train a Regression Model on the Bike Sharing Dataset

In [52]:
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(5))



Linear Model predictions: [(16.0, -1.9894012556799439e+36), (40.0, -4.9721324143362443e+36), (32.0, -3.9778862866476995e+36), (13.0, -1.6165443882305017e+36), (1.0, -1.2517602536659713e+35)]


In [55]:
# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

Decision Tree predictions: [(16.0, 17.01814882032668), (40.0, 44.286852589641434), (32.0, 30.96880733944954), (13.0, 17.01814882032668), (1.0, 5.6638457921955805)]
Decision Tree depth: 5
Decision Tree number of nodes: 63


In [56]:
# Perfomance Metrics

In [57]:
# set up performance metrics functions 

def squared_error(actual, pred):
    return (pred - actual)**2

def abs_error(actual, pred):
    return np.abs(pred - actual)

def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2

In [58]:
# compute performance metrics for linear model
mse = true_vs_predicted.map(lambda (t, p): squared_error(t, p)).mean()
mae = true_vs_predicted.map(lambda (t, p): abs_error(t, p)).mean()
rmsle = np.sqrt(true_vs_predicted.map(lambda (t, p): squared_log_error(t, p)).mean())
print "Linear Model - Mean Squared Error: %2.4f" % mse
print "Linear Model - Mean Absolute Error: %2.4f" % mae
print "Linear Model - Root Mean Squared Log Error: %2.4f" % rmsle

Linear Model - Mean Squared Error: 1062643507737328563794480371722602217184329868125552687618838621785224642560.0000
Linear Model - Mean Absolute Error: 23547538462393464589505485305406291968.0000
Linear Model - Root Mean Squared Log Error: nan
