In [54]:
import os
import pyspark.mllib
import numpy as np
import matplotlib.pyplot
import seaborn as sns
import pyspark

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from IPython.display import display, HTML

get_ipython().magic(u'pylab inline')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
### Initialize streaming context
conf = SparkConf()\
                .setMaster("local[2]")\
                .setAppName("AutoRegression")\
                .set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("spark play").getOrCreate()

In [60]:
# ## Extract the variables we want to keep
# 
# _All variables:_
# 
# `instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt`
# 
# _Variables to keep:_
# 
# `season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt`

path = "file:////Users/swaite/Stirling/CSIE-63/assignment-7/data/bike-sharing-hour.csv"
bike_rdd = sc.textFile(path, use_unicode=False) \
                    .map(lambda x: x.split(","))
header = bike_rdd.first()
print "header:"
print(header)
bike_rdd_train = bike_rdd.filter(lambda x: x != header)\
                         .cache()
print("\nTraining Data column length: " + str(len(bike_rdd_train.first())))
print(bike_rdd_train.take(5))


header:
['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']

Training Data column length: 17
[['1', '2011-01-01', '1', '0', '1', '0', '0', '6', '0', '1', '0.24', '0.2879', '0.81', '0', '3', '13', '16'], ['2', '2011-01-01', '1', '0', '1', '1', '0', '6', '0', '1', '0.22', '0.2727', '0.8', '0', '8', '32', '40'], ['3', '2011-01-01', '1', '0', '1', '2', '0', '6', '0', '1', '0.22', '0.2727', '0.8', '0', '5', '27', '32'], ['4', '2011-01-01', '1', '0', '1', '3', '0', '6', '0', '1', '0.24', '0.2879', '0.75', '0', '3', '10', '13'], ['5', '2011-01-01', '1', '0', '1', '4', '0', '6', '0', '1', '0.24', '0.2879', '0.75', '0', '0', '1', '1']]


# Create feature mappings for categorical features

In [29]:
# function to get the categorical feature mapping for a given variable column
def get_mapping(rdd, idx):
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

In [33]:
# we want to extract the feature mappings for columns 2 - 9
# try it out on column 2 first
print "Mapping of second categorical feature column: %s" % get_mapping(bike_rdd_train, 2)

Mapping of second categorical feature column: {'11': 0, '10': 6, '12': 7, '1': 1, '3': 2, '2': 8, '5': 3, '4': 9, '7': 4, '6': 10, '9': 5, '8': 11}


In [128]:
# extract all the categorical mappings
mappings = [get_mapping(bike_rdd_train, i) for i in range(2,10)]
print(mappings)
cat_len = sum(map(len, mappings))
num_len = len(bike_rdd_train.first()[11:15])
total_len = num_len + cat_len
print "Feature vector length for categorical features: %d" % cat_len 
print "Feature vector length for numerical features: %d" % num_len
print "Total feature vector length: %d" % total_len

[{'1': 0, '3': 1, '2': 2, '4': 3}, {'1': 0, '0': 1}, {'11': 0, '10': 6, '12': 7, '1': 1, '3': 2, '2': 8, '5': 3, '4': 9, '7': 4, '6': 10, '9': 5, '8': 11}, {'20': 2, '21': 14, '22': 4, '23': 15, '1': 6, '0': 18, '3': 7, '2': 19, '5': 8, '4': 20, '7': 9, '6': 21, '9': 10, '8': 22, '11': 0, '10': 12, '13': 1, '12': 13, '15': 11, '14': 23, '17': 3, '16': 17, '19': 5, '18': 16}, {'1': 0, '0': 1}, {'1': 0, '0': 3, '3': 1, '2': 4, '5': 2, '4': 5, '6': 6}, {'1': 0, '0': 1}, {'1': 0, '3': 1, '2': 2, '4': 3}]
Feature vector length for categorical features: 57
Feature vector length for numerical features: 4
Total feature vector length: 61


# Create Feature Vectors

In [78]:
# function to use the feature mappings to extract binary feature vectors, and concatenate with
# the numerical feature vectors
def extract_features(record):
    cat_vec = np.zeros(cat_len)
    i = 0
    step = 0
    for field in record[2:9]:
        m = mappings[i]
        idx = m[field]
        cat_vec[idx + step] = 1
        i = i + 1
        step = step + len(m)
    
    num_vec = np.array([float(field) for field in record[10:14]])
    return np.concatenate((cat_vec, num_vec))

# function to extract the label from the last column
def extract_label(record):
    return float(record[-1])

# Train your model using LinearRegressionSGD method. 
## Use test data (15% of all) to assess the quality of prediction for cnt variable. 

In [117]:
training_data, testing_data = bike_rdd_train.randomSplit([.85, .15], seed=1234)
training_data = training_data.cache()
testing_data = testing_data.cache()
print("Training Data Count (85%): " + str(training_data.count()))
print("Testing Data Count (15%): " + str(testing_data.count()))
print(testing_data.take(1))

Training Data Count (85%): 14803
Testing Data Count (15%): 2576
[['11', '2011-01-01', '1', '0', '1', '10', '0', '6', '0', '1', '0.38', '0.3939', '0.76', '0.2537', '12', '24', '36']]


In [118]:
data = testing_data.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))
first_point = data.first()
print "Raw data: " + str(first_point)
print "Label: " + str(first_point.label)
print "Linear Model feature vector:\n" + str(first_point.features)
print "Linear Model feature vector length: " + str(len(first_point.features))

Raw data: (36.0,[1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.38,0.3939,0.76,0.2537])
Label: 36.0
Linear Model feature vector:
[1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.38,0.3939,0.76,0.2537]
Linear Model feature vector length: 61


In [119]:
# we need a separate set of feature vectors for the decision tree
def extract_features_dt(record):
    return np.array(map(float, record[2:14]))
    
data_dt = testing_data.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()
print "Decision Tree feature vector: " + str(first_point_dt.features)
print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

Decision Tree feature vector: [1.0,0.0,1.0,10.0,0.0,6.0,0.0,1.0,0.38,0.3939,0.76,0.2537]
Decision Tree feature vector length: 12


In [121]:
print(testing_data.take(1))

[['11', '2011-01-01', '1', '0', '1', '10', '0', '6', '0', '1', '0.38', '0.3939', '0.76', '0.2537', '12', '24', '36']]


# Train a Regression Model on the Bike Sharing Dataset

In [122]:
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(5))

Linear Model predictions: [(36.0, 129.4893833950199), (39.0, 135.07384467007708), (59.0, 126.82225862471493), (76.0, 125.96404091718107), (5.0, 125.56387918963046)]


In [124]:
# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

Decision Tree predictions: [(36.0, 113.62790697674419), (39.0, 52.753246753246756), (59.0, 113.62790697674419), (76.0, 113.62790697674419), (5.0, 26.5625)]
Decision Tree depth: 5
Decision Tree number of nodes: 63


# Perfomance Metrics

In [125]:
# set up performance metrics functions 

def squared_error(actual, pred):
    return (pred - actual)**2

def abs_error(actual, pred):
    return np.abs(pred - actual)

def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2

In [127]:
# Calculate at least two performance metrics of your model.
# compute performance metrics for linear model
mse = true_vs_predicted.map(lambda (t, p): squared_error(t, p)).mean()
mae = true_vs_predicted.map(lambda (t, p): abs_error(t, p)).mean()
rmsle = np.sqrt(true_vs_predicted.map(lambda (t, p): squared_log_error(t, p)).mean())
print "Linear Model - Mean Squared Error: %2.4f" % mse
print "Linear Model - Mean Absolute Error: %2.4f" % mae
print "Linear Model - Root Mean Squared Log Error: %2.4f" % rmsle

Linear Model - Mean Squared Error: 30491.9925
Linear Model - Mean Absolute Error: 130.1671
Linear Model - Root Mean Squared Log Error: 1.4651
