In [1]:
import pandas as pd
import numpy as np
import sys
#import sources.endomondolib as endo
#import sources.pysparkconvenience as ps
from numpy import array
from math import sqrt
from pyspark.sql.functions import *
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import DataFrameReader
from pyspark.sql import SQLContext
from IPython.display import display, HTML
from pyspark.sql.functions import col, mean, min, max
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, GBTRegressor, RandomForestRegressor
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

from pyspark.ml.feature import VectorAssembler

# Disable warnings, enable inline Matplotlib plotting, and set the pandas plotting style
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.options.display.mpl_style = 'default'

%load_ext autotime

In [2]:
!pwd

/home/ubuntu/Regression_5th_Iteration
time: 111 ms


In [3]:
#sc = SQLContext()
#create pyspark dataframe from csv
def df_from_csv(csv_file):
    """Build a Spark DataFrame from a comma-separated text file.

    The first record of the file supplies the column names; every record
    that is not equal to that header is kept as a data row. Fields come from
    a plain ``split(',')``, so no quoting rules or type conversion are applied.

    Relies on the notebook-level ``sc`` and ``sqlContext`` objects.

    :param csv_file: path handed to ``sc.textFile``.
    :return: the result of ``sqlContext.createDataFrame(rows, header)``.
    """
    def split_fields(line):
        return line.split(',')

    records = sc.textFile(csv_file).map(split_fields)

    # first() hands back the header record itself (the original author noted
    # that take(1) did not work here).
    header = records.first()

    def is_data_row(record):
        return record != header

    rows = records.filter(is_data_row)
    return sqlContext.createDataFrame(rows, header)

#here’s the new vectorizer function:

def vectorizeData(data):
    """Reshape a DataFrame into id columns, a dense feature vector and a label.

    For each row, positions 0-3 are passed through unchanged, positions 4 up
    to (but excluding) the last are packed into ``Vectors.dense``, and the
    last position is cast to ``float``.

    :param data: object exposing ``.rdd`` whose rows support indexing and slicing.
    :return: result of ``toDF`` with columns 'route cluster',
        'performance cluster', 'userid', 'workoutid', 'features' and 'label'.
    """
    output_columns = ['route cluster', 'performance cluster', 'userid',
                      'workoutid', 'features', 'label']

    def to_record(row):
        identifiers = [row[0], row[1], row[2], row[3]]
        return identifiers + [Vectors.dense(row[4:-1]), float(row[-1])]

    return data.rdd.map(to_record).toDF(output_columns)

#load cluster csv
# Read the clustered workout sample with pandas first, then hand the pandas
# frame to Spark.
pandas_df = pd.read_csv(filepath_or_buffer='/home/ubuntu/endo_sample_6_2_1.csv')
df = sqlContext.createDataFrame(data=pandas_df)

time: 30.1 s


In [4]:
pandas_df.columns

Index([u'Unnamed: 0', u'workoutid', u'route_prediction', u'perf_prediction',
       u'userid', u'diff_altitude', u'geo_distance', u'heart_rate_avg',
       u'speed_avg', u'elapsed_time', u'user_avg_speed', u'user_avg_dist'],
      dtype='object')

time: 4.09 ms


In [5]:
df.show(5)

+----------+---------+----------------+---------------+--------+------------------+--------------+--------------+-------------+------------+-----------------+--------------+
|Unnamed: 0|workoutid|route_prediction|perf_prediction|  userid|     diff_altitude|  geo_distance|heart_rate_avg|    speed_avg|elapsed_time|   user_avg_speed| user_avg_dist|
+----------+---------+----------------+---------------+--------+------------------+--------------+--------------+-------------+------------+-----------------+--------------+
|         0|327494888|               3|              2|12406103|29.205192565900003|0.296969622374| 157.123153687|9.66162204742|      9439.0|7.462391590249999|0.209828388074|
|         1|174621145|               3|              1|  866771|  4.77955552469E-7|0.214295297861|           0.0|10.3593454361|      7998.0|    3.05411797065|0.218213216825|
|         2| 11867693|               3|              0|  866771|  4.77955552469E-7|0.193027198315|           0.0|          0.0|   

In [6]:
df.schema

StructType(List(StructField(Unnamed: 0,LongType,true),StructField(workoutid,LongType,true),StructField(route_prediction,LongType,true),StructField(perf_prediction,LongType,true),StructField(userid,LongType,true),StructField(diff_altitude,DoubleType,true),StructField(geo_distance,DoubleType,true),StructField(heart_rate_avg,DoubleType,true),StructField(speed_avg,DoubleType,true),StructField(elapsed_time,DoubleType,true),StructField(user_avg_speed,DoubleType,true),StructField(user_avg_dist,DoubleType,true)))

time: 2.23 ms


In [7]:
# Column order matters: vectorizeData keeps the first four columns as ids,
# packs the middle ones into the feature vector and uses the last as the label.
select_columns = [
    # id columns
    'route_prediction', 'perf_prediction', 'userid', 'workoutid',
    # feature columns
    'geo_distance', 'diff_altitude', 'user_avg_dist', 'user_avg_speed',
    # label column
    'elapsed_time',
]

reg_df = vectorizeData(df.select(select_columns))

time: 395 ms


In [8]:
# Collect the distinct cluster ids present in reg_df and convert the first
# field of every returned row to a plain int.
route_clusters = reg_df.select('route cluster').distinct().collect()
perf_clusters = reg_df.select('performance cluster').distinct().collect()
route_cluster_numbers = [int(row[0]) for row in route_clusters]
perf_cluster_numbers = [int(row[0]) for row in perf_clusters]

time: 4.69 s


In [9]:
# Print summary statistics (count / mean / stddev / min / max) of the label
# for every (performance cluster, route cluster) pair.
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the parenthesised print calls used by the training loop.
print("Cluster Summaries")
for n in perf_cluster_numbers:
    for i in route_cluster_numbers:
        # Rows belonging to exactly this cluster pair.
        temp_df = reg_df[(reg_df['route cluster'] == i) & (reg_df['performance cluster'] == n)]
        print('Route%d_Perf%d:' % (i, n))
        temp_df.describe('label').show()

Cluster Summaries
Route0_Perf0:
+-------+-----------------+
|summary|            label|
+-------+-----------------+
|  count|            37716|
|   mean|3555.609900307562|
| stddev|888.5164729306092|
|    min|            553.0|
|    max|           7149.0|
+-------+-----------------+

Route1_Perf0:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|             45877|
|   mean|1738.0090895219828|
| stddev| 804.1243151589644|
|    min|             500.0|
|    max|            5105.0|
+-------+------------------+

Route3_Perf0:
+-------+-----------------+
|summary|            label|
+-------+-----------------+
|  count|             4901|
|   mean|6402.024892879004|
| stddev|1726.207035668799|
|    min|           1014.0|
|    max|          12174.0|
+-------+-----------------+

Route2_Perf0:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              5985|
|   mean|3763.0765246449455|
| stddev| 118

In [10]:
# Registry of candidate regressors, keyed by a short model code.
#   'model' - an unfitted estimator reading the "features" / "label" columns.
#   'param' - source text of a ParamGridBuilder expression. The training loop
#             eval()s it while the estimator is bound to the name `temp_lr`,
#             which is why every grid refers to `temp_lr`.
model_dict = {
    'lr': {
        'model': LinearRegression(featuresCol="features", labelCol="label"),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.maxIter, [5, 10]) \
                    .addGrid(temp_lr.regParam, [0, 0.1, 1,10]) \
                    .addGrid(temp_lr.elasticNetParam, [0,0.5,1]) \
                    .build()",
    },
    'dt': {
        'model': DecisionTreeRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=1028),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.maxDepth, [3, 5]) \
                    .addGrid(temp_lr.minInfoGain, [0, 0.1, 1]) \
                    .build()",
    },
    'gbt': {
        'model': GBTRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=2056),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.maxDepth, [3, 5]) \
                    .addGrid(temp_lr.maxIter, [10,20,40]) \
                    .build()",
    },
    'rfr': {
        'model': RandomForestRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=2056),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.maxDepth, [3, 5]) \
                    .addGrid(temp_lr.numTrees, [10,20,40]) \
                    .build()",
    },
}

time: 171 ms


# Truncated param maps for speed in model_dictionary.
# NOTE(review): this rebinds model_dict and therefore replaces the fuller
# registry defined earlier in the notebook - confirm which one is meant to run.
# The grid strings are eval()'d by the training loop with the estimator bound
# to the name `temp_lr`.
model_dict = {
    'lr': {
        'model': LinearRegression(featuresCol="features", labelCol="label"),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.regParam, [0, 0.1, 0.01]) \
                    .build()",
    },
    'dt': {
        'model': DecisionTreeRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=1028),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.maxDepth, [3, 5]) \
                    .build()",
    },
    # 'gbt' is disabled in the truncated registry:
    #model_dict['gbt'] = {'model': GBTRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=2056),\
    #                    'param': "ParamGridBuilder() \
    #                    .addGrid(temp_lr.maxDepth, [3, 5]) \
    #                    .build()"}
    'rfr': {
        'model': RandomForestRegressor(featuresCol="features", labelCol="label", maxMemoryInMB=2056),
        'param': "ParamGridBuilder() \
                    .addGrid(temp_lr.maxDepth, [3, 5]) \
                    .build()",
    },
}

In [11]:
# Accumulator for per-run results; the training loop adds one entry per
# 'Route<i>_Perf<n>_<model>' key.
fit_model_dict = dict()

time: 950 µs


In [12]:
reg_df.show(5)

+-------------+-------------------+--------+---------+--------------------+------+
|route cluster|performance cluster|  userid|workoutid|            features| label|
+-------------+-------------------+--------+---------+--------------------+------+
|            3|                  2|12406103|327494888|[0.296969622374,2...|9439.0|
|            3|                  1|  866771|174621145|[0.214295297861,4...|7998.0|
|            3|                  0|  866771| 11867693|[0.193027198315,4...|5610.0|
|            3|                  0|  866771|268802906|[0.186620146036,4...|5302.0|
|            3|                  0|  866771|285918512|[0.223394304514,4...|5733.0|
+-------------+-------------------+--------+---------+--------------------+------+
only showing top 5 rows

time: 120 ms


In [13]:
reg_df.schema

StructType(List(StructField(route cluster,LongType,true),StructField(performance cluster,LongType,true),StructField(userid,LongType,true),StructField(workoutid,LongType,true),StructField(features,VectorUDT,true),StructField(label,DoubleType,true)))

time: 2.3 ms


In [14]:
# Bring the vectorised data to the driver as a pandas frame, then key it by
# workout id so predictions can later be written back by index.
results_df = reg_df.toPandas()
results_df = results_df.set_index(['workoutid'])

time: 36.9 s


In [15]:
# Add one empty prediction column per model code, in this order.
for model_code in ('lr', 'rfr', 'dt', 'gbt'):
    results_df[model_code] = None

time: 16.5 ms


In [None]:
results_df.head()

Unnamed: 0_level_0,route cluster,performance cluster,userid,features,label,lr,rfr,dtr,gbt
workoutid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
327494888,3,2,12406103,"[0.296969622374, 29.2051925659, 0.209828388074...",9439.0,,,,
174621145,3,1,866771,"[0.214295297861, 4.77955552469e-07, 0.21821321...",7998.0,,,,
11867693,3,0,866771,"[0.193027198315, 4.77955552469e-07, 0.21821321...",5610.0,,,,
268802906,3,0,866771,"[0.186620146036, 4.77955552469e-07, 0.21821321...",5302.0,,,,
285918512,3,0,866771,"[0.223394304514, 4.77955552469e-07, 0.21821321...",5733.0,,,,


time: 17.8 ms


In [None]:
# Tune, evaluate and persist one regressor per
# (model code, performance cluster, route cluster) combination.
#
# For every combination with at least 5 rows this:
#   1. builds the model's parameter grid and runs 10-fold cross-validation
#      selecting on MAE,
#   2. writes the tuned model's predictions into results_df[<model code>],
#   3. records mae / rmse / r2 and the save path in fit_model_dict,
#   4. saves the best model under basePath.
#
# Reads:  model_dict, reg_df, perf_cluster_numbers, route_cluster_numbers
# Writes: results_df, fit_model_dict, model directories under basePath

# Loop-invariant output directory for the saved models.
basePath = "/home/ubuntu/Regression_5th_Iteration/Models/"

for m in model_dict:
    for n in perf_cluster_numbers:
        for i in route_cluster_numbers:
            run_key = 'Route%d_Perf%d_%s' % (i, n, m)
            temp_df = reg_df[(reg_df['route cluster'] == i) & (reg_df['performance cluster'] == n)]

            # Too few rows to tune on: report the pair and move on.
            if temp_df.count() < 5:
                print("Cluster pair dropped: " + run_key)
                continue

            # The grid strings in model_dict refer to the estimator by the
            # name `temp_lr`, so it has to be bound under exactly that name
            # before eval() runs.
            # NOTE(review): eval() is tolerable only because the strings are
            # defined in this notebook; never feed it external text.
            temp_lr = model_dict[m]['model']
            paramGrid = eval(model_dict[m]['param'])

            evaluator = RegressionEvaluator(
                labelCol="label", predictionCol="prediction", metricName="mae")

            crossval = CrossValidator(estimator=temp_lr,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=evaluator,
                                      numFolds=10)

            # Run cross-validation and choose the best set of parameters.
            # (The earlier stand-alone fit/transform with default parameters
            # was never used afterwards and has been dropped.)
            cvModel = crossval.fit(temp_df)

            # NOTE(review): predictions and metrics below are computed on the
            # same rows the model was fitted on, i.e. they are in-sample.
            pred = cvModel.transform(temp_df)

            # Write predictions back by workout id; .loc is the label-based
            # replacement for the deprecated .ix indexer.
            pd_pred = pred.toPandas().set_index(['workoutid'])
            results_df.loc[pd_pred.index, m] = pd_pred.prediction

            mae = evaluator.evaluate(pred)
            rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
            r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})

            fit_model_dict[run_key] = {}
            fit_model_dict[run_key]['mae'] = mae
            fit_model_dict[run_key]['rmse'] = rmse
            fit_model_dict[run_key]['r2'] = r2

            print("(Route, Perf, Model): " + str((i,n,m)) +": " + str(rmse))
            print(cvModel.bestModel)

            # NOTE(review): save() is expected to fail if this directory is
            # left over from a previous run - confirm before re-running.
            model_path = basePath + run_key
            cvModel.bestModel.save(model_path)

            fit_model_dict[run_key]["path"] = model_path
 

(Route, Perf, Model): (0, 0, 'rfr'): 692.503825975
RandomForestRegressionModel (uid=rfr_40a3f763d0bb) with 10 trees
(Route, Perf, Model): (1, 0, 'rfr'): 553.446929595
RandomForestRegressionModel (uid=rfr_f64a634c26aa) with 20 trees
(Route, Perf, Model): (3, 0, 'rfr'): 1386.04953647
RandomForestRegressionModel (uid=rfr_00c91f61b84f) with 40 trees
(Route, Perf, Model): (2, 0, 'rfr'): 688.865936667
RandomForestRegressionModel (uid=rfr_5aeef92c9195) with 20 trees
(Route, Perf, Model): (4, 0, 'rfr'): 681.062080983
RandomForestRegressionModel (uid=rfr_49302d02e358) with 40 trees
(Route, Perf, Model): (0, 1, 'rfr'): 717.100507449
RandomForestRegressionModel (uid=rfr_2287752284e4) with 20 trees
(Route, Perf, Model): (1, 1, 'rfr'): 462.123878538
RandomForestRegressionModel (uid=rfr_6b1422382d6f) with 20 trees
(Route, Perf, Model): (3, 1, 'rfr'): 1197.63947839
RandomForestRegressionModel (uid=rfr_e38df1fa7cb2) with 40 trees
(Route, Perf, Model): (2, 1, 'rfr'): 1121.37401119
RandomForestRegressio

In [None]:
# Tabulate the per-run results: orient='index' turns each fit_model_dict key
# ('Route<i>_Perf<n>_<model>') into a row and each recorded entry
# (mae, rmse, r2, path) into a column.
fit_model_df = pd.DataFrame.from_dict(fit_model_dict, orient = 'index')

time: 5.04 ms


In [None]:
fit_model_df

Unnamed: 0,path,mae,r2,rmse
Route0_Perf0_dt,/home/ubuntu/Regression_5th_Iteration/Models/R...,529.494504,0.384126,697.27777
Route0_Perf0_gbt,/home/ubuntu/Regression_5th_Iteration/Models/R...,431.621336,0.556981,591.385644
Route0_Perf0_lr,/home/ubuntu/Regression_5th_Iteration/Models/R...,544.488171,0.354878,713.642488
Route0_Perf0_rfr,/home/ubuntu/Regression_5th_Iteration/Models/R...,526.11113,0.39253,692.503826
Route0_Perf1_dt,/home/ubuntu/Regression_5th_Iteration/Models/R...,545.740507,0.407843,719.008217
Route0_Perf1_gbt,/home/ubuntu/Regression_5th_Iteration/Models/R...,432.063304,0.598188,592.279257
Route0_Perf1_lr,/home/ubuntu/Regression_5th_Iteration/Models/R...,566.879338,0.372516,740.144526
Route0_Perf1_rfr,/home/ubuntu/Regression_5th_Iteration/Models/R...,546.291741,0.410981,717.100507
Route0_Perf2_dt,/home/ubuntu/Regression_5th_Iteration/Models/R...,580.078532,0.387892,775.848934
Route0_Perf2_gbt,/home/ubuntu/Regression_5th_Iteration/Models/R...,453.266263,0.595519,630.684104


time: 33.2 ms


In [None]:
results_df

Unnamed: 0_level_0,route cluster,performance cluster,userid,features,label,lr,rfr,dtr,gbt,dt
workoutid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
327494888,3,2,12406103,"[0.296969622374, 29.2051925659, 0.209828388074...",9439.0,8246.49,7593.11,,7763.07,7502.983471
174621145,3,1,866771,"[0.214295297861, 4.77955552469e-07, 0.21821321...",7998.0,6178.23,6323.22,,6478.54,6266.860880
11867693,3,0,866771,"[0.193027198315, 4.77955552469e-07, 0.21821321...",5610.0,5816.14,5766.52,,5463.8,5602.517182
268802906,3,0,866771,"[0.186620146036, 4.77955552469e-07, 0.21821321...",5302.0,5662.59,5437.87,,5353.95,5309.524038
285918512,3,0,866771,"[0.223394304514, 4.77955552469e-07, 0.21821321...",5733.0,6543.9,6438.71,,6201.45,6189.503171
454249158,3,0,7710890,"[0.198450043797, 4.77955552469e-07, 0.22426493...",6317.0,5717.7,5820.46,,6254.9,5602.517182
674628540,3,1,1058434,"[0.206375300884, 32.0196228027, 0.21711718291,...",6129.0,5952.65,6096.1,,6144.82,6266.860880
524268591,3,2,1058434,"[0.191477164626, 12.2665195465, 0.21711718291,...",4697.0,5464.78,6177.23,,5455.78,6622.211601
458967418,3,1,2226932,"[0.208758547902, 30.9316120148, 0.203843176365...",8569.0,6483.04,6445.07,,5962.7,6266.860880
255209949,3,1,2675116,"[0.237587377429, 7.76073455811, 0.229243882, 1...",5244.0,6089.9,6523.25,,6597.71,6281.719246


time: 70.5 ms


In [None]:
!pwd

/home/ubuntu/Regression_5th_Iteration
time: 125 ms


In [None]:
# Write the metric table and the per-workout predictions as CSV files,
# using paths relative to the current working directory.
for frame, csv_name in ((fit_model_df, 'fit_model_df.csv'),
                        (results_df, 'results_df.csv')):
    frame.to_csv(csv_name)

time: 3.63 s
