In [1]:
import pandas as pd
import numpy as np
import sys
#import sources.endomondolib as endo
#import sources.pysparkconvenience as ps
from numpy import array
from math import sqrt
from pyspark.sql.functions import *
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import DataFrameReader
from pyspark.sql import SQLContext
from IPython.display import display, HTML
from pyspark.sql.functions import col, mean, min, max
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, GBTRegressor, RandomForestRegressor
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

from pyspark.ml.feature import VectorAssembler

# Notebook display setup: silence Python warnings, render Matplotlib figures
# inline, choose the pandas plotting style, and load the autotime extension.
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# NOTE(review): the `display.mpl_style` option was deprecated and later removed
# from pandas; on a newer pandas this assignment is expected to raise -- confirm
# against the installed version.
pd.options.display.mpl_style = 'default'

# NOTE(review): autotime presumably produces the "time: ..." line recorded
# after each cell -- TODO confirm.
%load_ext autotime

In [9]:
#sc = SQLContext()
#create pyspark dataframe from csv
def df_from_csv(csv_file):
    """Load a comma-separated text file into a Spark DataFrame.

    Every line is split on ',' (a plain string split, no quote handling).
    The first line supplies the column names, and every row equal to that
    first line is dropped from the data.

    Relies on the `sc` and `sqlContext` objects already being present in the
    kernel; neither is created in the visible cells of this notebook.

    csv_file: path handed to `sc.textFile`.
    Returns the DataFrame built by `sqlContext.createDataFrame`.
    """
    def split_fields(line):
        return line.split(',')

    rows = sc.textFile(csv_file).map(split_fields)

    # first() yields the header row itself, whereas take(1) would wrap it in a
    # one-element list -- hence first() is used to obtain the column names.
    header = rows.first()

    def is_not_header(fields):
        return fields != header

    return sqlContext.createDataFrame(rows.filter(is_not_header), header)

# Load 'dave2.csv' into a Spark DataFrame using df_from_csv; the schema output
# recorded further down shows the columns arriving as strings.
df = df_from_csv('dave2.csv')

#here’s the new vectorizer function:

def vectorizeData(data):
    """Reshape a DataFrame into (cluster, userid, workoutid, features, label).

    Columns are read by position: 0-2 are passed through unchanged, 3-6 are
    packed into a dense feature vector, and column 7 is converted to float
    and used as the label.

    data: DataFrame with at least eight columns in the order described above.
    Returns a new DataFrame with the five named columns.
    """
    output_columns = ['cluster', 'userid', 'workoutid', 'features', 'label']

    def to_record(r):
        features = Vectors.dense(r[3:7])
        label = float(r[7])
        return [r[0], r[1], r[2], features, label]

    return data.rdd.map(to_record).toDF(output_columns)

time: 605 ms


In [10]:
# Column order matters here: vectorizeData reads these columns by position
# (0 = cluster, 1 = userid, 2 = workoutid, 3-6 = features, 7 = label).
select_columns = [
    'prediction',
    'userid',
    'workoutid',
    'sum_geo_distance',
    'diff_altitude',
    'new_avg_speed',
    'new_avg_dist',
    'max_elapsed_time',
]
df = df.select(*select_columns)

time: 16.3 ms


In [11]:
# Inspect the schema after column selection; in the recorded output every
# column is still StringType.
df.schema

StructType(List(StructField(prediction,StringType,true),StructField(userid,StringType,true),StructField(workoutid,StringType,true),StructField(sum_geo_distance,StringType,true),StructField(diff_altitude,StringType,true),StructField(new_avg_speed,StringType,true),StructField(new_avg_dist,StringType,true),StructField(max_elapsed_time,StringType,true)))

time: 2.58 ms


In [12]:
# Build the regression input: pass-through id columns, a feature vector and a
# float label.
reg_df = vectorizeData(df)

# Pull the distinct cluster labels back to the driver and convert each one to
# an int so the clusters can be iterated over one at a time.
distinct_clusters = reg_df.select('cluster').distinct().collect()
cluster_numbers = [int(cluster_row[0]) for cluster_row in distinct_clusters]



time: 2.52 s


In [13]:
# Inspect the vectorized schema; in the recorded output 'features' is a vector
# column, 'label' is a double, and 'cluster' is still a string.
reg_df.schema

StructType(List(StructField(cluster,StringType,true),StructField(userid,StringType,true),StructField(workoutid,StringType,true),StructField(features,VectorUDT,true),StructField(label,DoubleType,true)))

time: 2.35 ms


In [19]:
# Fit one default LinearRegression per cluster and print its coefficients and
# training-set metrics.
for i in cluster_numbers:
    # NOTE(review): the recorded schema shows 'cluster' as a string column
    # while `i` is an int, so this filter relies on Spark's implicit type
    # coercion in the comparison -- confirm this is intended.
    temp_df = reg_df.filter(reg_df['cluster'] == i)
    temp_lr = LinearRegression()
    temp_lrModel = temp_lr.fit(temp_df.select('label', 'features'))
    # The former `temp_lrModel.transform(temp_df)` call was dropped: its result
    # was never read, and the training summary below already exposes the
    # residuals on the training data.

    print("Cluster Number: " + str(i))
    # Print the coefficients and intercept for linear regression. The names
    # follow the order in which the feature vector is assembled.
    print("Coefficient Names: " + 'sum_geo_distance, diff_altitude, new_avg_speed, and new_avg_dist')
    print("Coefficients: " + str(temp_lrModel.coefficients))
    print("Intercept: " + str(temp_lrModel.intercept))

    # Summarize the model over the training set and print out some metrics
    trainingSummary = temp_lrModel.summary
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
    trainingSummary.residuals.show()
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
    print("\n\n")

Cluster Number: 7
Cofficient Names: sum_geo_distance, diff_altitude, new_avg_speed, and new_avg_dist
Coefficients: [3371.05498911,-5.33534616802,67.1954453654,-5643.85718678]
Intercept: 371.586509259
numIterations: 1
objectiveHistory: [0.0]
+-------------------+
|          residuals|
+-------------------+
| 156.01259681030615|
| 39.435247933869505|
| -868.1724709247669|
| 149.67942055607136|
| 27.949032281379687|
|-1.5072025920067063|
|  117.8963535923906|
| -159.4057651565156|
| 115.66900286070882|
|    77.997713894031|
|-47.704130043186524|
|  71.14905659891792|
|-505.05286987415616|
|  9.281628307533083|
|  416.2458516045906|
|  20.15526917955799|
| -48.20298888803427|
|  131.1805400851099|
|-410.02352665733457|
| 187.31366513570765|
+-------------------+
only showing top 20 rows

RMSE: 265.155389
r2: 0.228409



Cluster Number: 3
Cofficient Names: sum_geo_distance, diff_altitude, new_avg_speed, and new_avg_dist
Coefficients: [20831.4884412,0.532920058793,26.2181478057,20831.4884412

The results above show huge coefficients of opposite sign for sum_geo_distance and new_avg_dist.

This indicates probable collinearity between the two variables, which could be addressed by replacing new_avg_dist with the difference sum_geo_distance - new_avg_dist.

Another solution could be to divide new_avg_dist by new_avg_speed to create a new_avg_time variable; it may even be better to use this new variable in place of the other two.

In [None]:
# NOTE(review): this cell repeats the per-cluster regression of the earlier
# cell unchanged; the collinearity remedies discussed in the notes above it
# (a difference or ratio feature) are not applied here yet.
for i in cluster_numbers:
    # NOTE(review): the recorded schema shows 'cluster' as a string column
    # while `i` is an int, so this filter relies on Spark's implicit type
    # coercion in the comparison -- confirm this is intended.
    temp_df = reg_df.filter(reg_df['cluster'] == i)
    temp_lr = LinearRegression()
    temp_lrModel = temp_lr.fit(temp_df.select('label', 'features'))
    # The former `temp_lrModel.transform(temp_df)` call was dropped: its result
    # was never read, and the training summary below already exposes the
    # residuals on the training data.

    print("Cluster Number: " + str(i))
    # Print the coefficients and intercept for linear regression. The names
    # follow the order in which the feature vector is assembled.
    print("Coefficient Names: " + 'sum_geo_distance, diff_altitude, new_avg_speed, and new_avg_dist')
    print("Coefficients: " + str(temp_lrModel.coefficients))
    print("Intercept: " + str(temp_lrModel.intercept))

    # Summarize the model over the training set and print out some metrics
    trainingSummary = temp_lrModel.summary
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
    trainingSummary.residuals.show()
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
    print("\n\n")