In [1]:
# Execute the companion notebook spark.ipynb before anything else.
# NOTE(review): presumably this is what defines the `spark` session used by
# the cells below, since nothing else in this notebook creates it. Confirm.
%run spark.ipynb

In [2]:
# Load the feature table. The first CSV row supplies the column names and the
# column types are inferred from the data rather than declared.
feature_df = spark.read.csv(
    path='./feature.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Show the number of rows read from the CSV as a quick check on the load.
feature_df.count()

1150164

In [None]:
import shutil
import os

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors


# Columns packed, in this order, into the dense feature vector.
feature_cols = [
    "surge", "hour", "weekday", "speedLaged", "hamsCountLaged",
    "m1", "m2", "m3",
    "d0", "d1", "d2",
    "w0", "w1", "w2",
    "sd", "cd", "sw", "cw",
]
# Columns carried through unchanged next to the vector and the label.
meta_cols = ['gridId', 'datetimeId', 'hourGroup', 'count', 'm1', 'm2', 'm3']
# Regression target.
label_col = 'count'

n_features = len(feature_cols)
n_meta = len(meta_cols)


def _to_feature_row(r):
    """Turn one selected Row into (features, label, *meta values).

    The select below lays columns out as feature_cols, then label_col, then
    meta_cols, so the features are the leading slice of the row and the meta
    values are the trailing slice. Some names (count, m1, m2, m3) occur more
    than once in that layout, which is why the slices are positional.
    """
    features = Vectors.dense(r[:n_features])
    return (features, r[label_col]) + r[-n_meta:]


selected = feature_df.select(feature_cols + [label_col] + meta_cols)
transformed = selected.rdd.map(_to_feature_row).toDF(['features', 'label'] + meta_cols)

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from datetime import datetime, date, timedelta
import pyspark.sql.functions as F


# TODO increase maxCategories, so weekday can be interpreted as category

# Evaluation window: test days start at test_datetime_id and stop before
# last_datetime_id (the loop below uses a strict `<` comparison).
test_datetime_id = date(2018, 5, 22)
last_datetime_id = date(2018, 6, 1)

# `data` feeds the indexer fit and every per-day filter, so keep it persisted.
data = transformed
data.persist()

# Fit the feature indexer once on the full data set; the fitted model is
# reused as the first stage of every pipeline built later.
indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=10,
)
feature_indexer = indexer.fit(data)

# Predictions are appended to this path one test day at a time, so remove any
# directory left there by a previous run before starting.
prediction_path = './per_grid_random_forest.csv'
if os.path.isdir(prediction_path):
    shutil.rmtree(prediction_path)

# print(data.agg({'datetimeId': 'min'}).collect()[0])
    
# Imported once here instead of on every iteration of the loop below.
from pyspark.ml.regression import RandomForestRegressor

# Walk-forward evaluation. For each test day, train one random forest per grid
# on that grid's preceding 21 days, predict that grid's rows for the test day,
# append the predictions to `prediction_path`, and report the day's error.
while test_datetime_id < last_datetime_id:
    # Training window: [test day - 21 days, test day).
    start_datetime_id = test_datetime_id - timedelta(days=21)
    train = (
        data
        .filter(F.col('datetimeId') >= start_datetime_id)
        .filter(F.col('datetimeId') < test_datetime_id)
    )
    test = data.filter(F.to_date('datetimeId') == test_datetime_id)

    print('datetime id: {}, train count: {}, test count: {}'.format(test_datetime_id, train.count(), test.count()))

    predictions = []
    grid_ids = test.select(F.col("gridId")).distinct().collect()
    for c, grid_id_row in enumerate(grid_ids):
        grid_id = grid_id_row.gridId
        train_part = train.filter(F.col("gridId") == grid_id)
        test_part = test.filter(F.col("gridId") == grid_id)

        if train_part.head(1):
            rf = RandomForestRegressor(numTrees=50, minInstancesPerNode=1, featuresCol='indexedFeatures')
            pipeline = Pipeline(stages=[feature_indexer, rf])
            # Fit and predict on this grid's slice only. Using the full
            # `train`/`test` frames here would make every model identical and
            # would emit the whole day's predictions once per grid.
            model = pipeline.fit(train_part)
            predictions.append(model.transform(test_part))
        else:
            # A grid seen on the test day but absent from the training window
            # cannot be fitted; skip it rather than abort the whole run.
            print("grid {} has no training rows in the window, skipped".format(grid_id))

        print("{} grids of {} processed".format(c+1, len(grid_ids)))

    if predictions:
        # Stack the per-grid prediction frames into one frame for the day.
        prediction = predictions[0]
        for p in predictions[1:]:
            prediction = prediction.union(p)

        prediction = prediction.select('gridId', 'datetimeId', 'prediction', 'label')

        prediction.write.csv(prediction_path, mode='append', header=False)

        # Per-grid error: relative absolute error |prediction - label| / label,
        # weighted by sqrt(label) and normalised by the sum of the weights.
        abs_error = F.abs(F.col('prediction') - F.col('label'))
        weight = F.sqrt(F.col('label'))
        grid_errors = prediction.groupBy('gridId').agg(
            (F.sum(abs_error / F.col('label') * weight) / F.sum(weight)).alias('err')
        )

        print("avg error: {}".format(grid_errors.agg({'err': 'avg'}).collect()[0]))
    else:
        # Nothing was predicted for this day (no test grids, or none trainable).
        print("datetime id: {}, no predictions produced".format(test_datetime_id))

    test_datetime_id += timedelta(days=1)
    


datetime id: 2018-05-22, train count: 780063, test count: 35733
1 grids of 129 processed
2 grids of 129 processed
3 grids of 129 processed
4 grids of 129 processed
5 grids of 129 processed
6 grids of 129 processed
7 grids of 129 processed
8 grids of 129 processed
9 grids of 129 processed
10 grids of 129 processed
11 grids of 129 processed
12 grids of 129 processed
13 grids of 129 processed
14 grids of 129 processed
15 grids of 129 processed
16 grids of 129 processed
17 grids of 129 processed
18 grids of 129 processed
19 grids of 129 processed
20 grids of 129 processed
21 grids of 129 processed
22 grids of 129 processed
23 grids of 129 processed
24 grids of 129 processed
25 grids of 129 processed
26 grids of 129 processed
27 grids of 129 processed
28 grids of 129 processed
29 grids of 129 processed
30 grids of 129 processed
31 grids of 129 processed
32 grids of 129 processed
33 grids of 129 processed
34 grids of 129 processed
35 grids of 129 processed
36 grids of 129 processed
37 grids 