In [67]:
from pyspark.sql import SQLContext
from handyspark import *
from pyspark.sql import functions as sf
from matplotlib import pyplot as plt
from pyspark.sql.functions import col, avg, date_format,month,hour,lag, date_sub,lit
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
import pandas as pd
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, OneHotEncoder
from datetime import datetime, timedelta, date
from pyspark.ml import Pipeline

In [68]:
base_path = "/home/darkmatter/Desktop/smart-meters-in-london/"
sqlcontext = SQLContext(sc)
household_info = sqlContext.read.csv(base_path+"informations_households.csv",header=True,inferSchema=True)
household_mini = sc.parallelize(household_info.take(1)).toDF()
# household_mini = household_info

In [69]:
def prepare_dataset(df):
    df.printSchema()
    
    # Resampling to 1 hour and extracting variable like month,hour,weekday etc.
    df = df.select("LCLID","tstp","energy(kWh/hh)",date_format("tstp","yyyy-MM-dd").alias("date"),date_format("tstp",'HH:mm').alias("start time"),date_format("tstp",'E').alias("weekDay"),month("tstp").alias("month"),hour("tstp").alias("hour"))
    df1 = (df.groupby('LCLID',"date","hour").sum("energy(kWh/hh)")).orderBy('date','hour',ascending=True)
    df1 = df1.withColumnRenamed("sum(energy(kWh/hh))","energy(kWh/h)")
    resampled_df = df1.select("LCLid","date","hour","energy(kWh/h)",month("date").alias("month"),date_format("date",'E').alias("weekDay"))
    resampled_df = resampled_df.withColumn("energy(kWh/h)", sf.round(resampled_df["energy(kWh/h)"], 3))
    resampled_df = resampled_df.withColumn("date", resampled_df["date"].cast(DateType()))
#     resampled_df.printSchema()
#     resampled_df.show()
    
    # window period = 2 for lag input for same hour 
    window = Window.partitionBy('LCLid').orderBy('date','hour')
    for diff in range(1,3):
        resampled_df = resampled_df.withColumn('{}_diff_energy_t'.format(diff),lag(resampled_df['energy(kWh/h)'], count=24*diff).over(window))
    # window period = 2 for lag input for same-1 hour 
    for diff in range(1,3):
        resampled_df = resampled_df.withColumn('{}_diff_energy_t_1'.format(diff),lag(resampled_df['energy(kWh/h)'], count=24*diff+1).over(window))    
    # window period = 2 for lag input for same-2 hour 
    for diff in range(1,3):
        resampled_df = resampled_df.withColumn('{}_diff_energy_t_2'.format(diff),lag(resampled_df['energy(kWh/h)'], count=24*diff+2).over(window))    
    
    df_resample_lag = resampled_df
#     df_resample_lag.printSchema()
#     df_resample_lag.show()
    
    # Mean of previous 2 days
    df_resample_lag = df_resample_lag.withColumn("rnk",sf.dense_rank().over(Window.partitionBy().orderBy("date")))
    df_resample_lag.show()
    for days in range(1,3):
        df_resample_lag = df_resample_lag.withColumn("mean_{}".format(days),sf.round(avg("energy(kWh/h)").over(Window.partitionBy().orderBy("rnk").rangeBetween(-days,-days))))
#     df_resample_lag.show()
    
    # Min power of previous 2 days
    for days in range(1,3):
        df_resample_lag = df_resample_lag.withColumn("min_{}".format(days),sf.min("energy(kWh/h)").over(Window.partitionBy().orderBy("rnk").rangeBetween(-days,-days)))
        df_resample_lag = df_resample_lag.withColumn("max_{}".format(days),sf.max("energy(kWh/h)").over(Window.partitionBy().orderBy("rnk").rangeBetween(-days,-days)))
    
    return df_resample_lag

### TODO : Instead of processing each LCLID process each file using window.partitionby("LCLID")

In [70]:
block_read = set([])
for row in household_mini.rdd.collect():
    house_id = row.LCLid
    file = row.file
    print(house_id,file)
    file_path = base_path + "halfhourly_dataset/"+ file+".csv"
    if file not in block_read:
        block_read.add(file)
        half_hourly_consumption_data = sqlContext.read.csv(file_path,header=True,inferSchema=True)
        half_hourly_consumption_data.dropna(how='any')
    indiv_house_data = half_hourly_consumption_data.where(col("LCLid") == house_id)
    indiv_house_data = indiv_house_data.withColumnRenamed("energy(kWh/hh)","energy")
    indiv_house_data.show()
    indiv_house_data = indiv_house_data.withColumn("energy(kWh/hh)", indiv_house_data["energy"].cast("float"))
    indiv_house_data = indiv_house_data.drop("energy")
    indiv_house_data.printSchema()
    if indiv_house_data.rdd.isEmpty():
        print("Missing Id = {} in file = {}".format(house_id,file))
        continue
    df = prepare_dataset(indiv_house_data)
    df.printSchema()
    df = df.dropna(how="any")

MAC005492 block_0
+---------+-------------------+-------+
|    LCLid|               tstp| energy|
+---------+-------------------+-------+
|MAC005492|2012-04-17 08:30:00| 0.135 |
|MAC005492|2012-04-17 09:00:00| 0.086 |
|MAC005492|2012-04-17 09:30:00| 0.063 |
|MAC005492|2012-04-17 10:00:00| 0.125 |
|MAC005492|2012-04-17 10:30:00| 0.272 |
|MAC005492|2012-04-17 11:00:00| 0.063 |
|MAC005492|2012-04-17 11:30:00| 0.045 |
|MAC005492|2012-04-17 12:00:00| 0.044 |
|MAC005492|2012-04-17 12:30:00| 0.084 |
|MAC005492|2012-04-17 13:00:00|  0.14 |
|MAC005492|2012-04-17 13:30:00| 0.221 |
|MAC005492|2012-04-17 14:00:00| 0.181 |
|MAC005492|2012-04-17 14:30:00| 0.185 |
|MAC005492|2012-04-17 15:00:00| 0.158 |
|MAC005492|2012-04-17 15:30:00| 0.111 |
|MAC005492|2012-04-17 16:00:00|  0.16 |
|MAC005492|2012-04-17 16:30:00| 0.074 |
|MAC005492|2012-04-17 17:00:00| 0.348 |
|MAC005492|2012-04-17 17:30:00| 0.298 |
|MAC005492|2012-04-17 18:00:00| 0.163 |
+---------+-------------------+-------+
only showing top 20 ro

In [71]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

df1 = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "d"),
    (5, "e")
], ["id", "category"])

stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df1)
indexed = model.transform(df1)

encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
encoded.show()

+---+--------+-------------+-------------+
| id|category|categoryIndex|  categoryVec|
+---+--------+-------------+-------------+
|  0|       a|          0.0|(4,[0],[1.0])|
|  1|       b|          2.0|(4,[2],[1.0])|
|  2|       c|          3.0|(4,[3],[1.0])|
|  3|       a|          0.0|(4,[0],[1.0])|
|  4|       d|          4.0|    (4,[],[])|
|  5|       e|          1.0|(4,[1],[1.0])|
+---+--------+-------------+-------------+



In [72]:
stringIndexer = StringIndexer(inputCol="weekDay", outputCol="Index_Week")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="Index_Week", outputCol="categoryWeek")
df_encoded = encoder.transform(indexed)
df_encoded.show()
columns = df_encoded.columns
inputcols = columns[4:]
inputcols.append(columns[2])
inputcols.remove("Index_Week")
inputcols.remove("weekDay")

vecAssembler = VectorAssembler(inputCols=inputcols, outputCol="features")
df_feature = vecAssembler.transform(df_encoded)
df_feature.show()

+---------+----------+----+-------------+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+
|    LCLid|      date|hour|energy(kWh/h)|month|weekDay|1_diff_energy_t|2_diff_energy_t|1_diff_energy_t_1|2_diff_energy_t_1|1_diff_energy_t_2|2_diff_energy_t_2|rnk|mean_1|mean_2|min_1|max_1|min_2|max_2|Index_Week| categoryWeek|
+---------+----------+----+-------------+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+
|MAC005492|2012-04-19|  10|        0.301|    4|    Thu|          0.232|          0.397|            0.188|            0.149|            0.413|            0.135|  3|   0.0|   0.0|0.098|0.555|0.108|0.646|       3.0|(6,[3],[1.0])|
|MAC005492|2012-04-19|  11|        0.124|    4|    Thu|          0.324|          0.108|     

In [73]:
# row1 = df_feature.agg({"date": "max"}).collect()[0]
print(type(row1[0]))
df_feature.count()
from datetime import datetime, timedelta

<class 'datetime.date'>


In [74]:
df_feature = df_feature.withColumnRenamed("energy(kWh/h)","label")
train_df = df_feature.where(col("date") <= ((df_feature.agg({"date": "max"}).collect()[0])[0]-timedelta(df.count()//(24*60))))
test_df = df_feature.where(col("date") > ((df_feature.agg({"date": "max"}).collect()[0])[0]-timedelta(df.count()//(24*60))))

In [75]:
test_df.show()

+---------+----------+----+-----+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+--------------------+
|    LCLid|      date|hour|label|month|weekDay|1_diff_energy_t|2_diff_energy_t|1_diff_energy_t_1|2_diff_energy_t_1|1_diff_energy_t_2|2_diff_energy_t_2|rnk|mean_1|mean_2|min_1|max_1|min_2|max_2|Index_Week| categoryWeek|            features|
+---------+----------+----+-----+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+--------------------+
|MAC005492|2014-02-20|   0|0.357|    2|    Thu|          0.226|           0.35|            0.361|            0.246|            0.385|            0.377|546|   0.0|   0.0|0.169|0.617| 0.17|1.106|       3.0|(6,[3],[1.0])|[2.0,0.226,0.35,0...|
|MAC005492|2014-02-20|   1|0.383|    2| 

In [77]:
rf = RandomForestRegressor(maxDepth=10, numTrees=10, maxBins=128)
rfmodel = rf.fit(train_df)
pred_val = rfmodel.transform(test_df)
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName="rmse")
accuracy = evaluator.evaluate(pred_val)
pred_val.show()
# print('RMSE = : %.4f'%(accuracy))

+---------+----------+----+-----+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+--------------------+-------------------+
|    LCLid|      date|hour|label|month|weekDay|1_diff_energy_t|2_diff_energy_t|1_diff_energy_t_1|2_diff_energy_t_1|1_diff_energy_t_2|2_diff_energy_t_2|rnk|mean_1|mean_2|min_1|max_1|min_2|max_2|Index_Week| categoryWeek|            features|         prediction|
+---------+----------+----+-----+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+--------------------+-------------------+
|MAC005492|2014-02-20|   0|0.357|    2|    Thu|          0.226|           0.35|            0.361|            0.246|            0.385|            0.377|546|   0.0|   0.0|0.169|0.617| 0.17|1.106|       3.0|(6,[3],[1.0])|[2