In [1]:
import os
from datetime import datetime, timedelta, date

import pandas as pd
import pyspark
from matplotlib import pyplot as plt
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, OneHotEncoder
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SQLContext
from pyspark.sql import functions as sf
from pyspark.sql.functions import avg, broadcast, col, date_format, hour, lag, lit, month
from pyspark.sql.types import DateType
from pyspark.sql.window import Window

In [2]:
# sc.stop()
# sc = pyspark.SparkContext(master="spark://192.168.1.3:7077",appName="spark")
# sc

In [None]:
def prepare_dataset(resampled_df,window_period,week_lag):
    """Append lagged-energy and previous-day summary columns to an hourly frame.

    Columns added, in this order:

    * ``{diff}_diff_energy_t_{lag_hour}`` for ``lag_hour`` in ``0..window_period``
      and ``diff`` in ``1..2``: the energy value ``24*diff + lag_hour`` rows
      earlier within the same cluster (rows ordered by date, hour).
    * ``diff_energy_week_t_{k}`` for ``k`` in ``1..week_lag``: the energy value
      ``24*7*k`` rows earlier within the same cluster.
    * ``rnk``: dense rank of the row's date within its cluster.
    * ``mean_{d}`` and then ``min_{d}`` / ``max_{d}`` for ``d`` in
      ``1..window_period``: aggregate of the energy over all rows whose ``rnk``
      is exactly ``d`` lower than the current row's.

    The lags count rows, not hours, so they only line up with "same hour on an
    earlier day" when every cluster has one row per hour without gaps.

    Args:
        resampled_df: DataFrame with ``cluster_id``, ``date``, ``hour`` and
            ``energy(kWh/h)`` columns.
        window_period: largest extra hourly offset and number of previous
            days to summarise.
        week_lag: number of weekly lags to add.

    Returns:
        The input frame extended with the columns listed above.
    """
    energy = 'energy(kWh/h)'
    hourly_window = Window.partitionBy('cluster_id').orderBy('date','hour')

    # (column name, number of rows to look back), in the order the columns
    # must appear in the result: daily lags first, weekly lags afterwards.
    lag_specs = [
        ('{}_diff_energy_t_{}'.format(diff,lag_hour), 24*diff+lag_hour)
        for lag_hour in range(0,window_period+1)
        for diff in range(1,3)
    ]
    lag_specs += [
        ('diff_energy_week_t_{}'.format(lag_week), 24*7*lag_week)
        for lag_week in range(1,week_lag+1)
    ]

    frame = resampled_df
    for column_name, rows_back in lag_specs:
        frame = frame.withColumn(column_name,lag(frame[energy], count=rows_back).over(hourly_window))

    # Rank the distinct dates per cluster so that "d days earlier" can be
    # expressed as a range frame over the rank.
    frame = frame.withColumn("rnk",sf.dense_rank().over(Window.partitionBy('cluster_id').orderBy('date')))

    def day_before(days):
        # Frame holding only the rows whose rank is `days` below the current one.
        return Window.partitionBy("cluster_id").orderBy("rnk").rangeBetween(-days,-days)

    day_offsets = range(1,window_period+1)

    # Mean energy of each of the previous `window_period` ranked days.
    for days in day_offsets:
        frame = frame.withColumn("mean_{}".format(days),avg("energy(kWh/h)").over(day_before(days)))

    # Minimum and maximum energy of each of the previous `window_period` ranked days.
    for days in day_offsets:
        prior_day = day_before(days)
        frame = frame.withColumn("min_{}".format(days),sf.min("energy(kWh/h)").over(prior_day))
        frame = frame.withColumn("max_{}".format(days),sf.max("energy(kWh/h)").over(prior_day))

    return frame

In [None]:
def add_weather_feature(df):
    """Join hourly and daily weather observations onto ``df``.

    Reads ``weather_hourly_darksky.csv`` and ``weather_daily_darksky.csv``
    from ``base_path`` through the notebook-level ``sqlcontext``. The hourly
    table loses its ``time``, ``icon`` and ``temperature`` columns and gains
    the daily ``temperatureMax`` / ``temperatureMin`` of the same date; the
    result is inner-joined to ``df`` on (date, hour).

    The helper keys ``hour1`` and ``date1`` are dropped from the result; the
    daily key ``date2`` is not and remains in the returned frame.

    Args:
        df: DataFrame with ``date`` and ``hour`` columns.

    Returns:
        The cached, joined DataFrame.
    """
    def read_weather_csv(file_name):
        return sqlcontext.read.csv(base_path+file_name,header=True,inferSchema=True)

    hourly = read_weather_csv("weather_hourly_darksky.csv")
    daily = read_weather_csv("weather_daily_darksky.csv").select(
        date_format("temperatureMaxTime","yyyy-MM-dd").alias("date2"),
        "temperatureMax",
        "temperatureMin",
    )

    # Split the observation timestamp into the (date, hour) join keys.
    hourly = (
        hourly
        .withColumn("hour1",hour("time"))
        .withColumn("date1",date_format("time","yyyy-MM-dd").cast(DateType()))
        .drop("time","icon","temperature")
    )

    weather = hourly.join(broadcast(daily),(daily.date2 == hourly.date1))
    weather.printSchema()

    same_slot = (df.date == weather.date1) & (df.hour == weather.hour1)
    enriched = df.join(broadcast(weather),same_slot).drop("hour1","date1").cache()
    # Run one action right after cache(); presumably to materialise the cache
    # eagerly instead of on first downstream use.
    enriched.take(1)
    return enriched

def add_holiday_feature(df):
    """Add a bank-holiday flag and a weekend flag to ``df``.

    Reads ``uk_bank_holidays.csv`` from ``base_path`` through the
    notebook-level ``sqlcontext`` and left-joins it on the ``date`` column.

    Args:
        df: DataFrame with a ``date`` column (DateType) and a ``weekDay``
            column holding abbreviated day names such as ``"Sat"``.

    Returns:
        ``df`` with two extra integer columns:
        ``holiday`` (1 on a listed bank holiday, otherwise 0) and
        ``Weekday/end`` (1 on Saturday or Sunday, otherwise 0).
    """
    holiday_data = sqlcontext.read.csv(base_path+"uk_bank_holidays.csv",header=True,inferSchema=True)
    holiday_data = holiday_data.withColumn("Bank holidays",date_format(holiday_data["Bank holidays"],"yyyy-MM-dd").cast(DateType()))
    holiday_data = holiday_data.select("Bank holidays")
    holiday_data = holiday_data.withColumn("holiday",lit(1))

    # Join against the frame that was passed in, not a notebook global.
    feature_df = df.join(holiday_data,holiday_data["Bank holidays"] == df["date"],how="left")
    # Rows without a matching holiday come back as null; `holiday` is an
    # integer column, so fill it with an integer.
    feature_df = feature_df.fillna({'holiday':0})
    feature_df = feature_df.drop("Bank holidays")
    # Weekend means Saturday *or* Sunday.
    feature_df = feature_df.withColumn("Weekday/end",sf.when(col("weekDay").isin("Sat","Sun"),1).otherwise(0))
    return feature_df

def acorn_info(df,household_info):
    """Attach tariff type and ACORN group to every row of ``df``.

    Only households whose ``Acorn_grouped`` value is one of
    ``"Comfortable"``, ``"Affluent"`` or ``"Adversity"`` are kept, so rows of
    ``df`` belonging to any other household are dropped by the inner join.

    Args:
        df: DataFrame with an ``LCLid`` column.
        household_info: DataFrame with ``LCLid``, ``stdorToU`` and
            ``Acorn_grouped`` columns.

    Returns:
        ``df`` inner-joined with the filtered household attributes on
        ``LCLid``. The schema of the result is printed as a side effect.
    """
    possible_group = ["Comfortable","Affluent","Adversity"]
    Acorn_data_group = (
        household_info
        .select("LCLid","stdorToU","Acorn_grouped")
        .filter(col("Acorn_grouped").isin(possible_group))
    )
    # Joining on a list of names keeps a single LCLid column in the result.
    feature_df = df.join(Acorn_data_group,["LCLid"])
    feature_df.printSchema()
    return feature_df



    

In [3]:
# Directory holding the input CSV files. It can be overridden with the
# SMART_METERS_DATA_DIR environment variable; joining with "" guarantees the
# trailing separator that the `base_path + "file.csv"` concatenations rely on.
base_path = os.path.join(
    os.environ.get("SMART_METERS_DATA_DIR", "/home/darkmatter/Desktop/smart-meters-in-london/"),
    "",
)
# Reuse the running SparkContext (e.g. the one a pyspark shell provides) or
# start one, so this cell also works on a fresh kernel where `sc` is undefined.
sc = pyspark.SparkContext.getOrCreate()
sqlcontext = SQLContext(sc)

In [4]:
# Household metadata: one row per household with its tariff type, ACORN
# classification and the name of the block file that holds its readings.
household_info = sqlcontext.read.csv(
    base_path+"informations_households.csv",
    header=True,
    inferSchema=True,
)
# Small sample used for the rest of the notebook: the first five households.
household_mini = household_info.limit(5)

In [5]:
# Print the column names and inferred types of the sampled household table.
household_mini.printSchema()

root
 |-- LCLid: string (nullable = true)
 |-- stdorToU: string (nullable = true)
 |-- Acorn: string (nullable = true)
 |-- Acorn_grouped: string (nullable = true)
 |-- file: string (nullable = true)



In [7]:
# Load the half-hourly readings of every block file referenced by the sampled
# households and stack them into one DataFrame.
df_file = household_mini.select("file").distinct()
df_file.show()

df_full = None
for count, row in enumerate(df_file.collect()):
    file = row.file
    print(file,count)
    file_path = base_path + "halfhourly_dataset/"+ file+".csv"
    half_hourly_consumption_data = sqlcontext.read.csv(file_path,header=True,inferSchema=True)
    # DataFrames are immutable: dropna() returns a new frame, so the result
    # has to be kept for the null rows to actually be removed.
    half_hourly_consumption_data = half_hourly_consumption_data.dropna(how='any')
    if df_full is None:
        df_full = half_hourly_consumption_data
    else:
        df_full = df_full.union(half_hourly_consumption_data)

if df_full is None:
    raise ValueError("No half-hourly block file is listed for the selected households")

# Cache only the final union instead of every intermediate frame.
df_full = df_full.cache()

+-------+
|   file|
+-------+
|block_0|
+-------+

block_0 0


In [None]:
# Keep only the readings taken during calendar year 2013.
reading_date = date_format(df_full.tstp,"yyyy-MM-dd").cast(DateType())
df_full = df_full.filter((reading_date >= date(2013,1,1)) & (reading_date <= date(2013,12,31)))

# Households with at least 8760 readings left after the filter.
# NOTE(review): 8760 is 24 * 365, i.e. one reading per hour; the source files
# are half-hourly, so a complete year would be 17520 rows — confirm that this
# threshold is the intended definition of "full evidence".
year_df = df_full.groupBy("LCLid").count().filter(col("count") >= 8760)
print("Total user in 2013 with full evidence ", year_df.select("LCLid").distinct().count())

In [8]:
# Attach a cluster id to every reading.
# The file is read with its header row so that the join key below resolves by
# name (without header=True the columns would be called _c0, _c1, ...).
# NOTE(review): cluster_info.csv is expected under base_path with at least the
# columns LCLid and cluster_id — confirm where this file is produced.
cluster_file = sqlcontext.read.csv(base_path+"cluster_info.csv",header=True,inferSchema=True)
LCLid_With_Cluster_id = df_full.join(broadcast(cluster_file),["LCLid"])

AnalysisException: 'Path does not exist: file:/home/darkmatter/Desktop/smart-meters-in-london/cluster_info.csv;'

In [9]:
# Collapse the half-hourly readings into one row per cluster, date and hour,
# then derive the calendar columns (month, weekday) used as model features.
df = LCLid_With_Cluster_id.select(
    "cluster_id",
    "tstp",
    "energy(kWh/hh)",
    date_format("tstp","yyyy-MM-dd").alias("date"),
    date_format("tstp",'HH:mm').alias("start time"),
    date_format("tstp",'E').alias("weekDay"),
    month("tstp").alias("month"),
    hour("tstp").alias("hour"),
)

# Hourly energy = sum of the half-hourly readings that fall in that hour.
df1 = (
    df.groupby('cluster_id',"date","hour")
    .sum("energy(kWh/hh)")
    .orderBy('date','hour',ascending=True)
    .withColumnRenamed("sum(energy(kWh/hh))","energy(kWh/h)")
)

resampled_df = df1.select(
    "cluster_id",
    "date",
    "hour",
    "energy(kWh/h)",
    month("date").alias("month"),
    date_format("date",'E').alias("weekDay"),
)
# Round the energy to 3 decimals and turn the textual date into a DateType.
resampled_df = (
    resampled_df
    .withColumn("energy(kWh/h)", sf.round(resampled_df["energy(kWh/h)"], 3))
    .withColumn("date", resampled_df["date"].cast(DateType()))
)

NameError: name 'LCLid_With_Cluster_id' is not defined

In [1]:
# Build the model features step by step: lag / previous-day statistics
# (6-day window, 4 weekly lags), weather, holiday / weekend flags and finally
# the household ACORN attributes.
feature_df = prepare_dataset(resampled_df,6,4)
add_weather_feature_df = add_weather_feature(feature_df)
add_holiday_feature_df = add_holiday_feature(add_weather_feature_df)
# acorn_info() requires the household table as its second argument.
# NOTE(review): acorn_info() joins on "LCLid", while resampled_df is
# aggregated per cluster_id and carries no LCLid column — confirm how
# households are meant to map onto clusters before relying on this step.
add_acorn_info_df = acorn_info(add_holiday_feature_df,household_info)

NameError: name 'prepare_dataset' is not defined

In [2]:
# Encode the categorical columns as numeric indices.
final_feature_df = add_acorn_info_df
inputCols = ["weekDay","precipType","summary","stdorToU","Acorn_grouped"]
outputCols = ["weekDay_index","precipType_index","summary_index","stdorToU_index","Acorn_grouped_index"]
# Fit every indexer on the frame that actually contains all five columns:
# the weather and ACORN columns only exist after the later feature steps,
# not on feature_df.
indexers = [
    StringIndexer(inputCol=in_col, outputCol=out_col).fit(final_feature_df)
    for in_col, out_col in zip(inputCols, outputCols)
]
# The stages are already fitted models, so Pipeline.fit() only chains them.
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(final_feature_df).transform(final_feature_df)

NameError: name 'add_acorn_info_df' is not defined