In [1]:
from pyspark.sql import SQLContext
from handyspark import *
from pyspark.sql import functions as sf
from matplotlib import pyplot as plt
from pyspark.sql.functions import col, avg, date_format,month,hour,lag, date_sub,lit
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
import pandas as pd
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, OneHotEncoder
from datetime import datetime, timedelta, date
from pyspark.ml import Pipeline
import pyspark

In [2]:
sc.stop()
sc = pyspark.SparkContext(master="spark://172.16.27.115:7077",appName="spark")
sc

In [5]:
base_path = "/media/iitp/disk/smart-meters-in-london/"
sqlcontext = SQLContext(sc)
household_info = sqlcontext.read.csv(base_path+"informations_households.csv",header=True,inferSchema=True)
household_mini = sc.parallelize(household_info.take(2500)).toDF()
# household_mini = household_info

In [6]:
def prepare_dataset(df):
    # Resampling to 1 hour and extracting variable like month,hour,weekday etc.
    df = df.select("LCLID","tstp","energy(kWh/hh)",date_format("tstp","yyyy-MM-dd").alias("date"),date_format("tstp",'HH:mm').alias("start time"),date_format("tstp",'E').alias("weekDay"),month("tstp").alias("month"),hour("tstp").alias("hour"))
    df1 = (df.groupby('LCLID',"date","hour").sum("energy(kWh/hh)")).orderBy('date','hour',ascending=True)
    df1 = df1.withColumnRenamed("sum(energy(kWh/hh))","energy(kWh/h)")
    resampled_df = df1.select("LCLid","date","hour","energy(kWh/h)",month("date").alias("month"),date_format("date",'E').alias("weekDay"))
    resampled_df = resampled_df.withColumn("energy(kWh/h)", sf.round(resampled_df["energy(kWh/h)"], 3))
    resampled_df = resampled_df.withColumn("date", resampled_df["date"].cast(DateType()))
#     resampled_df.printSchema()
#     resampled_df.show()
    
#     # window period = 2 for lag input for same hour,same-1,same-2 
    window = Window.partitionBy('LCLid').orderBy('date','hour')
    for lag_hour in range(0,3):
        for diff in range(1,3):
            resampled_df = resampled_df.withColumn('{}_diff_energy_t_{}'.format(diff,lag_hour),lag(resampled_df['energy(kWh/h)'], count=24*diff+lag_hour).over(window)) 
#     df_resample_lag.show()
    
    # 4 weeks previous lag value of same time
    for lag_week in range(1,5):
        resampled_df = resampled_df.withColumn('diff_energy_week_t_{}'.format(lag_week),lag(resampled_df['energy(kWh/h)'], count=24*7*lag_week).over(window)) 

    df_resample_lag = resampled_df
#     df_resample_lag.printSchema()    
    # Mean of previous 2 days
    df_resample_lag = df_resample_lag.withColumn("rnk",sf.dense_rank().over(Window.partitionBy('LCLid').orderBy('date')))
    for days in range(1,3):
        df_resample_lag = df_resample_lag.withColumn("mean_{}".format(days),avg("energy(kWh/h)").over(Window.partitionBy("LCLid").orderBy("rnk").rangeBetween(-days,-days)))
#     df_resample_lag.show()
    # Min power of previous 2 days
    for days in range(1,3):
        df_resample_lag = df_resample_lag.withColumn("min_{}".format(days),sf.min("energy(kWh/h)").over(Window.partitionBy("LCLid").orderBy("rnk").rangeBetween(-days,-days)))
        df_resample_lag = df_resample_lag.withColumn("max_{}".format(days),sf.max("energy(kWh/h)").over(Window.partitionBy("LCLid").orderBy("rnk").rangeBetween(-days,-days)))
    
    return df_resample_lag

## dataset_df holds lag variable and other metric variable, weather variable need to be added

In [7]:
flag = 0
avg_house_data = []
df_file = household_mini.select("file").distinct()
dataset_df = []
for row in df_file.rdd.collect():
    file = row.file
    print(file)
    file_path = base_path + "halfhourly_dataset/"+ file+".csv"
    half_hourly_consumption_data = sqlcontext.read.csv(file_path,header=True,inferSchema=True)
    half_hourly_consumption_data.dropna(how='any')
    half_hourly_consumption_data = half_hourly_consumption_data.withColumn("energy(kWh/hh)",
                                                                           half_hourly_consumption_data["energy(kWh/hh)"].cast("float"))
#     half_hourly_consumption_data.printSchema()
    df = prepare_dataset(half_hourly_consumption_data)
    df = df.drop("rnk")
    if flag == 0:
        dataset_df = sqlcontext.createDataFrame([],df.schema)
        flag = 1
    dataset_df = dataset_df.union(df)
dataset_df.show(2)

block_39
block_43
block_19
block_35
block_26
block_20
block_12
block_21
block_36
block_49
block_18
block_44
block_8
block_4
block_29
block_38
block_47
block_48
block_42
block_27
block_1
block_3
block_46
block_32
block_45
block_40
block_15
block_5
block_28
block_24
block_33
block_23
block_31
block_0
block_14
block_41
block_22
block_30
block_37
block_10
block_9
block_11
block_6
block_2
block_7
block_25
block_16
block_13
block_34
block_17
+---------+----------+----+-------------+-----+-------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+--------------------+--------------------+--------------------+--------------------+------+------+-----+-----+-----+-----+
|    LCLid|      date|hour|energy(kWh/h)|month|weekDay|1_diff_energy_t_0|2_diff_energy_t_0|1_diff_energy_t_1|2_diff_energy_t_1|1_diff_energy_t_2|2_diff_energy_t_2|diff_energy_week_t_1|diff_energy_week_t_2|diff_energy_week_t_3|diff_energy_week_t_4|mean_1|mean_2|min_1|max_1|m

## filtering data for year 2013

In [8]:
dataset_df = dataset_df.filter((dataset_df.date >= date(2013,1,1)) & (dataset_df.date <= date(2013,12,31)))
#dataset_df.show(1)

## For adding weather data

In [9]:
weather_data = sqlcontext.read.csv(base_path+"weather_hourly_darksky.csv",header=True,inferSchema=True)
weather_daily_data = sqlcontext.read.csv(base_path+"weather_daily_darksky.csv",header=True,inferSchema=True)
weather_daily_data = weather_daily_data.select(date_format("temperatureMaxTime","yyyy-MM-dd").alias("date2"),"temperatureMax","temperatureMin")
weather_daily_data = weather_daily_data.withColumn("date2",weather_daily_data["date2"].cast(DateType()))
weather_data = weather_data.withColumn("hour1",hour(weather_data["time"]))
weather_data = weather_data.withColumn("date1",date_format(weather_data["time"],"yyyy-MM-dd").cast(DateType()))
weather_data = weather_data.drop("time","icon","temperature")
weather_data = weather_data.join(weather_daily_data,(weather_daily_data.date2 == weather_data.date1))
weather_data.printSchema()
# df_full_dataset = dataset_df.join(weather_data,(dataset_df.date == weather_data.date1) & (dataset_df.hour == weather_data.hour1))
# df_full_dataset = df_full_dataset.drop("hour1","date1")
# df_full_dataset.show(1)

root
 |-- visibility: double (nullable = true)
 |-- windBearing: integer (nullable = true)
 |-- dewPoint: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- windSpeed: double (nullable = true)
 |-- precipType: string (nullable = true)
 |-- humidity: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- hour1: integer (nullable = true)
 |-- date1: date (nullable = true)
 |-- date2: date (nullable = true)
 |-- temperatureMax: double (nullable = true)
 |-- temperatureMin: double (nullable = true)



## adding Holiday column treating weekend as holiday too

In [10]:
holiday_data = sqlcontext.read.csv(base_path+"uk_bank_holidays.csv",header=True,inferSchema=True)
holiday_data = holiday_data.withColumn("Bank holidays",date_format(holiday_data["Bank holidays"],"yyyy-MM-dd").cast(DateType()))
holiday_data = holiday_data.select("Bank holidays")
holiday_data = holiday_data.withColumn("holiday",lit(1))
# feature_df = df_full_dataset.join(holiday_data,holiday_data["Bank holidays"] == df_full_dataset["date"],how="left_outer")
# feature_df = feature_df.fillna({'holiday':'0'})
# feature_df = feature_df.drop("Bank holidays")
# feature_df.select("holiday").show(2)
# feature_df.printSchema()

## joining with main df

In [11]:
weather_holiday_df = holiday_data.join(weather_data,holiday_data["Bank holidays"] == weather_data["date1"])
weather_holiday_df =  weather_holiday_df.drop("Bank holidays")
feature_df = dataset_df.join(weather_holiday_df,(dataset_df.date == weather_data.date1) & (dataset_df.hour == weather_data.hour1))
feature_df = feature_df.fillna({'holiday':'0'})
feature_df = feature_df.drop("hour1","date1","date2")
feature_df.printSchema()

root
 |-- LCLid: string (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- energy(kWh/h): double (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekDay: string (nullable = true)
 |-- 1_diff_energy_t_0: double (nullable = true)
 |-- 2_diff_energy_t_0: double (nullable = true)
 |-- 1_diff_energy_t_1: double (nullable = true)
 |-- 2_diff_energy_t_1: double (nullable = true)
 |-- 1_diff_energy_t_2: double (nullable = true)
 |-- 2_diff_energy_t_2: double (nullable = true)
 |-- diff_energy_week_t_1: double (nullable = true)
 |-- diff_energy_week_t_2: double (nullable = true)
 |-- diff_energy_week_t_3: double (nullable = true)
 |-- diff_energy_week_t_4: double (nullable = true)
 |-- mean_1: double (nullable = true)
 |-- mean_2: double (nullable = true)
 |-- min_1: double (nullable = true)
 |-- max_1: double (nullable = true)
 |-- min_2: double (nullable = true)
 |-- max_2: double (nullable = true)
 |-- holiday: integer (nullable = false

## adding week days or not

In [12]:
feature_df = feature_df.withColumn("Weekday/end",sf.when((col("weekDay")==str("Sat")) | (col("weekDay")==str("Sat")),1).otherwise(0))
feature_df.printSchema()

root
 |-- LCLid: string (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- energy(kWh/h): double (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekDay: string (nullable = true)
 |-- 1_diff_energy_t_0: double (nullable = true)
 |-- 2_diff_energy_t_0: double (nullable = true)
 |-- 1_diff_energy_t_1: double (nullable = true)
 |-- 2_diff_energy_t_1: double (nullable = true)
 |-- 1_diff_energy_t_2: double (nullable = true)
 |-- 2_diff_energy_t_2: double (nullable = true)
 |-- diff_energy_week_t_1: double (nullable = true)
 |-- diff_energy_week_t_2: double (nullable = true)
 |-- diff_energy_week_t_3: double (nullable = true)
 |-- diff_energy_week_t_4: double (nullable = true)
 |-- mean_1: double (nullable = true)
 |-- mean_2: double (nullable = true)
 |-- min_1: double (nullable = true)
 |-- max_1: double (nullable = true)
 |-- min_2: double (nullable = true)
 |-- max_2: double (nullable = true)
 |-- holiday: integer (nullable = false

## adding Acorn grouped and cleaning mising and error data as [Row(Acorn_grouped='Adversity'),
 Row(Acorn_grouped='ACORN-'),
 Row(Acorn_grouped='Affluent'),
 Row(Acorn_grouped='ACORN-U'),
 Row(Acorn_grouped='Comfortable')]

In [13]:
Acorn_data_group = household_info.select("LCLid","stdorToU","Acorn_grouped")
Acorn_data_group.select("Acorn_grouped").distinct().collect()
possible_group = ["Comfortable","Affluent","Adversity"]
Acorn_data_group = Acorn_data_group.filter(Acorn_data_group.Acorn_grouped.isin(possible_group))
feature_df = feature_df.join(Acorn_data_group,["LCLid"])                # preventing duplicate column in df
feature_df.printSchema()
Acorn_data_group.select("stdorToU").distinct().collect()

root
 |-- LCLid: string (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- energy(kWh/h): double (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekDay: string (nullable = true)
 |-- 1_diff_energy_t_0: double (nullable = true)
 |-- 2_diff_energy_t_0: double (nullable = true)
 |-- 1_diff_energy_t_1: double (nullable = true)
 |-- 2_diff_energy_t_1: double (nullable = true)
 |-- 1_diff_energy_t_2: double (nullable = true)
 |-- 2_diff_energy_t_2: double (nullable = true)
 |-- diff_energy_week_t_1: double (nullable = true)
 |-- diff_energy_week_t_2: double (nullable = true)
 |-- diff_energy_week_t_3: double (nullable = true)
 |-- diff_energy_week_t_4: double (nullable = true)
 |-- mean_1: double (nullable = true)
 |-- mean_2: double (nullable = true)
 |-- min_1: double (nullable = true)
 |-- max_1: double (nullable = true)
 |-- min_2: double (nullable = true)
 |-- max_2: double (nullable = true)
 |-- holiday: integer (nullable = false

[Row(stdorToU='Std'), Row(stdorToU='ToU')]

In [14]:
pd.options.display.max_columns = None
feature_df.limit(10).toPandas()

Unnamed: 0,LCLid,date,hour,energy(kWh/h),month,weekDay,1_diff_energy_t_0,2_diff_energy_t_0,1_diff_energy_t_1,2_diff_energy_t_1,1_diff_energy_t_2,2_diff_energy_t_2,diff_energy_week_t_1,diff_energy_week_t_2,diff_energy_week_t_3,diff_energy_week_t_4,mean_1,mean_2,min_1,max_1,min_2,max_2,holiday,visibility,windBearing,dewPoint,pressure,apparentTemperature,windSpeed,precipType,humidity,summary,temperatureMax,temperatureMin,Weekday/end,stdorToU,Acorn_grouped
0,MAC005306,2013-06-05,17,0.186,6,Wed,0.161,0.201,0.276,0.361,0.224,0.101,1.852,0.325,0.272,0.421,0.197417,0.226625,0.103,0.803,0.097,1.361,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,ToU,Affluent
1,MAC005204,2013-06-05,17,0.118,6,Wed,0.066,0.077,0.064,0.089,0.086,0.118,0.343,0.519,0.129,0.081,0.098417,0.131167,0.049,0.326,0.055,0.578,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,ToU,Affluent
2,MAC001725,2013-06-05,17,0.431,6,Wed,0.505,0.397,0.371,0.381,0.343,0.771,0.223,0.188,0.498,0.255,0.669417,0.425375,0.343,2.591,0.153,0.771,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
3,MAC001895,2013-06-05,17,0.594,6,Wed,0.452,0.716,0.329,1.645,0.223,0.99,1.085,1.032,1.584,0.512,0.711708,1.006292,0.222,4.916,0.323,4.874,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
4,MAC001699,2013-06-05,17,0.233,6,Wed,0.316,0.248,0.321,0.227,0.316,0.227,0.244,0.239,0.243,0.186,0.25975,0.294167,0.221,0.321,0.227,0.721,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
5,MAC000443,2013-06-05,17,2.708,6,Wed,1.356,1.117,1.179,1.253,1.219,1.356,2.079,2.58,3.018,2.043,1.535458,1.4885,0.213,3.681,0.308,3.195,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
6,MAC001651,2013-06-05,17,0.463,6,Wed,0.276,0.12,1.674,0.121,0.44,0.121,0.609,0.233,0.633,0.288,0.429708,0.218375,0.182,1.674,0.116,0.381,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,ToU,Affluent
7,MAC001720,2013-06-05,17,0.682,6,Wed,0.357,0.422,0.338,0.551,0.518,0.179,0.609,0.457,0.902,0.456,0.287833,0.256417,0.109,0.68,0.113,0.551,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
8,MAC005291,2013-06-05,17,0.3,6,Wed,0.183,0.076,0.631,0.089,0.87,0.072,0.156,0.085,0.111,0.051,0.251458,0.124292,0.061,0.87,0.062,0.336,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,ToU,Affluent
9,MAC005284,2013-06-05,17,0.226,6,Wed,0.079,0.113,0.155,0.115,0.115,0.127,0.071,0.155,0.159,0.147,0.113708,0.124083,0.042,0.312,0.042,0.293,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,ToU,Affluent


## Taking year 2013 in consideration   

All points : 411720
na points dropped : 411128  (without Acorn group join)


In [15]:
print("All points : {}".format(feature_df.count()))
feature_df = feature_df.na.drop()
print("na points dropped : {}".format(feature_df.count()))

All points : 456722
na points dropped : 454185


### TODO : Instead of processing each LCLID process each file using window.partitionby("LCLID")

In [16]:
# block_read = set([])
# for row in household_mini.rdd.collect():
#     house_id = row.LCLid
#     file = row.file
#     print(house_id,file)
#     file_path = base_path + "halfhourly_dataset/"+ file+".csv"
#     if file not in block_read:
#         block_read.add(file)
#         half_hourly_consumption_data = sqlContext.read.csv(file_path,header=True,inferSchema=True)
#         half_hourly_consumption_data.dropna(how='any')
#     indiv_house_data = half_hourly_consumption_data.where(col("LCLid") == house_id)
#     indiv_house_data = indiv_house_data.withColumnRenamed("energy(kWh/hh)","energy")
#     indiv_house_data.show()
#     indiv_house_data = indiv_house_data.withColumn("energy(kWh/hh)", indiv_house_data["energy"].cast("float"))
#     indiv_house_data = indiv_house_data.drop("energy")
#     indiv_house_data.printSchema()
#     if indiv_house_data.rdd.isEmpty():
#         print("Missing Id = {} in file = {}".format(house_id,file))
#         continue
#     df = prepare_dataset(indiv_house_data)
#     df.printSchema()
#     df = df.dropna(how="any")

In [35]:
pd.options.display.max_columns = None
feature_df.limit(10).toPandas()

Unnamed: 0,LCLid,date,hour,energy(kWh/h),month,weekDay,1_diff_energy_t_0,2_diff_energy_t_0,1_diff_energy_t_1,2_diff_energy_t_1,1_diff_energy_t_2,2_diff_energy_t_2,diff_energy_week_t_1,diff_energy_week_t_2,diff_energy_week_t_3,diff_energy_week_t_4,mean_1,mean_2,min_1,max_1,min_2,max_2,holiday,visibility,windBearing,dewPoint,pressure,apparentTemperature,windSpeed,precipType,humidity,summary,temperatureMax,temperatureMin,Weekday/end,stdorToU,Acorn_grouped
0,MAC003668,2013-06-05,17,1.438,6,Wed,2.002,0.507,2.882,0.502,1.346,0.481,1.135,2.591,0.477,0.57,1.534833,1.410958,0.421,3.748,0.406,4.064,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
1,MAC003252,2013-06-05,17,0.339,6,Wed,0.893,1.164,0.686,0.638,1.213,0.216,0.787,0.732,0.304,0.678,0.507958,0.540708,0.159,1.295,0.128,2.778,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
2,MAC003775,2013-06-05,17,0.526,6,Wed,0.664,0.405,2.198,1.867,1.187,1.104,0.384,0.258,1.278,0.486,0.934542,0.997458,0.192,2.198,0.223,4.209,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
3,MAC003400,2013-06-05,17,0.32,6,Wed,0.407,0.189,0.407,0.193,0.374,0.279,0.504,0.454,0.464,0.278,0.41275,0.406375,0.106,1.223,0.107,1.053,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
4,MAC003613,2013-06-05,17,0.687,6,Wed,0.838,0.89,0.917,1.185,0.881,1.126,2.842,0.772,0.718,0.859,0.81325,1.111958,0.349,2.306,0.377,2.434,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
5,MAC003718,2013-06-05,17,0.413,6,Wed,0.188,0.194,0.179,0.158,0.233,0.325,0.628,0.19,0.271,0.323,0.342583,0.4335,0.172,0.877,0.158,1.471,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
6,MAC003597,2013-06-05,17,1.786,6,Wed,2.436,1.205,2.073,1.373,0.912,0.821,1.34,0.693,0.822,1.193,0.979,0.808458,0.466,2.436,0.468,1.722,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
7,MAC003463,2013-06-05,17,0.361,6,Wed,0.729,0.71,0.586,0.937,0.493,0.598,1.804,1.378,0.393,1.712,0.504542,0.389792,0.201,1.368,0.171,0.937,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
8,MAC003656,2013-06-05,17,0.731,6,Wed,0.887,0.701,0.551,0.563,0.585,1.296,0.33,0.382,0.39,0.342,0.645,0.779042,0.304,2.877,0.308,1.924,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent
9,MAC003557,2013-06-05,17,0.374,6,Wed,0.348,0.275,0.216,0.194,0.15,0.144,0.099,0.397,0.315,0.219,0.313917,0.254708,0.105,1.232,0.102,1.194,1,13.9,75,10.51,1019.55,18.98,4.18,rain,0.58,Partly Cloudy,19.64,8.96,0,Std,Affluent


In [36]:
feature_df.select("stdorToU").distinct().collect()
# feature_df1.where(col("LCLid") == )

[Row(stdorToU='Std')]

In [17]:
inputCols = ["weekDay","precipType","summary","stdorToU","Acorn_grouped"]
outputCols = ["weekDay_index","precipType_index","summary_index","stdorToU_index","Acorn_grouped_index"]
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(feature_df) for column in inputCols ]
# # stringIndexer = StringIndexer(inputCol=inputCols, outputCol=outputCols)
# encoder = OneHotEncoderEstimator(inputCols=outputCols, outputCols=outputCols)
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(feature_df).transform(feature_df)

# encoder = OneHotEncoder(inputCols=outputCols, outputCols=outputCols)
# pipeline = Pipeline(stages=encoder)
# pipeline.fit(df).transform(df).show()
# # model = stringIndexer.fit(df)
# # indexed = model.transform(df)
# outputCols.remove("Acorn_grouped_index")
# for col in outputCols: 
#     encoder = OneHotEncoder(inputCol=col, outputCol="category_{}".format(col))
#     df_encoded = encoder.transform(df).cache()
# df_encoded.show()
# # columns = df_encoded.columns
# # inputcols = columns[4:]
# # inputcols.append(columns[2])
# # inputcols.remove("Index_Week")
# # inputcols.remove("weekDay")

# # vecAssembler = VectorAssembler(inputCols=inputcols, outputCol="features")
# # df_feature = vecAssembler.transform(df_encoded)
# # df_feature.show()

In [18]:
#outputCols.remove("Acorn_grouped_index")
for col in outputCols: 
    encoder = OneHotEncoder(inputCol=col, outputCol="category_{}".format(col))
    df_encoded = encoder.transform(df).cache()
df_encoded.show()

+---------+----------+----+-------------+-----+-------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-----+-----+-----+-----+-------+----------+-----------+--------+--------+-------------------+---------+----------+--------+-------------+--------------+--------------+-----------+--------+-------------+-------------+----------------+-------------+--------------+-------------------+----------------------------+
|    LCLid|      date|hour|energy(kWh/h)|month|weekDay|1_diff_energy_t_0|2_diff_energy_t_0|1_diff_energy_t_1|2_diff_energy_t_1|1_diff_energy_t_2|2_diff_energy_t_2|diff_energy_week_t_1|diff_energy_week_t_2|diff_energy_week_t_3|diff_energy_week_t_4|             mean_1|             mean_2|min_1|max_1|min_2|max_2|holiday|visibility|windBearing|dewPoint|pressure|apparentTemperature|windSpeed|precipType|

## For declaring feature column

In [19]:
columns = df_encoded.columns
inputcols1 = columns[4:]
inputcols1.append(columns[2])
inputcols1 = set(inputcols1) - set(inputCols)
inputcols1 = inputcols1 - set(outputCols)
inputcols1 = list(inputcols1)
inputcols1.remove("date2")
vecAssembler = VectorAssembler(inputCols=inputcols1, outputCol="features")
df_feature = vecAssembler.transform(df_encoded)
df_feature.show()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/iitp/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-320f361b8721>", line 7, in <module>
    inputcols1.remove("date2")
ValueError: list.remove(x): x not in list

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/iitp/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2018, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'ValueError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/iitp/.local/lib/python3.5/site-packages/IPython/core/ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/iitp/.local/lib/python3.5/sit

ValueError: list.remove(x): x not in list

In [73]:
# row1 = df_feature.agg({"date": "max"}).collect()[0]
print(type(row1[0]))
df_feature.count()
from datetime import datetime, timedelta

<class 'datetime.date'>


## Train - Test Split

In [58]:
df_feature = df_feature.withColumnRenamed("energy(kWh/h)","label")
# ****************** for 80-20 for each user *******************
# train_df = df_feature.where(col("date") <= ((df_feature.agg({"date": "max"}).collect()[0])[0]-timedelta(df.count()//(24*60))))
# test_df = df_feature.where(col("date") > ((df_feature.agg({"date": "max"}).collect()[0])[0]-timedelta(df.count()//(24*60))))
# ***************8 month train 4 month test
train_df = df_feature.where(df_feature["date"] <= date(2013,8,31))
test_df = df_feature.where(df_feature["date"] > date(2013,8,31))

In [75]:
test_df.show()

+---------+----------+----+-----+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+--------------------+
|    LCLid|      date|hour|label|month|weekDay|1_diff_energy_t|2_diff_energy_t|1_diff_energy_t_1|2_diff_energy_t_1|1_diff_energy_t_2|2_diff_energy_t_2|rnk|mean_1|mean_2|min_1|max_1|min_2|max_2|Index_Week| categoryWeek|            features|
+---------+----------+----+-----+-----+-------+---------------+---------------+-----------------+-----------------+-----------------+-----------------+---+------+------+-----+-----+-----+-----+----------+-------------+--------------------+
|MAC005492|2014-02-20|   0|0.357|    2|    Thu|          0.226|           0.35|            0.361|            0.246|            0.385|            0.377|546|   0.0|   0.0|0.169|0.617| 0.17|1.106|       3.0|(6,[3],[1.0])|[2.0,0.226,0.35,0...|
|MAC005492|2014-02-20|   1|0.383|    2| 

In [59]:
rf = RandomForestRegressor(maxDepth=10, numTrees=10, maxBins=128)
rfmodel = rf.fit(train_df)
pred_val = rfmodel.transform(test_df)
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName="rmse")
accuracy = evaluator.evaluate(pred_val)
# pred_val.show()
print('RMSE = : %.4f'%(accuracy))

Py4JJavaError: An error occurred while calling o3673.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 1403.0 failed 1 times, most recent failure: Lost task 5.0 in stage 1403.0 (TID 125648, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:541)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:537)
	at scala.Array$.tabulate(Array.scala:331)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:537)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:534)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:563)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:130)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:541)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:537)
	at scala.Array$.tabulate(Array.scala:331)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:537)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:534)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50202)


Traceback (most recent call last):
  File "/usr/lib/python3.5/socketserver.py", line 313, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 341, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 354, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.5/socketserver.py", line 681, in __init__
    self.handle()
  File "/usr/lib/Spark/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/usr/lib/Spark/python/pyspark/serializers.py", line 685, in read_int
    raise EOFError
EOFError


----------------------------------------
