In [11]:
from pyspark.sql import SQLContext
from pyspark.sql import functions as sf
from matplotlib import pyplot as plt
from pyspark.sql.functions import col, avg, date_format,month,hour,lag, date_sub,lit
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
from pyspark.sql.functions import broadcast
import pandas as pd
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, OneHotEncoder
from datetime import datetime, timedelta, date
from pyspark.ml import Pipeline
import pyspark

In [12]:
sc.stop()
sc = pyspark.SparkContext(master="spark://172.16.27.208:7077",appName="spark")
sc

In [13]:
base_path = "/home/test5/Desktop/smart-meters-in-london/"
sqlcontext = SQLContext(sc)

In [14]:
household_info = sqlcontext.read.csv(base_path+"informations_households.csv",header=True,inferSchema=True)
#household_mini = sc.parallelize(household_info.take(1)).toDF()
household_mini = household_info

In [15]:
def prepare_dataset(df):
    # Resampling to 1 hour and extracting variable like month,hour,weekday etc.
    df = df.select("LCLID","tstp","energy(kWh/hh)",date_format("tstp","yyyy-MM-dd").alias("date"),date_format("tstp",'HH:mm').alias("start time"),date_format("tstp",'E').alias("weekDay"),month("tstp").alias("month"),hour("tstp").alias("hour"))
    df1 = (df.groupby('LCLID',"date","hour").sum("energy(kWh/hh)")).orderBy('date','hour',ascending=True)
    df1 = df1.withColumnRenamed("sum(energy(kWh/hh))","energy(kWh/h)")
    resampled_df = df1.select("LCLid","date","hour","energy(kWh/h)",month("date").alias("month"),date_format("date",'E').alias("weekDay"))
    resampled_df = resampled_df.withColumn("energy(kWh/h)", sf.round(resampled_df["energy(kWh/h)"], 3))
    resampled_df = resampled_df.withColumn("date", resampled_df["date"].cast(DateType()))
#     resampled_df.printSchema()
#     resampled_df.show()
    
#     # window period = 2 for lag input for same hour,same-1,same-2 
    window = Window.partitionBy('LCLid').orderBy('date','hour')
    for lag_hour in range(0,3):
        for diff in range(1,3):
            resampled_df = resampled_df.withColumn('{}_diff_energy_t_{}'.format(diff,lag_hour),lag(resampled_df['energy(kWh/h)'], count=24*diff+lag_hour).over(window)) 
#     df_resample_lag.show()
    
    # 4 weeks previous lag value of same time
    for lag_week in range(1,5):
        resampled_df = resampled_df.withColumn('diff_energy_week_t_{}'.format(lag_week),lag(resampled_df['energy(kWh/h)'], count=24*7*lag_week).over(window)) 

    df_resample_lag = resampled_df
#     df_resample_lag.printSchema()    
    # Mean of previous 2 days
    df_resample_lag = df_resample_lag.withColumn("rnk",sf.dense_rank().over(Window.partitionBy('LCLid').orderBy('date')))
    for days in range(1,3):
        df_resample_lag = df_resample_lag.withColumn("mean_{}".format(days),avg("energy(kWh/h)").over(Window.partitionBy("LCLid").orderBy("rnk").rangeBetween(-days,-days)))
#     df_resample_lag.show()
    # Min power of previous 2 days
    for days in range(1,3):
        df_resample_lag = df_resample_lag.withColumn("min_{}".format(days),sf.min("energy(kWh/h)").over(Window.partitionBy("LCLid").orderBy("rnk").rangeBetween(-days,-days)))
        df_resample_lag = df_resample_lag.withColumn("max_{}".format(days),sf.max("energy(kWh/h)").over(Window.partitionBy("LCLid").orderBy("rnk").rangeBetween(-days,-days)))
    
    return df_resample_lag

## dataset_df holds lag variable and other metric variable, weather variable need to be added

In [16]:
household_mini.count()

5566

In [17]:
flag = 0
avg_house_data = []
df_file = household_mini.select("file").distinct()
dataset_df = []
for row in df_file.rdd.collect():
    file = row.file
    print(file)
    file_path = base_path + "halfhourly_dataset/"+ file+".csv"
    half_hourly_consumption_data = sqlcontext.read.csv(file_path,header=True,inferSchema=True)
    half_hourly_consumption_data.dropna(how='any')
    half_hourly_consumption_data = half_hourly_consumption_data.withColumn("energy(kWh/hh)",
                                                                           half_hourly_consumption_data["energy(kWh/hh)"].cast("float"))
#     half_hourly_consumption_data.printSchema()
    df = prepare_dataset(half_hourly_consumption_data)
    df = df.drop("rnk")
    if flag == 0:
        dataset_df = sqlcontext.createDataFrame([],df.schema)
        flag = 1
    dataset_df = dataset_df.union(df)
#dataset_df.take(2)

block_64
block_91
block_39
block_43
block_77
block_19
block_35
block_53
block_26
block_20
block_52
block_12
block_21
block_36
block_89
block_84
block_49
block_93
block_99
block_18
block_44
block_8
block_71
block_104
block_4
block_29
block_38
block_47
block_48
block_42
block_85
block_27
block_108
block_76
block_1
block_3
block_56
block_94
block_72
block_75
block_78
block_62
block_101
block_46
block_110
block_32
block_51
block_45
block_59
block_40
block_15
block_95
block_96
block_5
block_68
block_28
block_97
block_82
block_69
block_70
block_61
block_24
block_33
block_23
block_31
block_67
block_0
block_14
block_41
block_100
block_22
block_109
block_83
block_30
block_88
block_98
block_106
block_37
block_10
block_80
block_103
block_55
block_73
block_111
block_86
block_63
block_105
block_9
block_92
block_11
block_54
block_107
block_6
block_2
block_66
block_7
block_25
block_74
block_87
block_60
block_57
block_79
block_16
block_102
block_13
block_34
block_58
block_50
block_17
block_81
block_90

In [9]:
dataset_df.count()

83919248

## filtering data as per requirement
Total user in 2013  5528

Total user in 2013 with full evidence  3961


In [19]:
dataset_df = dataset_df.filter((dataset_df.date >= date(2013,1,1)) & (dataset_df.date <= date(2013,12,31)))
dataset_df = dataset_df.na.drop()
# print(dataset_df.count())
print ("Total user in 2013 ", dataset_df.select("LCLid").distinct().count())
year_df = dataset_df.groupBy("LCLid").count()
year_df = year_df.filter(year_df["count"] >= 8760 )
print("Total user in 2013 with full evidence ", year_df.select("LCLid").distinct().count())

KeyboardInterrupt: 

In [8]:
dataset_df = dataset_df.join(broadcast(year_df),["LCLid"])
dataset_df.take(1)

[Row(LCLid='MAC001858', date=datetime.date(2013, 1, 1), hour=0, energy(kWh/h)=0.968, month=1, weekDay='Tue', 1_diff_energy_t_0=0.761, 2_diff_energy_t_0=0.825, 1_diff_energy_t_1=0.57, 2_diff_energy_t_1=1.241, 1_diff_energy_t_2=0.606, 2_diff_energy_t_2=1.489, diff_energy_week_t_1=0.436, diff_energy_week_t_2=2.26, diff_energy_week_t_3=0.85, diff_energy_week_t_4=0.464, mean_1=0.6940833333333334, mean_2=0.6582500000000001, min_1=0.328, max_1=1.569, min_2=0.3, max_2=1.609, count=8760)]

###  Save data for year 2013  Total 3961 user

datapoint = 3961*8760 = 34698360

In [10]:
#print(dataset_df.count())
dataset_df.toPandas().to_csv(base_path+"cleaned_2013.csv",header=True)
#dataset_df.coalesce(1).write.csv(base_path+"Cleaned_201312",header = True)

Py4JJavaError: An error occurred while calling o23333.collectToPython.
: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:282)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.next(SparkPlan.scala:278)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.foreach(SparkPlan.scala:278)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeCollect$1.apply(SparkPlan.scala:300)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeCollect$1.apply(SparkPlan.scala:299)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3255)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3255)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


## For adding weather data

In [9]:
weather_data = sqlcontext.read.csv(base_path+"weather_hourly_darksky.csv",header=True,inferSchema=True)
weather_daily_data = sqlcontext.read.csv(base_path+"weather_daily_darksky.csv",header=True,inferSchema=True)
weather_daily_data = weather_daily_data.select(date_format("temperatureMaxTime","yyyy-MM-dd").alias("date2"),"temperatureMax","temperatureMin")
weather_data = weather_data.withColumn("hour1",hour(weather_data["time"]))
weather_data = weather_data.withColumn("date1",date_format(weather_data["time"],"yyyy-MM-dd").cast(DateType()))
weather_data = weather_data.drop("time","icon","temperature")
weather_data = weather_data.join(broadcast(weather_daily_data),(weather_daily_data.date2 == weather_data.date1))
weather_data.printSchema()
df_full_dataset = dataset_df.join(broadcast(weather_data),(dataset_df.date == weather_data.date1) & (dataset_df.hour == weather_data.hour1))
df_full_dataset = df_full_dataset.drop("hour1","date1").cache()
df_full_dataset.take(1)

root
 |-- visibility: double (nullable = true)
 |-- windBearing: integer (nullable = true)
 |-- dewPoint: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- windSpeed: double (nullable = true)
 |-- precipType: string (nullable = true)
 |-- humidity: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- hour1: integer (nullable = true)
 |-- date1: date (nullable = true)
 |-- date2: string (nullable = true)
 |-- temperatureMax: double (nullable = true)
 |-- temperatureMin: double (nullable = true)



[Row(LCLid='MAC001858', date=datetime.date(2013, 1, 1), hour=0, energy(kWh/h)=0.968, month=1, weekDay='Tue', 1_diff_energy_t_0=0.761, 2_diff_energy_t_0=0.825, 1_diff_energy_t_1=0.57, 2_diff_energy_t_1=1.241, 1_diff_energy_t_2=0.606, 2_diff_energy_t_2=1.489, diff_energy_week_t_1=0.436, diff_energy_week_t_2=2.26, diff_energy_week_t_3=0.85, diff_energy_week_t_4=0.464, mean_1=0.6940833333333334, mean_2=0.6582500000000001, min_1=0.328, max_1=1.569, min_2=0.3, max_2=1.609, count=8760, visibility=13.28, windBearing=269, dewPoint=2.6, pressure=1008.19, apparentTemperature=3.66, windSpeed=5.46, precipType='rain', humidity=0.73, summary='Partly Cloudy', date2='2013-01-01', temperatureMax=7.49, temperatureMin=3.31)]

In [None]:
df_full_dataset.count()

## adding Holiday column treating weekend as holiday too

In [10]:
holiday_data = sqlcontext.read.csv(base_path+"uk_bank_holidays.csv",header=True,inferSchema=True)
holiday_data = holiday_data.withColumn("Bank holidays",date_format(holiday_data["Bank holidays"],"yyyy-MM-dd").cast(DateType()))
holiday_data = holiday_data.select("Bank holidays")
holiday_data = holiday_data.withColumn("holiday",lit(1))
#weather_holiday_df = holiday_data.join(weather_data,holiday_data["Bank holidays"] == weather_data["date1"] )
#weather_holiday_df = weather_holiday_df.drop("date1")
feature_df = df_full_dataset.join(holiday_data,holiday_data["Bank holidays"] == df_full_dataset["date"],how="left")
feature_df = feature_df.fillna({'holiday':'0'})
feature_df = feature_df.drop("Bank holidays")
#feature_df.select("holiday").show(2)
#feature_df.printSchema()

#feature_df.count()

## adding week days or not

In [11]:
feature_df = feature_df.withColumn("Weekday/end",sf.when((col("weekDay")==str("Sat")) | (col("weekDay")==str("Sat")),1).otherwise(0))

## adding Acorn grouped and cleaning mising and error data as [Row(Acorn_grouped='Adversity'),
 Row(Acorn_grouped='ACORN-'),
 Row(Acorn_grouped='Affluent'),
 Row(Acorn_grouped='ACORN-U'),
 Row(Acorn_grouped='Comfortable')]
 
 * Total user now : 3930
 * Missing Time : 1
 * available : 24*365-1 = 8759

In [12]:
Acorn_data_group = household_info.select("LCLid","stdorToU","Acorn_grouped")
Acorn_data_group.select("Acorn_grouped").distinct().collect()
possible_group = ["Comfortable","Affluent","Adversity"]
Acorn_data_group = Acorn_data_group.filter(Acorn_data_group.Acorn_grouped.isin(possible_group))
feature_df = feature_df.join(Acorn_data_group,["LCLid"])                # preventing duplicate column in df
feature_df.printSchema()
#Acorn_data_group.select("stdorToU").distinct().collect()
feature_df.take(1)

root
 |-- LCLid: string (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- energy(kWh/h): double (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekDay: string (nullable = true)
 |-- 1_diff_energy_t_0: double (nullable = true)
 |-- 2_diff_energy_t_0: double (nullable = true)
 |-- 1_diff_energy_t_1: double (nullable = true)
 |-- 2_diff_energy_t_1: double (nullable = true)
 |-- 1_diff_energy_t_2: double (nullable = true)
 |-- 2_diff_energy_t_2: double (nullable = true)
 |-- diff_energy_week_t_1: double (nullable = true)
 |-- diff_energy_week_t_2: double (nullable = true)
 |-- diff_energy_week_t_3: double (nullable = true)
 |-- diff_energy_week_t_4: double (nullable = true)
 |-- mean_1: double (nullable = true)
 |-- mean_2: double (nullable = true)
 |-- min_1: double (nullable = true)
 |-- max_1: double (nullable = true)
 |-- min_2: double (nullable = true)
 |-- max_2: double (nullable = true)
 |-- count: long (nullable = false)
 |-

[Row(LCLid='MAC001858', date=datetime.date(2013, 1, 1), hour=0, energy(kWh/h)=0.968, month=1, weekDay='Tue', 1_diff_energy_t_0=0.761, 2_diff_energy_t_0=0.825, 1_diff_energy_t_1=0.57, 2_diff_energy_t_1=1.241, 1_diff_energy_t_2=0.606, 2_diff_energy_t_2=1.489, diff_energy_week_t_1=0.436, diff_energy_week_t_2=2.26, diff_energy_week_t_3=0.85, diff_energy_week_t_4=0.464, mean_1=0.6940833333333334, mean_2=0.6582500000000001, min_1=0.328, max_1=1.569, min_2=0.3, max_2=1.609, count=8760, visibility=13.28, windBearing=269, dewPoint=2.6, pressure=1008.19, apparentTemperature=3.66, windSpeed=5.46, precipType='rain', humidity=0.73, summary='Partly Cloudy', date2='2013-01-01', temperatureMax=7.49, temperatureMin=3.31, holiday=1, Weekday/end=0, stdorToU='ToU', Acorn_grouped='Comfortable')]

## Total data point now = 34422870

In [None]:
feature_df.count()
#pd.options.display.max_columns = None
#feature_df.limit(10).toPandas()

### TODO : Instead of processing each LCLID process each file using window.partitionby("LCLID")

In [None]:
# block_read = set([])
# for row in household_mini.rdd.collect():
#     house_id = row.LCLid
#     file = row.file
#     print(house_id,file)
#     file_path = base_path + "halfhourly_dataset/"+ file+".csv"
#     if file not in block_read:
#         block_read.add(file)
#         half_hourly_consumption_data = sqlContext.read.csv(file_path,header=True,inferSchema=True)
#         half_hourly_consumption_data.dropna(how='any')
#     indiv_house_data = half_hourly_consumption_data.where(col("LCLid") == house_id)
#     indiv_house_data = indiv_house_data.withColumnRenamed("energy(kWh/hh)","energy")
#     indiv_house_data.show()
#     indiv_house_data = indiv_house_data.withColumn("energy(kWh/hh)", indiv_house_data["energy"].cast("float"))
#     indiv_house_data = indiv_house_data.drop("energy")
#     indiv_house_data.printSchema()
#     if indiv_house_data.rdd.isEmpty():
#         print("Missing Id = {} in file = {}".format(house_id,file))
#         continue
#     df = prepare_dataset(indiv_house_data)
#     df.printSchema()
#     df = df.dropna(how="any")

In [None]:
pd.options.display.max_columns = None
feature_df.limit(10).toPandas()

In [13]:
feature_df.select("stdorToU").distinct().collect()
# feature_df1.where(col("LCLid") == )

[Row(stdorToU='Std'), Row(stdorToU='ToU')]

In [14]:
inputCols = ["weekDay","precipType","summary","stdorToU","Acorn_grouped"]
outputCols = ["weekDay_index","precipType_index","summary_index","stdorToU_index","Acorn_grouped_index"]
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(feature_df) for column in inputCols ]
# # stringIndexer = StringIndexer(inputCol=inputCols, outputCol=outputCols)
# encoder = OneHotEncoderEstimator(inputCols=outputCols, outputCols=outputCols)
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(feature_df).transform(feature_df)

# encoder = OneHotEncoder(inputCols=outputCols, outputCols=outputCols)
# pipeline = Pipeline(stages=encoder)
# pipeline.fit(df).transform(df).show()
# # model = stringIndexer.fit(df)
# # indexed = model.transform(df)
# outputCols.remove("Acorn_grouped_index")
# for col in outputCols: 
#     encoder = OneHotEncoder(inputCol=col, outputCol="category_{}".format(col))
#     df_encoded = encoder.transform(df).cache()
# df_encoded.show()
# # columns = df_encoded.columns
# # inputcols = columns[4:]
# # inputcols.append(columns[2])
# # inputcols.remove("Index_Week")
# # inputcols.remove("weekDay")

# # vecAssembler = VectorAssembler(inputCols=inputcols, outputCol="features")
# # df_feature = vecAssembler.transform(df_encoded)
# # df_feature.show()

#### Save Feature matrix with string indexing without one hot.

In [15]:
#day_month = [31,28,31,30,31,30,31,31,30,31,30,31]
for mth in range(1,13):
    df_save = df.where(df["month"]== mth)
    df_save.toPandas().to_csv(base_path+"Cleaned_2013_Features_mth_{}.csv".format(mth),header=True)
#df.toPandas().to_csv(base_path+"Cleaned_2013_Features_less_1hot.csv",header=True)
#df.write.format("csv").save(base_path+"Cleaned_2013_Features_less_1hot",header = True)

### Execute when you have to read feature matrix

In [26]:
df = sqlcontext.read.format("com.databricks.spark.csv").options(header = True,inferSchema=True).load(base_path+"Cleaned_2013_Features_less_1hot/*")

Py4JJavaError: An error occurred while calling o565.load.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 39.0 failed 4 times, most recent failure: Lost task 0.3 in stage 39.0 (TID 13256, 172.16.26.223, executor 2): java.io.FileNotFoundException: File file:/media/iitp/disk/smart-meters-in-london/Cleaned_2013_Features_less_1hot/part-01584-e8ecad67-aa5c-450f-ad79-d9eff909a2ca-c000.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:619)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3384)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
	at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.infer(CSVDataSource.scala:232)
	at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:68)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:63)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:180)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:180)
	at scala.Option.orElse(Option.scala:289)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:179)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:373)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: File file:/media/iitp/disk/smart-meters-in-london/Cleaned_2013_Features_less_1hot/part-01584-e8ecad67-aa5c-450f-ad79-d9eff909a2ca-c000.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:619)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:
df.count()

In [16]:
#outputCols.remove("Acorn_grouped_index")
output_encoding = []
for col in outputCols:
    output_encoding.append("catrgory_"+col)
encoder = OneHotEncoderEstimator(inputCols = outputCols,outputCols = output_encoding)
#pipeline = Pipeline(stages = encoder)
df_encoded = encoder.fit(df).transform(df).cache()
print(df_encoded.take(1))
df_encoded.printSchema()
#for col in outputCols: 
 #   encoder = OneHotEncoder(inputCol=col, outputCol="category_{}".format(col))
  #  df_encoded = encoder.transform(df).cache()
#df_encoded.show()

[Row(LCLid='MAC001858', date=datetime.date(2013, 1, 1), hour=0, energy(kWh/h)=0.968, month=1, weekDay='Tue', 1_diff_energy_t_0=0.761, 2_diff_energy_t_0=0.825, 1_diff_energy_t_1=0.57, 2_diff_energy_t_1=1.241, 1_diff_energy_t_2=0.606, 2_diff_energy_t_2=1.489, diff_energy_week_t_1=0.436, diff_energy_week_t_2=2.26, diff_energy_week_t_3=0.85, diff_energy_week_t_4=0.464, mean_1=0.6940833333333334, mean_2=0.6582500000000001, min_1=0.328, max_1=1.569, min_2=0.3, max_2=1.609, count=8760, visibility=13.28, windBearing=269, dewPoint=2.6, pressure=1008.19, apparentTemperature=3.66, windSpeed=5.46, precipType='rain', humidity=0.73, summary='Partly Cloudy', date2='2013-01-01', temperatureMax=7.49, temperatureMin=3.31, holiday=1, Weekday/end=0, stdorToU='ToU', Acorn_grouped='Comfortable', weekDay_index=4.0, precipType_index=0.0, summary_index=2.0, stdorToU_index=1.0, Acorn_grouped_index=2.0, catrgory_precipType_index=SparseVector(1, {0: 1.0}), catrgory_weekDay_index=SparseVector(6, {4: 1.0}), catrgor

## For declaring feature column

In [29]:
columns = df_encoded.columns
inputcols1 = columns[4:]
inputcols1.append(columns[2])
inputcols1 = set(inputcols1) - set(inputCols)
inputcols1 = inputcols1 - set(outputCols)
inputcols1 = list(inputcols1)
inputcols1.remove("date2")
df_encoded = df_encoded.na.drop()
vecAssembler = VectorAssembler(inputCols=inputcols1, outputCol="features")
df_feature = vecAssembler.transform(df_encoded)
df_feature.take(1)
#len(inputcols1)

[Row(LCLid='MAC001858', date=datetime.date(2013, 1, 1), hour=0, energy(kWh/h)=0.968, month=1, weekDay='Tue', 1_diff_energy_t_0=0.761, 2_diff_energy_t_0=0.825, 1_diff_energy_t_1=0.57, 2_diff_energy_t_1=1.241, 1_diff_energy_t_2=0.606, 2_diff_energy_t_2=1.489, diff_energy_week_t_1=0.436, diff_energy_week_t_2=2.26, diff_energy_week_t_3=0.85, diff_energy_week_t_4=0.464, mean_1=0.6940833333333334, mean_2=0.6582500000000001, min_1=0.328, max_1=1.569, min_2=0.3, max_2=1.609, count=8760, visibility=13.28, windBearing=269, dewPoint=2.6, pressure=1008.19, apparentTemperature=3.66, windSpeed=5.46, precipType='rain', humidity=0.73, summary='Partly Cloudy', date2='2013-01-01', temperatureMax=7.49, temperatureMin=3.31, holiday=1, Weekday/end=0, stdorToU='ToU', Acorn_grouped='Comfortable', weekDay_index=4.0, precipType_index=0.0, summary_index=2.0, stdorToU_index=1.0, Acorn_grouped_index=2.0, catrgory_precipType_index=SparseVector(1, {0: 1.0}), catrgory_weekDay_index=SparseVector(6, {4: 1.0}), catrgor

In [30]:
# row1 = df_feature.agg({"date": "max"}).collect()[0]
#print(type(row1[0]))
df_feature.count()
#from datetime import datetime, timedelta
df_feature = df_feature.withColumnRenamed("energy(kWh/h)","label")

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 52714)
----------------------------------------


Traceback (most recent call last):
  File "/usr/lib/python3.5/socketserver.py", line 313, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 341, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 354, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.5/socketserver.py", line 681, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializers.py", line 714, in read_int
    raise EOFError
EOFError


Py4JError: An error occurred while calling o24074.count

## Train - Test Split

In [27]:
df_feature.printSchema()
# ****************** for 80-20 for each user *******************
# train_df = df_feature.where(col("date") <= ((df_feature.agg({"date": "max"}).collect()[0])[0]-timedelta(df.count()//(24*60))))
# test_df = df_feature.where(col("date") > ((df_feature.agg({"date": "max"}).collect()[0])[0]-timedelta(df.count()//(24*60))))
# ***************8 month train 4 month test
train_df = df_feature.where(df_feature["date"] <= date(2013,8,31))
test_df = df_feature.exceptAll(train_df)
#test_df = df_feature.where(df_feature["date"] > date(2013,10,31))

root
 |-- LCLid: string (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- label: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- weekDay: string (nullable = true)
 |-- 1_diff_energy_t_0: double (nullable = true)
 |-- 2_diff_energy_t_0: double (nullable = true)
 |-- 1_diff_energy_t_1: double (nullable = true)
 |-- 2_diff_energy_t_1: double (nullable = true)
 |-- 1_diff_energy_t_2: double (nullable = true)
 |-- 2_diff_energy_t_2: double (nullable = true)
 |-- diff_energy_week_t_1: double (nullable = true)
 |-- diff_energy_week_t_2: double (nullable = true)
 |-- diff_energy_week_t_3: double (nullable = true)
 |-- diff_energy_week_t_4: double (nullable = true)
 |-- mean_1: double (nullable = true)
 |-- mean_2: double (nullable = true)
 |-- min_1: double (nullable = true)
 |-- max_1: double (nullable = true)
 |-- min_2: double (nullable = true)
 |-- max_2: double (nullable = true)
 |-- count: long (nullable = false)
 |-- visibi

In [28]:
test_df.take(1)

Py4JJavaError: An error occurred while calling o24014.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 71 in stage 19605.0 failed 4 times, most recent failure: Lost task 71.3 in stage 19605.0 (TID 703579, 172.16.26.223, executor 2): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (struct<humidity:double,catrgory_stdorToU_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,windSpeed:double,diff_energy_week_t_1:double,pressure:double,diff_energy_week_t_4:double,mean_1:double,1_diff_energy_t_2:double,month_double_VectorAssembler_387fe7117cc0:double,max_1:double,Weekday/end_double_VectorAssembler_387fe7117cc0:double,catrgory_summary_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,hour_double_VectorAssembler_387fe7117cc0:double,mean_2:double,min_1:double,holiday_double_VectorAssembler_387fe7117cc0:double,min_2:double,count_double_VectorAssembler_387fe7117cc0:double,diff_energy_week_t_2:double,temperatureMin:double,diff_energy_week_t_3:double,visibility:double,2_diff_energy_t_2:double,1_diff_energy_t_1:double,catrgory_weekDay_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,windBearing_double_VectorAssembler_387fe7117cc0:double,dewPoint:double,2_diff_energy_t_0:double,1_diff_energy_t_0:double,catrgory_Acorn_grouped_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,max_2:double,apparentTemperature:double,temperatureMax:double,2_diff_energy_t_1:double,catrgory_precipType_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:619)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "keep". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:287)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:255)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:255)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:144)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:143)
	... 21 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3255)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3255)
	at sun.reflect.GeneratedMethodAccessor106.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (struct<humidity:double,catrgory_stdorToU_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,windSpeed:double,diff_energy_week_t_1:double,pressure:double,diff_energy_week_t_4:double,mean_1:double,1_diff_energy_t_2:double,month_double_VectorAssembler_387fe7117cc0:double,max_1:double,Weekday/end_double_VectorAssembler_387fe7117cc0:double,catrgory_summary_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,hour_double_VectorAssembler_387fe7117cc0:double,mean_2:double,min_1:double,holiday_double_VectorAssembler_387fe7117cc0:double,min_2:double,count_double_VectorAssembler_387fe7117cc0:double,diff_energy_week_t_2:double,temperatureMin:double,diff_energy_week_t_3:double,visibility:double,2_diff_energy_t_2:double,1_diff_energy_t_1:double,catrgory_weekDay_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,windBearing_double_VectorAssembler_387fe7117cc0:double,dewPoint:double,2_diff_energy_t_0:double,1_diff_energy_t_0:double,catrgory_Acorn_grouped_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,max_2:double,apparentTemperature:double,temperatureMax:double,2_diff_energy_t_1:double,catrgory_precipType_index:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11$$anon$1.hasNext(WholeStageCodegenExec.scala:619)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "keep". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:287)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:255)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:255)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:144)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$4.apply(VectorAssembler.scala:143)
	... 21 more


In [59]:
rf = RandomForestRegressor(maxDepth=10, numTrees=10, maxBins=128)
rfmodel = rf.fit(train_df)
pred_val = rfmodel.transform(test_df)
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName="rmse")
accuracy = evaluator.evaluate(pred_val)
# pred_val.show()
print('RMSE = : %.4f'%(accuracy))

Py4JJavaError: An error occurred while calling o3673.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 1403.0 failed 1 times, most recent failure: Lost task 5.0 in stage 1403.0 (TID 125648, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:541)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:537)
	at scala.Array$.tabulate(Array.scala:331)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:537)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:534)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:563)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:130)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:541)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:537)
	at scala.Array$.tabulate(Array.scala:331)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:537)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:534)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50202)


Traceback (most recent call last):
  File "/usr/lib/python3.5/socketserver.py", line 313, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 341, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.5/socketserver.py", line 354, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.5/socketserver.py", line 681, in __init__
    self.handle()
  File "/usr/lib/Spark/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/usr/lib/Spark/python/pyspark/serializers.py", line 685, in read_int
    raise EOFError
EOFError


----------------------------------------
