# Project Big Data Neferu

# Imports

In [1]:
! pip install pyspark



In [2]:
import numpy as np
from pyspark.sql import functions
from pyspark.sql.functions import col
import pyspark
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# SparkContext(...) raises ValueError when a context is already running in this
# process (for example when this cell is executed a second time). In that case
# reuse the running context so that `sc` is always bound after this cell.
try:
    sc = SparkContext('local', 'Pyspark demo')
except ValueError:
    print('SparkContext already exists!')
    sc = SparkContext.getOrCreate()

# getOrCreate() hands back the active session when there is one, so an
# "already exists" handler is not needed here; any genuine start-up failure
# should surface immediately instead of leaving `spark` undefined.
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

# Data Preprocessing

In [4]:
# Load the raw CSV; the first line of the file supplies the column names.
df_ = spark.read.csv('/content/SolarPrediction.csv', header=True)

In [5]:
# Preview the first 20 rows of the raw frame, with long cell values truncated.
df_.show(n=20, truncate=True)

+----------+--------------------+--------+---------+-----------+--------+--------+----------------------+-----+-----------+----------+
|  UNIXTime|                Data|    Time|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|TimeSunRise|TimeSunSet|
+----------+--------------------+--------+---------+-----------+--------+--------+----------------------+-----+-----------+----------+
|1475229326|9/29/2016 12:00:0...|23:55:26|     1.21|         48|   30.46|      59|                177.39| 5.62|   06:13:00|  18:13:00|
|1475229023|9/29/2016 12:00:0...|23:50:23|     1.21|         48|   30.46|      58|                176.78| 3.37|   06:13:00|  18:13:00|
|1475228726|9/29/2016 12:00:0...|23:45:26|     1.23|         48|   30.46|      57|                158.75| 3.37|   06:13:00|  18:13:00|
|1475228421|9/29/2016 12:00:0...|23:40:21|     1.21|         48|   30.46|      60|                137.71| 3.37|   06:13:00|  18:13:00|
|1475228124|9/29/2016 12:00:0...|23:35:24|     1.17|   

In [6]:
# Names of the columns in the raw frame, in the order they are displayed.
columns = [
    'UNIXTime',
    'Data',
    'Time',
    'Radiation',
    'Temperature',
    'Pressure',
    'Humidity',
    'WindDirection(Degrees)',
    'Speed',
    'TimeSunRise',
    'TimeSunSet',
]

# Counter of columns found to be free of NULLs (incremented by the check loop).
cnt = 0

# Total number of rows, used as the reference for the per-column NULL check.
rows = df_.count()

In [7]:
# Report NULL values column by column.
#
# `cnt` counts the columns that contain no NULLs. It is reset here so that the
# cell gives the same answer every time it is executed.
cnt = 0

for column in columns:

    # Rows whose value in this column is present; the shortfall against the
    # total row count is the number of NULLs.
    notNull = df_.filter(col(str(column)).isNotNull()).count()
    missing = rows - notNull

    if missing:
        # Name the actual column that has the missing values.
        print('There are '+str(missing)+' Null values in the '+str(column)+' column')
    else:
        cnt += 1

# Only claim a clean data frame when every single column passed the check.
if cnt == len(columns):
    print('There are not NULL values in the data frame')

There are not NULL values in the data frame


In [8]:
# Data

# Break the 'Data' string on '/': item 0 becomes the month, item 1 the day and
# item 2 the remainder, which still carries the year together with the time.
split_col = functions.split(df_['Data'], '/')
for item_index, new_column in enumerate(('Month', 'Day', 'YearAux')):
    df_ = df_.withColumn(new_column, split_col.getItem(item_index))

In [9]:
# Time

# Break the 'Time' string on ':' into its hour, minute and second parts.
split_col = functions.split(df_['Time'], ':')
for item_index, new_column in enumerate(('Hour', 'Minute', 'Second')):
    df_ = df_.withColumn(new_column, split_col.getItem(item_index))

In [10]:
# Register the derived date/time columns. A name is appended only when it is
# not listed yet, so executing this cell again cannot add duplicates (which
# would otherwise be passed on to the column selection that uses `columns`).
for derived in ('Month', 'Day', 'Hour', 'Minute', 'Second'):
    if derived not in columns:
        columns.append(derived)

In [11]:
# Keep every listed column except the raw date/time strings, the sunrise and
# sunset columns, and the Unix timestamp.
dropped = {'Time', 'Data', 'YearAux', 'TimeSunRise', 'TimeSunSet', 'UNIXTime'}
kept = [name for name in columns if name not in dropped]
df = df_.select(kept)

In [12]:
# Preview the first 20 rows of the reduced frame, with long values truncated.
df.show(n=20, truncate=True)

+---------+-----------+--------+--------+----------------------+-----+-----+---+----+------+------+
|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|Month|Day|Hour|Minute|Second|
+---------+-----------+--------+--------+----------------------+-----+-----+---+----+------+------+
|     1.21|         48|   30.46|      59|                177.39| 5.62|    9| 29|  23|    55|    26|
|     1.21|         48|   30.46|      58|                176.78| 3.37|    9| 29|  23|    50|    23|
|     1.23|         48|   30.46|      57|                158.75| 3.37|    9| 29|  23|    45|    26|
|     1.21|         48|   30.46|      60|                137.71| 3.37|    9| 29|  23|    40|    21|
|     1.17|         48|   30.46|      62|                104.95| 5.62|    9| 29|  23|    35|    24|
|     1.21|         48|   30.46|      64|                 120.2| 5.62|    9| 29|  23|    30|    24|
|      1.2|         49|   30.46|      72|                112.45| 6.75|    9| 29|  23|    25|    19|


# Column type conversion

In [13]:
# Names of the columns of the reduced frame: the target followed by the
# measurement and date/time fields.
cols = [
    'Radiation',
    'Temperature',
    'Pressure',
    'Humidity',
    'WindDirection(Degrees)',
    'Speed',
    'Month',
    'Day',
    'Hour',
    'Minute',
    'Second',
]

In [14]:
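# Disabled earlier attempt: withColumn returns a new DataFrame, so without
# assigning the result back to df this loop would not change anything.
#for column in cols:
#    df.withColumn( column, col(column).cast('float'))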

In [15]:
def isfloat(x):
    """Tell whether ``x`` can be converted with ``float()``.

    Args:
        x: Any value, typically a string read from the data.

    Returns:
        True when ``float(x)`` succeeds, False when the conversion fails
        because of the value's type, content or magnitude.
    """
    try:
        float(x)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# The first row serves as the sample that decides which columns are numeric.
line1 = df.head(1)[0]

# Columns whose sample value does not parse as a float are passed through
# unchanged and placed first; the others are cast to float under the same name.
passthrough_cols = [name for name in df.columns if not isfloat(line1[name])]
float_cols = [df[name].cast("float").alias(name) for name in df.columns if isfloat(line1[name])]
df = df.select(passthrough_cols + float_cols)

# Train Test Splitting

In [16]:
# Random 70/30 split into training and test rows; the seed is fixed at 41.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=41)

# Creating the features assembler

In [17]:
# Every column except the 'Radiation' target goes into the feature vector.
feature_columns = [
    'Temperature',
    'Pressure',
    'Humidity',
    'WindDirection(Degrees)',
    'Speed',
    'Month',
    'Day',
    'Hour',
    'Minute',
    'Second',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')

# Add the assembled 'Features' column to both splits.
train_data, test_data = (assembler.transform(split) for split in (train, test))

In [18]:
# Reduce both splits to the two columns the model needs: features and label.
train_data = train_data.select('Features', 'Radiation')
test_data = test_data.select('Features', 'Radiation')

# ML Models

In [23]:
from pyspark.ml.regression import LinearRegression

# Linear regression of 'Radiation' on the assembled feature vector.
lr = LinearRegression(featuresCol='Features', labelCol='Radiation')
lrModel = lr.fit(train_data)

# Show the fitted parameters.
coefficients, intercept = lrModel.coefficients, lrModel.intercept
print("Coef: {} Intercept: {}".format(coefficients, intercept))

# Score the model on the test split; the metrics are read from this result.
test_results = lrModel.evaluate(test_data)

Coef: [44.53853043903141,-388.82784507407104,0.5412385785935172,-0.2578308560088952,4.725503617551229,59.7150350458814,4.847068178981946,-7.522335552875352,0.00947306448028783,-1.4027697073140861] Intercept: 9134.169617105672


In [28]:
#Evaluation LR

# Print the error metrics of the linear regression on the test split.
metric_lines = (
    ("MAE: {}", test_results.meanAbsoluteError),
    ("MSE: {}", test_results.meanSquaredError),
    ("R2: {}", test_results.r2),
)
for template, value in metric_lines:
    print(template.format(value))


MAE: 146.64277488396166
MSE: 37048.48396611931
R2: 0.6252107625484169


In [55]:
#from pyspark.ml.regression import RandomForestRegressor
#from pyspark.ml.evaluation import RegressionEvaluator
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import VectorIndexer


#rf = RandomForestRegressor(labelCol = 'Radiation', featuresCol = 'Features')
#rfModel = rf.fit(train_data)

#rfevaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = "Radiation", metricName = "rmse")

#rfpredictions = rfModel.transform(test_data)

#print( 'RMSE:' ,rfevaluator.evaluate(rfpredictions))
#featureIndexer = VectorIndexer(inputCol="Features", outputCol="indexedFeatures", maxCategories=4).fit(train_data)

##pipeline = Pipeline(stages = [featureIndexer, rf])

#model = pipeline.fit(train_data)
#predictions = model.transform(test_data)

#evaluator = RegressionEvaluator(labelCol = 'Radiation', predictionCol = 'prediction', metricName="rmse")

#rmse = evaluator.evaluate(predictions)
#print(rmse)






In [56]:
#Evaluation RF

#print("MAE: {}".format(test_results_rf.meanAbsoluteError))
#print("MSE: {}".format(test_results_rf.meanSquaredError))
#print("R2: {}".format(test_results_rf.r2))