<a href="https://colab.research.google.com/github/marcellinusc/solar-radiation/blob/data-prep/ml_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@title Run on TensorFlow 2.x

%tensorflow_version 2.x
from __future__ import absolute_import, division
from __future__ import print_function, unicode_literals

In [0]:
#@title Import relevant modules

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers
from matplotlib import pyplot as plt

# Keep DataFrame previews short: show at most 10 rows and render floats with
# a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

print('Imported')

In [0]:
#@title Load raw dataset
# Read the raw CSV directly from the project's GitHub repository
# (requires network access), then report how many rows were loaded.
data = pd.read_csv(
    'https://raw.githubusercontent.com/marcellinusc/solar-radiation/data-prep/datasets.csv'
)
print('Dataset length:', str(data.shape[0]))

In [0]:
#@title Exclude raw data between sunset-sunrise

# Flag the rows recorded strictly between sunrise and sunset.  The comparison
# is done column-wise in one pass instead of looping over every row.
# NOTE(review): this relies on 'Time', 'Sunrise' and 'Sunset' being directly
# comparable -- presumably zero-padded 'HH:MM:SS' strings, for which
# lexicographic order equals chronological order.  Confirm against the CSV.
is_daylight = (data['Time'] > data['Sunrise']) & (data['Time'] < data['Sunset'])
data['Daylight'] = is_daylight.astype('float')

# Drop the rows whose daylight flag is 0 (sunset to sunrise).  .copy() makes
# the result an independent frame, so adding columns to `data` afterwards
# does not raise SettingWithCopyWarning.
data = data[data['Daylight'] != 0].copy()

print('Dataset length:', str(len(data)))

In [0]:
#@title Convert UNIX time format to UTC

# Parse the clock time ('Time' column) once and keep it as a datetime column.
clock_time = pd.to_datetime(data["Time"], format="%H:%M:%S")
data["TimeConversion"] = clock_time

# Month of the year and day of the month, taken from the UNIX timestamp
# (seconds since the epoch).
# NOTE(review): these are derived from the UTC timestamp, while Hour/Minute
# below come from the 'Time' column -- if 'Time' is local time the two can
# disagree around midnight.  Confirm which clock 'Time' uses.
unix_stamp = pd.to_datetime(data["UNIXTime"].astype(int), unit="s")
data["Month"] = unix_stamp.dt.month
data["Day"] = unix_stamp.dt.day

# Hour and minute of the day, read from the already-parsed clock time.
data["Hour"] = clock_time.dt.hour
data["Minute"] = clock_time.dt.minute

print('Converted')

In [0]:
#@title Divide clean dataset into training set and test set

# Sort the dataset in descending order, from the latest to the earliest row,
# so the held-out test set below is the most recent slice of the data.
data['Data'] = pd.to_datetime(data['Data'])
data = data.sort_values(by=['Data', 'Time'], ascending=False)

# Fraction of the dataset to be used as the test set.
test_split = 0.2
n_test = round(len(data) * test_split)

# Positional split: the first n_test (latest) rows are the test set, the
# remainder is the training set.
data_test = data.iloc[:n_test].copy()
data_train = data.iloc[n_test:]

# Shuffle the training set.  A fixed seed keeps the row order identical
# across "Restart & Run All"; sample() keeps the original index labels and,
# unlike reindex(), also works if the index contains duplicate labels.
SHUFFLE_SEED = 42
data_train = data_train.sample(frac=1, random_state=SHUFFLE_SEED)

print('Training set length:', str(len(data_train)),
      '\nTest set length:', str(len(data_test)))

In [0]:
#@title Normalization

# Columns to normalize: every numeric column of the training set.  Selecting
# by 'number' rather than by the exact names 'float64'/'int64' also keeps
# integer columns that are stored with a narrower width (the .dt accessors
# may return such columns depending on the pandas version).
numeric_cols = data_train.select_dtypes(include='number').columns

# Z-score statistics are estimated on the training set only.
data_train_mean = data_train[numeric_cols].mean()
data_train_std = data_train[numeric_cols].std()

# A constant column (e.g. 'Daylight', which is 1.0 for every remaining row)
# has a standard deviation of 0; divide by 1 there so it maps to 0 instead
# of NaN.  data_train_std itself is left untouched.
zscore_scale = data_train_std.replace(0, 1)

data_train_norm = (data_train[numeric_cols] - data_train_mean) / zscore_scale

# Normalize the test set with the *training* statistics: using the test
# set's own mean/std would leak test information and put the two sets on
# different scales.  The data_test_* names are kept for any later cell that
# reads them and now refer to the statistics that were actually applied.
data_test_mean = data_train_mean
data_test_std = data_train_std
data_test_norm = (data_test[numeric_cols] - data_train_mean) / zscore_scale

print("Normalized")