In [1]:
# Import findspark and initialise. 
import findspark
findspark.init()

In [2]:
# Import packages
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
# Specify the path to local CSV file
local_csv_path = "Resources/raindata.csv"

In [4]:
# Read the CSV file into a DataFrame
df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(local_csv_path)

In [5]:
# Show the DataFrame
df.show()

+----+-----+---+-----------+-----------+--------+----------------+------------------+--------------+
|Year|Month|Day|stationCode|stationName|rainfall|relativeHumidity|airTemperature_avg|wind_avg_speed|
+----+-----+---+-----------+-----------+--------+----------------+------------------+--------------+
|2023|    1|  1|      AN001|  Allanooka|       0|            66.1|                21|         17.93|
|2023|    1|  2|      AN001|  Allanooka|       0|            52.2|              25.2|         16.49|
|2023|    1|  3|      AN001|  Allanooka|       0|            19.1|              32.2|         24.77|
|2023|    1|  4|      AN001|  Allanooka|       0|            22.9|              32.1|         21.67|
|2023|    1|  5|      AN001|  Allanooka|       0|            17.8|              33.3|         18.12|
|2023|    1|  6|      AN001|  Allanooka|       0|            42.5|                27|         19.73|
|2023|    1|  7|      AN001|  Allanooka|       0|            66.5|              19.7|      

In [6]:
# Count the number of rows in the DataFrame
df.count()

47389

In [7]:
# Drop null values in the original DataFrame
rain_df = df.dropna()

In [8]:
# Count the number of rows in the DataFrame after drop null values
rain_df.count()

47337

In [9]:
import pandas as pd

# Convert PySpark DataFrame to a Pandas DataFrame
rain_df = rain_df.toPandas()

# Display pandas DataFrame
rain_df.head()

Unnamed: 0,Year,Month,Day,stationCode,stationName,rainfall,relativeHumidity,airTemperature_avg,wind_avg_speed
0,2023,1,1,AN001,Allanooka,0,66.1,21.0,17.93
1,2023,1,2,AN001,Allanooka,0,52.2,25.2,16.49
2,2023,1,3,AN001,Allanooka,0,19.1,32.2,24.77
3,2023,1,4,AN001,Allanooka,0,22.9,32.1,21.67
4,2023,1,5,AN001,Allanooka,0,17.8,33.3,18.12


# Machine Learning

In [10]:
rain_df

Unnamed: 0,Year,Month,Day,stationCode,stationName,rainfall,relativeHumidity,airTemperature_avg,wind_avg_speed
0,2023,1,1,AN001,Allanooka,0,66.1,21,17.93
1,2023,1,2,AN001,Allanooka,0,52.2,25.2,16.49
2,2023,1,3,AN001,Allanooka,0,19.1,32.2,24.77
3,2023,1,4,AN001,Allanooka,0,22.9,32.1,21.67
4,2023,1,5,AN001,Allanooka,0,17.8,33.3,18.12
...,...,...,...,...,...,...,...,...,...
47332,2023,8,9,YU002,Yuna NE,0.2,74.2,14.2,5.72
47333,2023,8,10,YU002,Yuna NE,0,72.1,14.8,5.71
47334,2023,8,11,YU002,Yuna NE,0,72.8,16.8,6.62
47335,2023,8,12,YU002,Yuna NE,0,75.7,15.9,7.51


In [28]:
!pip install keras-tuner



In [11]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import sklearn as skl
import pandas as pd
import numpy as np
import tensorflow as tf
import keras_tuner as kt
from pathlib import Path




In [12]:
# Drop NA value from dataframe
rain_df.dropna(subset=['Year', 'Month', 'Day','stationCode','stationName','rainfall','relativeHumidity','airTemperature_avg','wind_avg_speed'], inplace=True)

rain_df

Unnamed: 0,Year,Month,Day,stationCode,stationName,rainfall,relativeHumidity,airTemperature_avg,wind_avg_speed
0,2023,1,1,AN001,Allanooka,0,66.1,21,17.93
1,2023,1,2,AN001,Allanooka,0,52.2,25.2,16.49
2,2023,1,3,AN001,Allanooka,0,19.1,32.2,24.77
3,2023,1,4,AN001,Allanooka,0,22.9,32.1,21.67
4,2023,1,5,AN001,Allanooka,0,17.8,33.3,18.12
...,...,...,...,...,...,...,...,...,...
47332,2023,8,9,YU002,Yuna NE,0.2,74.2,14.2,5.72
47333,2023,8,10,YU002,Yuna NE,0,72.1,14.8,5.71
47334,2023,8,11,YU002,Yuna NE,0,72.8,16.8,6.62
47335,2023,8,12,YU002,Yuna NE,0,75.7,15.9,7.51


In [13]:
# Handle missing or non-numeric values in rainfall column
rain_df["rainfall"] = pd.to_numeric(rain_df["rainfall"], errors='coerce')
rain_df["rainfall"].fillna(rain_df["rainfall"].mean(), inplace=True)

In [14]:
# Split our preprocessed data into our features and target arrays
y = rain_df["rainfall"].values

# X = rain_df.drop(["rainfall","stationName"],axis=1).values
X = rain_df.drop(["rainfall","stationName"],axis=1)

X

Unnamed: 0,Year,Month,Day,stationCode,relativeHumidity,airTemperature_avg,wind_avg_speed
0,2023,1,1,AN001,66.1,21,17.93
1,2023,1,2,AN001,52.2,25.2,16.49
2,2023,1,3,AN001,19.1,32.2,24.77
3,2023,1,4,AN001,22.9,32.1,21.67
4,2023,1,5,AN001,17.8,33.3,18.12
...,...,...,...,...,...,...,...
47332,2023,8,9,YU002,74.2,14.2,5.72
47333,2023,8,10,YU002,72.1,14.8,5.71
47334,2023,8,11,YU002,72.8,16.8,6.62
47335,2023,8,12,YU002,75.7,15.9,7.51


In [15]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)
X

Unnamed: 0,Year_2023,Year_N/A,Month_1,Month_10,Month_11,Month_12,Month_2,Month_3,Month_4,Month_5,...,wind_avg_speed_9.91,wind_avg_speed_9.92,wind_avg_speed_9.93,wind_avg_speed_9.94,wind_avg_speed_9.95,wind_avg_speed_9.96,wind_avg_speed_9.97,wind_avg_speed_9.98,wind_avg_speed_9.99,wind_avg_speed_N/A
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47332,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47333,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47334,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47335,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [17]:
# Create a StandardScaler instances
scaler =StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create a MinMaxScaler instance
scaler_y = MinMaxScaler()

# Reshape y_train and y_test if necessary (scaler expects 2D array)
y_train_reshaped = y_train.reshape(-1, 1)
y_test_reshaped = y_test.reshape(-1, 1)

# Fit the scaler on the training data and transform both training and testing data
y_train_scaled = scaler_y.fit_transform(y_train_reshaped)
y_test_scaled = scaler_y.transform(y_test_reshaped)

# If you reshaped y_train and y_test, you might want to flatten them back
y_train_scaled = y_train_scaled.flatten()
y_test_scaled = y_test_scaled.flatten()

In [22]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh'])
    
    # Allow kerastuner to decide number of neurons in the first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=2), activation=activation, input_dim=len(X_train_scaled[0])))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 20)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="relu"))
    
    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [23]:
# Initialize the Keras Tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)





In [24]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train_scaled,epochs=50,validation_data=(X_test_scaled,y_test_scaled))

Trial 60 Complete [00h 00m 47s]
val_accuracy: 0.6399661898612976

Best val_accuracy So Far: 0.6399661898612976
Total elapsed time: 00h 26m 08s


In [25]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
# Build the model with the best hyperparameters and train it on the data
model = tuner.hypermodel.build(best_hyper)
# Display the summary of the best model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 3)                 12519     
                                                                 
 dense_9 (Dense)             (None, 7)                 28        
                                                                 
 dense_10 (Dense)            (None, 1)                 8         
                                                                 
 dense_11 (Dense)            (None, 1)                 2         
                                                                 
 dense_12 (Dense)            (None, 1)                 2         
                                                                 
 dense_13 (Dense)            (None, 1)                 2         
                                                                 
Total params: 12561 (49.07 KB)
Trainable params: 12561

In [26]:
# Fit the model to the training data
model.fit(
    X_train_scaled,   # Features of the training set (scaled)
    y_train_scaled,    # Target values of the training set
    epochs=100        # Number of training epochs (iterations over the entire training dataset)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x18a5b339810>

In [27]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_scaled, verbose=2)

# Print the evaluation results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

370/370 - 0s - loss: 0.0931 - accuracy: 0.6400 - 446ms/epoch - 1ms/step
Loss: 0.093067467212677, Accuracy: 0.6399661898612976


In [28]:
# Make predictions on the test and train data
Rain_test_scaled_predict = model.predict(X_test_scaled)
Rain_train_scaled_predict = model.predict(X_train_scaled)

# Inverse transform the scaled predictions
Rain_test_predict = scaler_y.inverse_transform(Rain_test_scaled_predict)
Rain_train_predict = scaler_y.inverse_transform(Rain_train_scaled_predict)

# Combine the arrays
Rain_predictions = np.concatenate((Rain_train_predict, Rain_test_predict))
Rain_actual = np.concatenate((y_train, y_test))


# Create a copy of the DataFrame
rain_predictions_df = rain_df.copy()
rain_predictions_df=rain_predictions_df.drop(["rainfall"],axis=1)

# Add a column to the DataFrame that contains the customer_ratings information
rain_predictions_df['Actual_Rain']=Rain_actual
rain_predictions_df['Predicted_Rain'] = np.floor(Rain_predictions.flatten())


# Review the DataFrame
rain_predictions_df



Unnamed: 0,Year,Month,Day,stationCode,stationName,relativeHumidity,airTemperature_avg,wind_avg_speed,Actual_Rain,Predicted_Rain
0,2023,1,1,AN001,Allanooka,66.1,21,17.93,0.0,0.0
1,2023,1,2,AN001,Allanooka,52.2,25.2,16.49,0.0,0.0
2,2023,1,3,AN001,Allanooka,19.1,32.2,24.77,0.0,0.0
3,2023,1,4,AN001,Allanooka,22.9,32.1,21.67,0.6,0.0
4,2023,1,5,AN001,Allanooka,17.8,33.3,18.12,13.2,0.0
...,...,...,...,...,...,...,...,...,...,...
47332,2023,8,9,YU002,Yuna NE,74.2,14.2,5.72,0.2,0.0
47333,2023,8,10,YU002,Yuna NE,72.1,14.8,5.71,29.6,0.0
47334,2023,8,11,YU002,Yuna NE,72.8,16.8,6.62,7.8,0.0
47335,2023,8,12,YU002,Yuna NE,75.7,15.9,7.51,0.6,0.0


In [31]:
# Save the DataFrame to a CSV file, excluding the index column
rain_predictions_df.to_csv('Resources/raindata_with_perdiction.csv', index=False)