In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from pathlib import Path




In [23]:
# rain data path
file_path = '../../Resources/raindata.csv'

# Read CSV and convert to rain_df
rain_df=pd.read_csv(file_path)

# Drop NA value from dataframe
rain_df.dropna(subset=['Year', 'Month', 'Day','stationCode','stationName','rainfall','relativeHumidity','airTemperature_avg','wind_avg_speed'], inplace=True)

# Convert date into number
# rain_df['Date'] = (pd.to_datetime(rain_df[['Year', 'Month', 'Day']], errors='coerce') - pd.to_datetime("1970-01-01")).dt.days

rain_df

Unnamed: 0,Year,Month,Day,stationCode,stationName,rainfall,relativeHumidity,airTemperature_avg,wind_avg_speed
0,2023.0,1.0,1.0,AN001,Allanooka,0.0,66.1,21.0,17.93
1,2023.0,1.0,2.0,AN001,Allanooka,0.0,52.2,25.2,16.49
2,2023.0,1.0,3.0,AN001,Allanooka,0.0,19.1,32.2,24.77
3,2023.0,1.0,4.0,AN001,Allanooka,0.0,22.9,32.1,21.67
4,2023.0,1.0,5.0,AN001,Allanooka,0.0,17.8,33.3,18.12
...,...,...,...,...,...,...,...,...,...
47384,2023.0,8.0,9.0,YU002,Yuna NE,0.2,74.2,14.2,5.72
47385,2023.0,8.0,10.0,YU002,Yuna NE,0.0,72.1,14.8,5.71
47386,2023.0,8.0,11.0,YU002,Yuna NE,0.0,72.8,16.8,6.62
47387,2023.0,8.0,12.0,YU002,Yuna NE,0.0,75.7,15.9,7.51


In [4]:
# Split our preprocessed data into our features and target arrays
y = rain_df["rainfall"].values

# X = rain_df.drop(["rainfall","stationName"],axis=1).values
X = rain_df.drop(["rainfall","stationName"],axis=1)

X

Unnamed: 0,Year,Month,Day,stationCode,relativeHumidity,airTemperature_avg,wind_avg_speed
0,2023.0,1.0,1.0,AN001,66.1,21.0,17.93
1,2023.0,1.0,2.0,AN001,52.2,25.2,16.49
2,2023.0,1.0,3.0,AN001,19.1,32.2,24.77
3,2023.0,1.0,4.0,AN001,22.9,32.1,21.67
4,2023.0,1.0,5.0,AN001,17.8,33.3,18.12
...,...,...,...,...,...,...,...
47384,2023.0,8.0,9.0,YU002,74.2,14.2,5.72
47385,2023.0,8.0,10.0,YU002,72.1,14.8,5.71
47386,2023.0,8.0,11.0,YU002,72.8,16.8,6.62
47387,2023.0,8.0,12.0,YU002,75.7,15.9,7.51


In [5]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)
X

Unnamed: 0,Year,Month,Day,relativeHumidity,airTemperature_avg,wind_avg_speed,stationCode_AM001,stationCode_AN001,stationCode_BA,stationCode_BB001,...,stationCode_WN,stationCode_WO001,stationCode_WR,stationCode_WR001,stationCode_WS001,stationCode_WT001,stationCode_YE001,stationCode_YS,stationCode_YU001,stationCode_YU002
0,2023.0,1.0,1.0,66.1,21.0,17.93,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2023.0,1.0,2.0,52.2,25.2,16.49,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2023.0,1.0,3.0,19.1,32.2,24.77,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2023.0,1.0,4.0,22.9,32.1,21.67,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2023.0,1.0,5.0,17.8,33.3,18.12,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47384,2023.0,8.0,9.0,74.2,14.2,5.72,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
47385,2023.0,8.0,10.0,72.1,14.8,5.71,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
47386,2023.0,8.0,11.0,72.8,16.8,6.62,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
47387,2023.0,8.0,12.0,75.7,15.9,7.51,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [7]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])
    
    # Allow kerastuner to decide number of neurons in the first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=2), activation=activation, input_dim=len(X_train_scaled[0])))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 20)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
    
    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [9]:
# Initialize the Keras Tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)





In [11]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=50,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 01m 55s]
val_accuracy: 0.012212704867124557

Best val_accuracy So Far: 0.6359935402870178
Total elapsed time: 00h 48m 25s


In [12]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
# Build the model with the best hyperparameters and train it on the data
model = tuner.hypermodel.build(best_hyper)
# Display the summary of the best model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 21)                4704      
                                                                 
 dense_11 (Dense)            (None, 13)                286       
                                                                 
 dense_12 (Dense)            (None, 7)                 98        
                                                                 
 dense_13 (Dense)            (None, 13)                104       
                                                                 
 dense_14 (Dense)            (None, 23)                322       
                                                                 
 dense_15 (Dense)            (None, 11)                264       
                                                                 
 dense_16 (Dense)            (None, 17)               

In [13]:
model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x2666af8a890>

In [14]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)

# Print the evaluation results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

369/369 - 1s - loss: nan - accuracy: 0.6360 - 1s/epoch - 4ms/step
Loss: nan, Accuracy: 0.6359935402870178


In [28]:
# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Create a new dataframe with forecast values and align it with the test set indices
forecast_df = pd.DataFrame({'rainforecast': y_pred.flatten()}, index=X_test.index)

# Merge the forecast_df with the original rain_df dataframe based on the 'Year', 'Month', 'Day', 'relativeHumidity', 'airTemperature_avg', 'wind_avg_speed' columns
rain_df = pd.merge(rain_df, forecast_df, how='left', on=['Year', 'Month', 'Day', 'relativeHumidity', 'airTemperature_avg', 'wind_avg_speed'])

# Display the updated dataframe with the forecast column
rain_df



KeyError: 'Year'

In [27]:
X_test.columns

Index(['Year', 'Month', 'Day', 'relativeHumidity', 'airTemperature_avg',
       'wind_avg_speed', 'stationCode_AM001', 'stationCode_AN001',
       'stationCode_BA', 'stationCode_BB001',
       ...
       'stationCode_WN', 'stationCode_WO001', 'stationCode_WR',
       'stationCode_WR001', 'stationCode_WS001', 'stationCode_WT001',
       'stationCode_YE001', 'stationCode_YS', 'stationCode_YU001',
       'stationCode_YU002'],
      dtype='object', length=223)