In [1]:
# first let's pre-process the data
import pandas as pd

data = pd.read_csv('weatherAUS.csv')
data['Date'] = pd.to_datetime(data["Date"])
data = data.dropna()
data = data.sort_values(by='Date')
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
45587,2007-11-01,Canberra,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,...,68.0,29.0,1019.7,1015.0,7.0,7.0,14.4,23.6,No,Yes
45588,2007-11-02,Canberra,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,...,80.0,36.0,1012.4,1008.4,5.0,3.0,17.5,25.7,Yes,Yes
45589,2007-11-03,Canberra,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,...,82.0,69.0,1009.5,1007.2,8.0,7.0,15.4,20.2,Yes,Yes
45590,2007-11-04,Canberra,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,...,62.0,56.0,1005.5,1007.0,2.0,7.0,13.5,14.1,Yes,Yes
45591,2007-11-05,Canberra,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,...,68.0,49.0,1018.3,1018.5,7.0,7.0,11.1,15.4,Yes,No


In [2]:
from pandas import Timestamp
import re
from tqdm import tqdm

# create the data frame
df = pd.DataFrame(data)
df = df.replace(to_replace=['Yes', 'No'], value=[1,0])

# create the almanac column - for each date, query all the same days, but in different years.
# Then, see if it rained that day (or the next day) and sum up the values.
def build_regex(date: Timestamp):
    day_wo_year = str(date.date()).split('-')[1:]
    month = day_wo_year[0]
    day = day_wo_year[1]
    pattern = r'[\d]{4}[-]{1}'+ f'{month}' + r'[-]{1}' + f'{day}'
    regex = re.compile(pattern)
    return regex

def sum_previous_days_rain(row) -> int:
    # filter the rows based on same day of year and location
    date = row['Date']
    location = row['Location']
    # build regex
    regex = build_regex(date)
    # filter by location
    ldf = df[df['Location'] == location]
    # filter same day, different year
    options = list(filter(regex.match, list(map(lambda d: str(d), ldf['Date'].values))))
    tdf = ldf[ldf['Date'].isin(options)]
    summation: int = tdf['RainToday'].sum()
    return summation

num_of_rows = df.shape[0]
def inner_function(row, pbar: tqdm):
    sum_to_return: int = sum_previous_days_rain(row)
    pbar.update()
    return sum_to_return

# run a moderately long process
with tqdm(total=num_of_rows) as pbar:
    df['almanac'] = df.apply(lambda row: inner_function(row, pbar), axis=1)
# write to file
# file = 'dataframe.txt'
# with open(file, 'w') as f:
#     print(f.write(df.to_markdown()))

100%|██████████| 56420/56420 [06:49<00:00, 137.74it/s]


In [3]:
from sklearn.model_selection import train_test_split
# more formatting
df = pd.get_dummies(df,columns=['Location','WindGustDir','WindDir9am','WindDir3pm'])
# remove columns Date & RainTomorrow. Date might not be relevant. RainTomorrow is the Y variable.
columns = df.columns.values.tolist()
columns.remove('Date')
columns.remove('RainTomorrow')
# df.dropna()
# assign X & Y's
X = df[columns]
Y = df['RainTomorrow']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, train_size=0.8, random_state=42)
# check uniform shape of data
print(f'X_train: {X_train.shape}, Y_train: {Y_train.shape}')
print(f'X_test: {X_test.shape}, Y_test: {Y_test.shape}')

X_train: (45136, 92), Y_train: (45136,)
X_test: (11284, 92), Y_test: (11284,)


In [8]:
# feed the classifier neural network
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

mlpc = MLPClassifier(hidden_layer_sizes=(20))
chistory = mlpc.fit(X_train, Y_train)
yc_prediction = mlpc.predict(X_test)
mlpc_accuracy = accuracy_score(Y_test, yc_prediction)
print(f'MLPC Accuracy: {mlpc_accuracy}')
# print(f'MLPC Weights: {mlpc.coefs_}')

MLPC Accuracy: 0.8494328252392769


In [9]:
# feed the regressor neural network
from sklearn.neural_network import MLPRegressor
import numpy as np

mlpr = MLPRegressor(hidden_layer_sizes=(20), solver='adam', activation='logistic')
rhistory = mlpr.fit(X_train, Y_train)
yr_prediction: np.ndarray = mlpr.predict(X_test)
yrm_prediction = np.array(list(map(lambda y: 1 if y >= 0.5 else 0, yr_prediction)))
mlpr_accuracy = accuracy_score(Y_test, yrm_prediction)
print(f'MLPR Accuracy: {mlpr_accuracy}')
# print(f'MLPR Weights: {mlpr.coefs_}')

MLPR Accuracy: 0.8581176887628501


In [11]:
# mlp with keras
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense

mlpk = Sequential()
mlpk.add(Dense(20, activation='relu'))
mlpk.add(Dense(1, activation='sigmoid'))

# compile and train model
mlpk.compile(loss=tf.keras.losses.binary_crossentropy, optimizer='sgd', metrics=['accuracy'])
mlpk.fit(X_train, Y_train, epochs=10)
# evaluate the model
mlpk_loss, mlpk_accuracy = mlpk.evaluate(X_test, Y_test)
print(f'MLPK accuracy: {mlpk_accuracy}')
# print(f'MLPK weights: {mlpk.weights}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MLPK accuracy: 0.7738390564918518
