In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle

In [2]:
# Load the weather observations; the path is relative to the notebook's working directory.
df = pd.read_csv("weather.csv")

In [3]:
# Remove the columns that are not used as model inputs.
df = df.drop(columns=['station', 'year'])

# Fill missing values in two passes:
#   1. interior gaps get the mean of the nearest valid value before and after
#      (forward fill + backward fill, halved);
#   2. gaps touching the start or end of the frame are still NaN after pass 1,
#      because one of the two fills had nothing to copy from, so they take the
#      nearest valid value instead.
df = (
    df.ffill()
      .add(df.bfill())
      .div(2)
      .bfill()
      .ffill()
)

In [4]:
# Feature engineering.
# All derived columns are added in one assign(); each lambda receives the frame as
# built so far, so later entries can read columns created by earlier ones.
# shift(k) reads the value k rows earlier (k > 0) or later (k < 0) — this assumes
# rows are consecutive days in chronological order (TODO confirm).
df = df.assign(
    # 1 unless the day's total rainfall is below 0.2 mm, in which case 0.
    rained_today=lambda d: (~(d['daily_rainfall_total_mm'] < 0.2)).astype('int64'),
    # Count of rainy rows among the current row and the previous one / two rows
    # (NaN where there is no previous row to read).
    rained_in_two_days=lambda d: d['rained_today'] + d['rained_today'].shift(1),
    rained_in_three_days=lambda d: (
        d['rained_today'] + d['rained_today'].shift(1) + d['rained_today'].shift(2)
    ),
    # Look-ahead columns: the value one and two rows later (NaN for the last rows).
    rained_next_day=lambda d: d['rained_today'].shift(-1),
    rained_next_next_day=lambda d: d['rained_today'].shift(-2),
    next_day_maximum_temperature_c=lambda d: d['maximum_temperature_c'].shift(-1),
    next_next_day_maximum_temperature_c=lambda d: d['maximum_temperature_c'].shift(-2),
    next_day_minimum_temperature_c=lambda d: d['minimum_temperature_c'].shift(-1),
    next_next_day_minimum_temperature_c=lambda d: d['minimum_temperature_c'].shift(-2),
)

In [5]:
# Drop every row that still contains a NaN. The shift() calls in the feature-engineering
# cell leave NaN in the first rows (lagged columns) and the last rows (look-ahead columns).
df = df.dropna()

In [6]:
# List the columns available for modelling after feature engineering.
df.columns

Index(['month', 'day', 'daily_rainfall_total_mm', 'highest_30_min_rainfall_mm',
       'highest_60_min_rainfall_mm', 'highest_120_min_rainfall_mm',
       'mean_temperature_c', 'maximum_temperature_c', 'minimum_temperature_c',
       'mean_wind_speed_kmh', 'max_wind_speed_kmh', 'rained_today',
       'rained_in_two_days', 'rained_in_three_days', 'rained_next_day',
       'rained_next_next_day', 'next_day_maximum_temperature_c',
       'next_next_day_maximum_temperature_c', 'next_day_minimum_temperature_c',
       'next_next_day_minimum_temperature_c'],
      dtype='object')

# Forecast rain tomorrow

In [7]:
# Inputs for the next-day model: calendar position, the current row's rain and
# weather readings, and the two-day rain count.
features = [
    'month',
    'day',
    'rained_today',
    'highest_30_min_rainfall_mm',
    'highest_60_min_rainfall_mm',
    'highest_120_min_rainfall_mm',
    'mean_temperature_c',
    'maximum_temperature_c',
    'minimum_temperature_c',
    'mean_wind_speed_kmh',
    'max_wind_speed_kmh',
    'rained_in_two_days',
]
# Label: the rain flag of the following row.
target = 'rained_next_day'

X = df[features]
y = df[target].to_numpy()

In [8]:
# Train test split
# With no test_size/train_size given, train_test_split holds out 25% of the rows for
# testing; random_state=1 fixes the shuffle so the split is repeatable.
# NOTE(review): rows are shuffled before splitting, so neighbouring days end up on both
# sides of the split — confirm this is acceptable for a day-ahead forecast.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Fit a random forest on the training split. The fixed seed makes the fitted model,
# and therefore the metrics in the cells below, reproducible on Restart & Run All.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file is flushed and
# closed even if pickling raises (a bare open() left the handle dangling).
# NOTE(review): the path uses Windows separators; on other platforms it names a single
# file in the working directory rather than a file under flaskapp/models.
with open(r'.\flaskapp\models\model_next_day_rainfall.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [10]:
# Hard class predictions for the held-out rows.
y_pred = rf.predict(X_test)
# Per-class probabilities for the same rows; columns follow the order of rf.classes_.
y_prob = rf.predict_proba(X_test)

In [11]:
# Baseline accuracy (in %): the score obtained by always predicting whichever class
# is more frequent in y_test. Start from the share of non-zero (rain) labels and keep
# the larger of that share and its complement.
baseline_accuracy = np.count_nonzero(y_test) / len(y_test) * 100
baseline_accuracy = max(baseline_accuracy, 100 - baseline_accuracy)
baseline_accuracy

52.74725274725275

In [12]:
# Model certainty of prediction for the first test row: the largest class probability,
# which is the probability the forest assigns to the class it predicts.
y_prob[0].max()

0.52

In [13]:
# Per-class precision, recall and F1 on the held-out rows, printed as a plain-text table.
print(
    'classification report:',
    classification_report(y_test, y_pred, output_dict=False),
    sep='\n',
)

classification report:
              precision    recall  f1-score   support

         0.0       0.64      0.67      0.66       144
         1.0       0.61      0.58      0.60       129

    accuracy                           0.63       273
   macro avg       0.63      0.63      0.63       273
weighted avg       0.63      0.63      0.63       273



In [14]:
# Confusion matrix on the held-out rows: rows are the true classes, columns the
# predicted classes, both in sorted label order.
print(
    'confusion matrix:',
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

confusion matrix:
[[97 47]
 [54 75]]


# Forecast rain in 2 days

In [15]:
# Inputs for the two-days-ahead model: the same readings as the next-day model, but
# with the three-day rain count in place of the two-day one.
features = [
    'month',
    'day',
    'rained_today',
    'highest_30_min_rainfall_mm',
    'highest_60_min_rainfall_mm',
    'highest_120_min_rainfall_mm',
    'mean_temperature_c',
    'maximum_temperature_c',
    'minimum_temperature_c',
    'mean_wind_speed_kmh',
    'max_wind_speed_kmh',
    'rained_in_three_days',
]
# Label: the rain flag two rows ahead.
target = 'rained_next_next_day'

X = df[features]
y = df[target].to_numpy()

In [16]:
# Train test split
# With no test_size/train_size given, train_test_split holds out 25% of the rows for
# testing; random_state=1 fixes the shuffle so the split is repeatable.
# NOTE(review): rows are shuffled before splitting, so neighbouring days end up on both
# sides of the split — confirm this is acceptable for a two-day-ahead forecast.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
# Fit a random forest on the training split. The fixed seed makes the fitted model,
# and therefore the metrics in the cells below, reproducible on Restart & Run All.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file is flushed and
# closed even if pickling raises (a bare open() left the handle dangling).
# NOTE(review): the path uses Windows separators; on other platforms it names a single
# file in the working directory rather than a file under flaskapp/models.
with open(r'.\flaskapp\models\model_next_next_day_rainfall.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [18]:
# Hard class predictions for the held-out rows.
y_pred = rf.predict(X_test)
# Per-class probabilities for the same rows; columns follow the order of rf.classes_.
y_prob = rf.predict_proba(X_test)

In [19]:
# Baseline accuracy (in %): the score obtained by always predicting whichever class
# is more frequent in y_test. Start from the share of non-zero (rain) labels and keep
# the larger of that share and its complement.
baseline_accuracy = np.count_nonzero(y_test) / len(y_test) * 100
baseline_accuracy = max(baseline_accuracy, 100 - baseline_accuracy)
baseline_accuracy

50.91575091575092

In [20]:
# Predicted class for the first test row (1.0 = rain, 0.0 = no rain).
y_pred[0]

1.0

In [21]:
# Model certainty of prediction for the first test row: the largest class probability,
# which is the probability the forest assigns to the class it predicts. Taking the max
# (as the next-day section does) instead of hard-coding column 1 keeps this correct
# when the first prediction is class 0.
y_prob[0].max()

0.63

In [22]:
# Per-class precision, recall and F1 on the held-out rows, printed as a plain-text table.
print(
    'classification report:',
    classification_report(y_test, y_pred, output_dict=False),
    sep='\n',
)

classification report:
              precision    recall  f1-score   support

         0.0       0.60      0.71      0.65       139
         1.0       0.63      0.51      0.57       134

    accuracy                           0.62       273
   macro avg       0.62      0.61      0.61       273
weighted avg       0.62      0.62      0.61       273



In [23]:
# Confusion matrix on the held-out rows: rows are the true classes, columns the
# predicted classes, both in sorted label order.
print(
    'confusion matrix:',
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

confusion matrix:
[[99 40]
 [65 69]]
