Uses the [Rain in Australia dataset](https://www.kaggle.com/jsphyg/weather-dataset-rattle-package).

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Location of the weatherAUS.csv file, served as a raw GitHub gist
# (the path embeds a specific revision hash).
url = 'https://gist.githubusercontent.com/DanielKerrigan/5c2528df65aba618c45ce01b54fc5826/raw/bb4b6f0c4934ec4d60c6ed4f751309be840c9f93/weatherAUS.csv'

In [None]:
# Download the CSV into a DataFrame, parsing the 'Date' column as datetimes so
# that year/month can be extracted from it later.
df = pd.read_csv(url, parse_dates=['Date'])

In [None]:
# (rows, columns) of the raw data, before rows with missing values are dropped.
df.shape

In [None]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(inplace=True)

In [None]:
# (rows, columns) after dropping incomplete rows; compare with the shape shown
# before the dropna call to see how many rows were lost.
df.shape

In [None]:
# Replace the raw timestamp with two calendar features: the numeric year and
# the abbreviated month name ('%b', e.g. 'Jan'; the spelling is locale-dependent).
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.strftime('%b')
df.drop(columns=['Date'], inplace=True)

# 'RainTomorrow' is the prediction target; store it as a 0/1 integer 'label'.
df.rename(columns={'RainTomorrow': 'label'}, inplace=True)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form is deprecated and does not update
# `df` when pandas Copy-on-Write is active. The astype call guarantees an
# integer column and fails loudly if a value other than 'No'/'Yes' shows up.
df['label'] = df['label'].map({'No': 0, 'Yes': 1}).astype('int64')

In [None]:
# Preview the first rows to check the new Year / Month / label columns.
df.head()

In [None]:
# Hold out 25% of the rows for testing. The fixed seed makes the split, and
# therefore the exported predictions, reproducible across notebook runs.
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=0
)
# Work on explicit copies so that columns added later (e.g. the predictions
# written to df_test) land on independent frames.
df_train = df_train_split.copy()
df_test = df_test_split.copy()

In [None]:
# Number of training rows; used later to cut the combined one-hot frame back
# into its train and test parts.
num_train = len(df_train)

In [None]:
# Stack the training rows on top of the test rows so that both parts are
# one-hot encoded together and end up with the same set of dummy columns.
df = pd.concat((df_train, df_test), axis=0)

In [None]:
# One-hot encode the categorical columns of the combined frame, dropping the
# first level of each so the dummies are not redundant.
df_one_hot = pd.get_dummies(
    df,
    columns=[
        'Location',
        'WindGustDir',
        'WindDir9am',
        'WindDir3pm',
        'Year',
        'Month',
        'RainToday',
    ],
    drop_first=True,
)
# The combined frame is train rows followed by test rows, so a positional cut
# at num_train recovers the two parts.
df_train_one_hot = df_one_hot.iloc[:num_train]
df_test_one_hot = df_one_hot.iloc[num_train:]

In [None]:
# Preview the training rows in their original (non-encoded) form.
df_train.head()

In [None]:
# Preview the same training rows after one-hot encoding.
df_train_one_hot.head()

In [None]:
# Feature matrix and target vector for training.
# Request float64 explicitly: with pandas >= 2.0 the dummy columns are bool, and
# a plain `.values` on the mixed float/bool frame would yield an `object` array.
X_train = df_train_one_hot.drop(columns=['label']).to_numpy(dtype=np.float64)
y_train = df_train_one_hot['label'].to_numpy()

In [None]:
# Feature matrix and target vector for the held-out test rows.
# Request float64 explicitly: with pandas >= 2.0 the dummy columns are bool, and
# a plain `.values` on the mixed float/bool frame would yield an `object` array.
X_test = df_test_one_hot.drop(columns=['label']).to_numpy(dtype=np.float64)
y_test = df_test_one_hot['label'].to_numpy()

In [None]:
# Hyper-parameter grid for the decision tree: 2 criteria x 2 splitters x
# 5 minimum-split sizes = 20 candidates.
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 4, 8, 16, 32],
}
# Seed the tree so that the search result is reproducible; without it the
# 'random' splitter (and tie-breaking in 'best') varies from run to run.
# Candidates are ranked by cross-validated weighted F1.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    parameters,
    scoring='f1_weighted',
)
clf.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(clf.best_score_, clf.best_params_, sep='\n')

In [None]:
# Training-set accuracy of the selected model.
# `predict` already returns hard class labels (0/1), so no 0.5 threshold is
# needed; the mean of the element-wise match is the fraction predicted correctly.
train_preds = clf.predict(X_train)
(train_preds == y_train).mean()

In [None]:
# Predict on the held-out test features with the grid-search winner.
# NOTE(review): despite the name, `probs` holds class labels, not
# probabilities; `predict_proba` would be the call that returns probabilities.
probs = clf.predict(X_test)

In [None]:
# Binarise the model output: values above 0.5 become 1, everything else 0.
predictions = (probs > 0.5).astype(int)

In [None]:
# Attach the predictions to the non-encoded test rows. This is a positional
# assignment: it relies on df_test being in the same row order as X_test, which
# holds because the one-hot frame was built from [df_train, df_test] in order.
df_test['prediction'] = predictions

In [None]:
# Turn the 0/1 codes into human-readable names for the exported file.
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: the chained in-place form is deprecated and does not update
# df_test when pandas Copy-on-Write is active.
for column in ('label', 'prediction'):
    df_test[column] = df_test[column].replace({ 0: 'no rain tmrw', 1: 'rain tmrw' })

In [None]:
# Column order for the exported CSV: date and location first, then the weather
# measurements, and finally the true label next to the model's prediction.
columns_ordered = [
    # when / where
    "Year",
    "Month",
    "Location",
    # temperature, rain, evaporation, sunshine
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    # wind
    "WindGustDir",
    "WindGustSpeed",
    "WindDir9am",
    "WindDir3pm",
    "WindSpeed9am",
    "WindSpeed3pm",
    # humidity, pressure, cloud cover
    "Humidity9am",
    "Humidity3pm",
    "Pressure9am",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    # 9am / 3pm temperature readings
    "Temp9am",
    "Temp3pm",
    # today's rain flag, ground truth, model output
    "RainToday",
    "label",
    "prediction",
]

In [None]:
# Write the test rows (features, label and prediction) to 'weather-aus.csv',
# with columns in the order given by columns_ordered and without the index.
df_test.to_csv('weather-aus.csv', index=False, columns=columns_ordered)