<a href="https://www.kaggle.com/code/lorresprz/ann-cnn-randomforest-predicting-nice-weather?scriptVersionId=145058387" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Given this dataset, the objective of this machine learning task is to decide whether a set of weather measurements leads to favorable conditions for an outdoor barbecue in a given city in Europe, using data from the 10-year period between 2000 and 2010. This is a binary classification task with two labels: True and False. The AI model looks at a set of features (measurements) and classifies whether that set corresponds to a True or False condition for an outdoor barbecue.

# Dataset overview

In [None]:
# Load the weather measurements and the BBQ labels from the Kaggle input folder.
weather_csv = '../input/weather-prediction/weather_prediction_dataset.csv'
bbq_labels_csv = '../input/weather-prediction/weather_prediction_bbq_labels.csv'
df = pd.read_csv(weather_csv)
df_bbq = pd.read_csv(bbq_labels_csv)

In [None]:
# Preview the first six rows of the weather measurements.
df.head(n=6)

In [None]:
# Preview the first six rows of the BBQ label table.
df_bbq.head(n=6)

In [None]:
# Number of columns in the label table, followed by their names.
df_bbq.shape[1], df_bbq.columns

In [None]:
# List every column name in the weather measurements table.
df.columns

There are 165 columns covering 18 cities in Europe. For each city, multiple measurements were recorded, including wind gust, wind speed, cloud cover, humidity, pressure, global radiation, precipitation, sunshine, minimum temperature, and maximum temperature. Furthermore, the set of measurements varies from city to city (for example, some cities might have wind speed in place of cloud cover, or wind gust in place of wind speed). For this notebook, we will focus on Dresden, a city located on the eastern side of Germany. The objective is to predict whether the weather is suitable for having a barbecue outdoors.

# Dresden weather

In [None]:
# Column names at positions 30 through 40.
# NOTE(review): presumably the block of DRESDEN_* measurement columns - confirm against the dataset.
df.columns[30:41]

In [None]:
# Keep the date plus the Dresden measurement columns.
dresden_columns = [
    'DATE', 'DRESDEN_cloud_cover', 'DRESDEN_wind_speed', 'DRESDEN_wind_gust',
    'DRESDEN_humidity', 'DRESDEN_global_radiation', 'DRESDEN_precipitation',
    'DRESDEN_sunshine', 'DRESDEN_temp_mean', 'DRESDEN_temp_min',
    'DRESDEN_temp_max',
]
# .copy() makes df_Dresden an independent DataFrame, so assigning new or
# converted columns to it later does not raise SettingWithCopyWarning.
df_Dresden = df[dresden_columns].copy()

In [None]:
# Attach Dresden's BBQ label as a new column (rows are matched by index).
dresden_bbq_labels = df_bbq['DRESDEN_BBQ_weather']
df_Dresden['BBQ'] = dresden_bbq_labels;

In [None]:
# Display the Dresden table, now including the BBQ label column.
df_Dresden

In [None]:
# Density of four Dresden measurements, split by BBQ label, one panel each.
first_kde_panels = [
    ('DRESDEN_cloud_cover', 'Cloud cover condition for BBQ'),
    ('DRESDEN_wind_speed', 'Wind speed condition for BBQ'),
    ('DRESDEN_sunshine', 'Sunshine condition for BBQ'),
    ('DRESDEN_temp_min', 'Minimum temperature condition for BBQ'),
]
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# axs.flat walks the 2x2 grid row by row, matching the panel order above.
for kde_ax, (feature, heading) in zip(axs.flat, first_kde_panels):
    sns.kdeplot(data=df_Dresden, x=feature, hue='BBQ', fill=True, ax=kde_ax)
    kde_ax.set_title(heading)

plt.show()

In [None]:
# Density of four more Dresden measurements, split by BBQ label.
second_kde_panels = [
    ('DRESDEN_humidity', 'Humidity condition for BBQ'),
    ('DRESDEN_wind_gust', 'Wind gust condition for BBQ'),
    ('DRESDEN_precipitation', 'Precipitation condition for BBQ'),
    ('DRESDEN_temp_max', 'Max temperature condition for BBQ'),
]
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# axs.flat walks the 2x2 grid row by row, matching the panel order above.
for kde_ax, (feature, heading) in zip(axs.flat, second_kde_panels):
    sns.kdeplot(data=df_Dresden, x=feature, hue='BBQ', fill=True, ax=kde_ax)
    kde_ax.set_title(heading)

plt.show()

In [None]:
# Share of days with (True) vs. without (False) nice barbecue weather.
bbq_counts = df_Dresden['BBQ'].value_counts()
plt.figure(figsize = (5,5))
explode = [0,0.05]
plt.pie(bbq_counts,
        explode=explode, autopct='%.1f%%');
# value_counts() orders classes by frequency, so take the legend labels from
# its index instead of hard-coding them; otherwise the wedges can be mislabeled.
plt.legend([str(label) for label in bbq_counts.index]);
plt.show()

# Data preparation for model training

In [None]:
# Encode the boolean BBQ label as an integer target: True -> 1, False -> 0.
bbq_encoding = {True: 1, False: 0}
df_Dresden['BBQ'] = df_Dresden['BBQ'].map(bbq_encoding);

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Conv1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [None]:
# Target is the encoded BBQ flag; features are all remaining columns except the date.
y = df_Dresden['BBQ']
X = df_Dresden.drop(columns=['DATE', 'BBQ'])
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=28
)
len(X_train), len(X_test)

# ANN model building

In [None]:
# A very simple ANN: two hidden ReLU layers followed by a single sigmoid
# output unit for the binary BBQ label.
n_features = X_train.shape[1]
model = Sequential([
        Dense(n_features, activation="relu"),
        # Integer division: the number of units must be an int, and "/" yields a float.
        Dense(n_features // 2, activation="relu"),
        Dense(1, activation = 'sigmoid'),
    ])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [None]:
# Train the ANN for 150 epochs with progress output disabled. The held-out
# split is passed as validation data so validation metrics are recorded each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=150,
    verbose=0,
)

In [None]:
# Per-epoch training history of the ANN: loss curves first, then accuracy curves.
model_loss_acc = pd.DataFrame(model.history.history)
model_loss_acc.plot(y=['loss', 'val_loss'])
model_loss_acc.plot(y=['accuracy', 'val_accuracy'])

In [None]:
# Round the ANN's sigmoid outputs to hard 0/1 labels and report per-class metrics.
ann_probabilities = model.predict(X_test)
predictions = np.round(ann_probabilities)
print(classification_report(y_test, predictions))

In [None]:
# Test-set accuracy of the ANN, kept for the final model comparison.
ann_acc = accuracy_score(y_true=y_test, y_pred=predictions)
ann_acc

In [None]:
# Confusion matrix of the ANN on the test split.
plt.figure(figsize = (6,6))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, predictions), annot = True, fmt = 'd')
plt.title('Confusion matrix for predicting barbecue weather using ANN');

# CNN model building

In [None]:
from tensorflow.keras.layers import Conv1D, Flatten, Reshape

In [None]:
# (rows, columns) of the training and test feature tables.
X_train.shape, X_test.shape

To use a CNN with Conv1D layers, we need to reshape the input into the form (batch_size, num_timesteps, num_features).

In [None]:
# 1D CNN: Conv1D consumes (batch_size, steps, channels), so every feature of
# a row becomes one step with a single channel. The previous reshape added a
# redundant leading axis of size 1, and the input_shape passed to Conv1D had
# no effect because Conv1D is not the first layer of the model.
model_2 = Sequential(
    [Reshape((X_train.shape[1], 1)),
     Conv1D(filters=10, kernel_size=2, activation='relu'),
     Flatten(),
     Dense(1, activation = 'sigmoid')
                     ])

In [None]:
# Same training setup as the ANN: binary cross-entropy loss, Adam, accuracy metric.
model_2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the CNN for 150 epochs with progress output disabled. The held-out
# split is passed as validation data so validation metrics are recorded each epoch.
model_2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=150,
    verbose=0,
)

In [None]:
# Per-epoch training history of the CNN: loss curves first, then accuracy curves.
model2_loss_acc = pd.DataFrame(model_2.history.history)
model2_loss_acc.plot(y=['loss', 'val_loss'])
model2_loss_acc.plot(y=['accuracy', 'val_accuracy'])

In [None]:
# Round the CNN's sigmoid outputs to hard 0/1 labels and report per-class metrics.
cnn_probabilities = model_2.predict(X_test)
pred_2 = np.round(cnn_probabilities)
print(classification_report(y_test, pred_2))

In [None]:
# Test-set accuracy of the CNN, kept for the final model comparison.
cnn_acc = accuracy_score(y_true=y_test, y_pred=pred_2)
cnn_acc

In [None]:
# Confusion matrix of the CNN on the test split.
plt.figure(figsize = (6,6))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, pred_2), annot=True, fmt='d')
plt.title('Confusion matrix for predicting barbecue weather with CNN');

# RandomForestClassifier

For code reference, see https://www.datacamp.com/tutorial/random-forests-classifier-python

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
# Random forest with default hyperparameters, fitted on the training split.
rf = RandomForestClassifier()
rf.fit(X=X_train, y=y_train)

In [None]:
# Test-set predictions and accuracy of the forest, kept for the final comparison.
pred_rf = rf.predict(X=X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=pred_rf)
rf_acc

In [None]:
# Export settings shared by every rendered tree; max_depth=2 limits how deep
# each tree is drawn.
tree_export_options = dict(
    feature_names=X_train.columns,
    filled=True,
    max_depth=2,
    impurity=False,
    proportion=True,
)

# Render the first three trees of the fitted forest.
for i in range(3):
    tree = rf.estimators_[i]
    dot_data = export_graphviz(tree, **tree_export_options)
    graph = graphviz.Source(dot_data)
    display(graph)

In [None]:
# Confusion matrix of the random forest on the test split.
plt.figure(figsize = (6,6))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, pred_rf), annot=True, fmt='d')
plt.title('Confusion matrix for predicting barbecue weather with RandomForestClassifier');

# ANN vs CNN vs RandomForestClassifier

In [None]:
# Collect the test accuracy of each model and compare them in a bar chart.
d = {
    'Methods': ['ANN', 'CNN', 'RandomForestClassifier'],
    'Accuracy': [ann_acc, cnn_acc, rf_acc],
}
dfc = pd.DataFrame(d)

plt.figure(figsize=(7, 7))
ax = sns.barplot(data=dfc, x='Methods', y='Accuracy', errwidth=0)
# Write each accuracy value above its bar.
ax.bar_label(ax.containers[0])
ax.set_title('Accuracy of various methods');