### 1. Importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from numpy import unique
from numpy import reshape
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Dense, BatchNormalization, Flatten, MaxPooling1D, Dropout
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Project root that contains the '01 Data Sets' folder.
# Set the ML_PROJECT_PATH environment variable to run the notebook on another machine;
# the original local path is kept as the fallback so existing behavior is unchanged.
path = os.environ.get('ML_PROJECT_PATH', r'C:\Users\nodnarb\Documents\Machine Learning')

In [None]:
# Load the raw weather observations; index_col=False stops pandas from using the first column as the index.
weather_csv = os.path.join(path, '01 Data Sets', 'weather.csv')
df = pd.read_csv(weather_csv, index_col=False)

In [None]:
# Load the pleasant-weather table (used later as y); index_col=False stops pandas from using the first column as the index.
pleasant_csv = os.path.join(path, '01 Data Sets', 'pleasant_weather.csv')
pleasant = pd.read_csv(pleasant_csv, index_col=False)

In [None]:
pd.set_option('display.max_columns', None)
df.head()

In [None]:
pleasant.head()

In [None]:
df.shape

In [None]:
pleasant.shape

### 2. Data wrangling

In [None]:
df.shape

In [None]:
# Dropping DATE from pleasant
# Reassign instead of mutating in place, and tolerate DATE already being gone,
# so re-running this cell does not raise KeyError.

pleasant = pleasant.drop(columns='DATE', errors='ignore')

In [None]:
pleasant.shape

In [None]:
# Dropping 3 weather stations not included in pleasant data (GDANSK, ROMA, TOURS)

unmatched_station_columns = [
    # GDANSK
    'GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation', 'GDANSK_snow_depth',
    'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max',
    # ROMA
    'ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity', 'ROMA_pressure',
    'ROMA_sunshine', 'ROMA_temp_mean',
    # TOURS
    'TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure', 'TOURS_global_radiation',
    'TOURS_precipitation', 'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max',
]
df = df.drop(columns=unmatched_station_columns)

In [None]:
df.shape

In [None]:
# Checking for observations with missing data

In [None]:
# Extract the different observation types

# Column-name suffixes, one per kind of measurement found in the weather data
observation_types = [
    'cloud_cover',
    'wind_speed',
    'humidity',
    'pressure',
    'global_radiation',
    'precipitation',
    'snow_depth',
    'sunshine',
    'temp_mean',
    'temp_min',
    'temp_max',
]

In [None]:
# For every observation type, tally how many columns (one per station) end with that suffix
station_counts = {
    obs: sum(1 for col in df.columns if col.endswith(obs))
    for obs in observation_types
}

# Report the tally for each observation type
print("Number of stations covered by each observation type:")
for obs, count in station_counts.items():
    print(f"{obs}: {count} stations")

##### Wind speed and snow depth are not recorded for most stations, so these two observation types are dropped

In [None]:
# Dropping columns containing wind speed and snow depth

sparse_observations = ('wind_speed', 'snow_depth')
dropcols = [col for col in df.columns if any(tag in col for tag in sparse_observations)]

df = df.drop(columns=dropcols)

In [None]:
df.shape

In [None]:
# Creating data for 3 missing observations

In [None]:
# Collect the distinct station names: the text before the first underscore
# of every column whose name contains an underscore

all_stations = {col.split('_')[0] for col in df.columns if '_' in col}
all_stations

In [None]:
# Find stations missing observation types

observation_types = ['cloud_cover', 'humidity', 'pressure']

missing_stations_by_observation = {}

for obs in observation_types:
    # Stations that do have this observation: strip the "_<obs>" suffix from matching columns
    suffix = f'_{obs}'
    stations_with_obs = {col.replace(suffix, '') for col in df.columns if col.endswith(obs)}

    # Whatever is left of the full station set has no column for this observation
    missing_stations_by_observation[obs] = all_stations - stations_with_obs

# Print the missing station names for each observation type
for obs, missing_stations in missing_stations_by_observation.items():
    print(f"\nStations missing from {obs}:")
    if not missing_stations:
        print("None")
        continue
    print(*missing_stations, sep="\n")

##### Impute values for these 3 stations with values from nearby stations

In [None]:
# Finding cloud cover info from Ljubljana

df.columns.get_loc('LJUBLJANA_cloud_cover')

In [None]:
# Finding humidity info from Oslo

df.columns.get_loc('OSLO_humidity')

In [None]:
# Finding pressure info from Sonnblick

df.columns.get_loc('SONNBLICK_pressure')

In [None]:
# Inserting values for missing data
# Each entry: (column position, new column to create, donor column copied from a nearby station).
# NOTE(review): the positions are hard-coded and depend on the exact column layout of df
# at this point (presumably with DATE and MONTH already removed) — confirm before reordering cells.

imputed_columns = [
    (62, 'KASSEL_cloud_cover', 'LJUBLJANA_cloud_cover'),
    (98, 'STOCKHOLM_humidity', 'OSLO_humidity'),
    (108, 'MUNCHENB_pressure', 'SONNBLICK_pressure'),
]

for position, new_col, donor_col in imputed_columns:
    # df.insert raises ValueError if the column already exists; skipping makes the cell re-runnable.
    if new_col not in df.columns:
        df.insert(position, new_col, df[donor_col])

In [None]:
df.head()

In [None]:
df.shape

In [33]:
# Export cleaned dataset (row index is not written)

cleaned_csv = os.path.join(path, '01 Data Sets', 'weather_cleaned.csv')
df.to_csv(cleaned_csv, index=False)

### 3. Data reshaping

In [35]:
# Assigning X and y
# The only cell that removes DATE and MONTH from df sits at the very end of the notebook,
# so on a top-to-bottom run they would still be present here. The later reshape(-1, 15, 9)
# needs exactly 15 * 9 = 135 feature columns, so drop them defensively
# (errors='ignore' makes this a no-op when they are already gone).

X = df.drop(columns=['DATE', 'MONTH'], errors='ignore')
y = pleasant

In [67]:
# Convert both the features and the targets to NumPy arrays

X, y = np.array(X), np.array(y)

In [69]:
# Reshape X to fit dimensions: one (15, 9) grid per row of the flat table
n_slots, n_obs_per_slot = 15, 9

# reshape(-1, ...) only requires the TOTAL element count to divide evenly, so a table with
# the wrong number of columns can be silently re-cut into a different number of samples.
# Validate the width first (skipped when X is already 3-D, i.e. the cell is being re-run).
if X.ndim == 2 and X.shape[1] != n_slots * n_obs_per_slot:
    raise ValueError(
        f"Expected {n_slots * n_obs_per_slot} feature columns before reshaping, got {X.shape[1]}"
    )

X = X.reshape(-1, n_slots, n_obs_per_slot)

In [71]:
X.shape

(22950, 15, 9)

In [73]:
y.shape

(22950, 15)

In [75]:
# Splitting data into training and test sets
# random_state fixes the shuffle so the split is repeatable. No test_size is passed;
# the shapes printed after this cell show 5738 of 22950 rows (about 25%) held out.

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

In [77]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(17212, 15, 9) (17212, 15)
(5738, 15, 9) (5738, 15)


### 4. Keras Model (CNN) #1

In [223]:
# Hyperparameters for model #1.
epochs = 30
batch_size = 16
n_hidden = 20  # passed to Conv1D below as its number of filters

# Sizes read off the training arrays: axis 1 and axis 2 of X_train, and the width of y_train.
# NOTE(review): X was reshaped to (-1, 15, 9), so "timesteps" is the 15-slot axis
# (presumably stations rather than time) — confirm the naming.
timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

# Layer stack: Conv1D -> Dense(16) -> MaxPooling1D -> Flatten -> Dense(n_classes, softmax).
model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='softmax')) # Options: sigmoid, tanh, softmax, relu

In [224]:
# Configure training: categorical cross-entropy loss, RMSprop optimizer, accuracy as the reported metric.
# NOTE(review): categorical_crossentropy expects each target row to be a probability
# distribution (e.g. one-hot) — TODO confirm the rows of y have that form.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [225]:
# Train model #1. validation_data is the held-out test split, so the val_* metrics are
# computed on the same data the confusion matrix uses later.
# NOTE(review): the saved log for this cell shows the loss growing every epoch
# (about 307 in epoch 1 to about 258,000 in epoch 6), i.e. training diverges —
# presumably unscaled inputs and/or a loss/target mismatch; confirm.
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          validation_data=(X_test, y_test),
          epochs=epochs)

Epoch 1/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.0948 - loss: 307.2654 - val_accuracy: 0.1148 - val_loss: 3074.1230
Epoch 2/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1338 - loss: 6009.2749 - val_accuracy: 0.1626 - val_loss: 18194.2246
Epoch 3/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1281 - loss: 28022.3438 - val_accuracy: 0.1148 - val_loss: 52868.3125
Epoch 4/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1133 - loss: 72137.6016 - val_accuracy: 0.0997 - val_loss: 115381.1953
Epoch 5/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1126 - loss: 148788.3125 - val_accuracy: 0.0390 - val_loss: 212646.7500
Epoch 6/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1112 - loss: 258269.9219 - val_accuracy

<keras.src.callbacks.history.History at 0x1f9687612e0>

In [102]:
# Define mapping from class index (column position in y) to station name

# Index -> station name lookup, built by numbering the names in order starting at 0
stations = dict(enumerate([
    'BASEL',
    'BELGRADE',
    'BUDAPEST',
    'DEBILT',
    'DUSSELDORF',
    'HEATHROW',
    'KASSEL',
    'LJUBLJANA',
    'MAASTRICHT',
    'MADRID',
    'MUNCHENB',
    'OSLO',
    'SONNBLICK',
    'STOCKHOLM',
    'VALENTIA',
]))

In [104]:
# Creating confusion matrix

def confusion_matrix(y_true, y_pred, labels=None):
    """Cross-tabulate true versus predicted class names.

    Every row of ``y_true`` and ``y_pred`` is reduced to one class index with
    ``np.argmax(..., axis=1)``, translated to a name, and counted with
    ``pd.crosstab``.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like
        One row per sample, one column per class.
    labels : mapping of int -> str, optional
        Column index to display name. When omitted, the notebook-level
        ``stations`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        Counts with true classes on the rows (axis named 'True') and predicted
        classes on the columns (axis named 'Pred'). Only classes that actually
        occur appear, so the table is not necessarily square.

    Notes
    -----
    ``argmax`` returns the first maximal index, so a row whose entries are all
    equal (for example all zeros) is counted as class 0.
    """
    if labels is None:
        labels = stations

    true_names = pd.Series([labels[idx] for idx in np.argmax(y_true, axis=1)])
    pred_names = pd.Series([labels[idx] for idx in np.argmax(y_pred, axis=1)])

    return pd.crosstab(true_names, pred_names, rownames=['True'], colnames=['Pred'])

print(confusion_matrix(y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Pred        BASEL  BUDAPEST  DEBILT  DUSSELDORF  HEATHROW  KASSEL  LJUBLJANA  \
True                                                                           
BASEL           4      1807    1212           2        91      21        197   
BELGRADE        0       868     215           0         1       0          5   
BUDAPEST        0       180      32           0         0       0          0   
DEBILT          0        57      25           0         0       0          0   
DUSSELDORF      0        16      13           0         0       0          0   
HEATHROW        0        51      28           0         0       0          2   
KASSEL          0         8       3           0         0       0          0   
LJUBLJANA       0        48       6           0         0       0          6   
MAASTRICHT      0         4       5           0         0       0          0   
MADRID          0       223     151        

### 5. Model #2

In [204]:
# Hyperparameters for model #2 (smaller batches and twice the Conv1D filters of model #1).
epochs = 30
batch_size = 8
n_hidden = 40  # passed to Conv1D below as its number of filters

# Sizes read off the training arrays: axis 1 and axis 2 of X_train, and the width of y_train.
timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

# Layer stack: Conv1D -> Dense(16) -> MaxPooling1D -> Flatten -> Dense(n_classes, tanh).
# NOTE(review): tanh outputs lie in [-1, 1] and do not form a probability distribution,
# while this model is compiled with categorical_crossentropy — confirm this pairing is intended.
model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='tanh')) # Options: sigmoid, tanh, softmax, relu

In [206]:
# Configure training: categorical cross-entropy loss, RMSprop optimizer, accuracy as the reported metric.
# NOTE(review): categorical_crossentropy expects probability-like predictions; this model's
# output layer uses tanh — TODO confirm or switch the output activation/loss.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [208]:
# Train model #2. validation_data is the held-out test split, so the val_* metrics are
# computed on the same data the confusion matrix uses later.
# NOTE(review): in the saved log val_loss stops changing at 17.6434 from epoch 4 onward.
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          validation_data=(X_test, y_test),
          epochs=epochs)

Epoch 1/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.2643 - loss: 24.8991 - val_accuracy: 0.2743 - val_loss: 26.2923
Epoch 2/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.2874 - loss: 23.2217 - val_accuracy: 0.3031 - val_loss: 17.7642
Epoch 3/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.3165 - loss: 18.2947 - val_accuracy: 0.3156 - val_loss: 17.6996
Epoch 4/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.3195 - loss: 18.1588 - val_accuracy: 0.3372 - val_loss: 17.6434
Epoch 5/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.3446 - loss: 17.9940 - val_accuracy: 0.3297 - val_loss: 17.6434
Epoch 6/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.3360 - loss: 18.0189 - val_accuracy: 0.3303 - val_loss: 17.6434
Epoc

<keras.src.callbacks.history.History at 0x1f967512240>

In [210]:
# Creating confusion matrix

def confusion_matrix(y_true, y_pred, labels=None):
    """Cross-tabulate true versus predicted class names.

    Every row of ``y_true`` and ``y_pred`` is reduced to one class index with
    ``np.argmax(..., axis=1)``, translated to a name, and counted with
    ``pd.crosstab``.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like
        One row per sample, one column per class.
    labels : mapping of int -> str, optional
        Column index to display name. When omitted, the notebook-level
        ``stations`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        Counts with true classes on the rows (axis named 'True') and predicted
        classes on the columns (axis named 'Pred'). Only classes that actually
        occur appear, so the table is not necessarily square.

    Notes
    -----
    ``argmax`` returns the first maximal index, so a row whose entries are all
    equal (for example all zeros) is counted as class 0.
    """
    if labels is None:
        labels = stations

    true_names = pd.Series([labels[idx] for idx in np.argmax(y_true, axis=1)])
    pred_names = pd.Series([labels[idx] for idx in np.argmax(y_pred, axis=1)])

    return pd.crosstab(true_names, pred_names, rownames=['True'], colnames=['Pred'])

print(confusion_matrix(y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 944us/step
Pred        BASEL  DUSSELDORF  LJUBLJANA  OSLO  SONNBLICK  STOCKHOLM  VALENTIA
True                                                                          
BASEL        1955         222        266   150        189        813        87
BELGRADE     1039          32          5    15          0          0         1
BUDAPEST      209           4          0     1          0          0         0
DEBILT         82           0          0     0          0          0         0
DUSSELDORF     28           1          0     0          0          0         0
HEATHROW       77           5          0     0          0          0         0
KASSEL         11           0          0     0          0          0         0
LJUBLJANA      60           1          0     0          0          0         0
MAASTRICHT      7           2          0     0          0          0         0
MADRID        379          43         20     5        

### 6. Model #3

In [250]:
# Hyperparameters for model #3 (fewer epochs than the other runs).
epochs = 20
batch_size = 16
n_hidden = 20  # passed to Conv1D below as its number of filters

# Sizes read off the training arrays: axis 1 and axis 2 of X_train, and the width of y_train.
timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

# Layer stack: Conv1D -> Dense(16) -> MaxPooling1D -> Flatten -> Dense(n_classes, sigmoid).
# NOTE(review): sigmoid scores each output unit independently; it is usually paired with
# binary_crossentropy rather than the categorical_crossentropy this model is compiled with — confirm.
model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='sigmoid')) # Options: sigmoid, tanh, softmax, relu

In [252]:
# Configure training: categorical cross-entropy loss, RMSprop optimizer, accuracy as the reported metric.
# NOTE(review): this model's output layer is sigmoid; if several columns of y can be 1 in
# the same row, binary_crossentropy is presumably the intended loss — TODO confirm.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [254]:
# Train model #3. validation_data is the held-out test split, so the val_* metrics are
# computed on the same data the confusion matrix uses later.
# NOTE(review): the saved log for this cell shows the loss growing every epoch
# (about 230 in epoch 1 to about 176,000 in epoch 6), i.e. training diverges.
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          validation_data=(X_test, y_test),
          epochs=epochs)

Epoch 1/20
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.1019 - loss: 230.5476 - val_accuracy: 0.1014 - val_loss: 2012.9261
Epoch 2/20
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1165 - loss: 4048.8606 - val_accuracy: 0.2456 - val_loss: 11755.0146
Epoch 3/20
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1187 - loss: 18020.7109 - val_accuracy: 0.0411 - val_loss: 34604.5859
Epoch 4/20
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1020 - loss: 47783.5078 - val_accuracy: 0.0301 - val_loss: 75025.1094
Epoch 5/20
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1054 - loss: 96612.0469 - val_accuracy: 0.0441 - val_loss: 140405.8438
Epoch 6/20
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.1084 - loss: 176048.8281 - val_accuracy: 

<keras.src.callbacks.history.History at 0x1f9653d5d90>

In [256]:
# Creating confusion matrix

def confusion_matrix(y_true, y_pred, labels=None):
    """Cross-tabulate true versus predicted class names.

    Every row of ``y_true`` and ``y_pred`` is reduced to one class index with
    ``np.argmax(..., axis=1)``, translated to a name, and counted with
    ``pd.crosstab``.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like
        One row per sample, one column per class.
    labels : mapping of int -> str, optional
        Column index to display name. When omitted, the notebook-level
        ``stations`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        Counts with true classes on the rows (axis named 'True') and predicted
        classes on the columns (axis named 'Pred'). Only classes that actually
        occur appear, so the table is not necessarily square.

    Notes
    -----
    ``argmax`` returns the first maximal index, so a row whose entries are all
    equal (for example all zeros) is counted as class 0.
    """
    if labels is None:
        labels = stations

    true_names = pd.Series([labels[idx] for idx in np.argmax(y_true, axis=1)])
    pred_names = pd.Series([labels[idx] for idx in np.argmax(y_pred, axis=1)])

    return pd.crosstab(true_names, pred_names, rownames=['True'], colnames=['Pred'])

print(confusion_matrix(y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 937us/step
Pred        BASEL  BELGRADE  VALENTIA
True                                 
BASEL        3675         1         6
BELGRADE     1092         0         0
BUDAPEST      214         0         0
DEBILT         82         0         0
DUSSELDORF     29         0         0
HEATHROW       82         0         0
KASSEL         11         0         0
LJUBLJANA      61         0         0
MAASTRICHT      9         0         0
MADRID        458         0         0
MUNCHENB        8         0         0
OSLO            5         0         0
STOCKHOLM       4         0         0
VALENTIA        1         0         0


### 7. Model #4

In [265]:
# Hyperparameters for model #4.
epochs = 30
batch_size = 8
n_hidden = 20  # passed to Conv1D below as its number of filters

# Sizes read off the training arrays: axis 1 and axis 2 of X_train, and the width of y_train.
timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

# Layer stack: Conv1D -> Dense(16) -> MaxPooling1D -> Flatten -> Dense(n_classes, relu).
# NOTE(review): relu output is unbounded above and can be exactly 0 for every class; the saved
# training log for this model reports "loss: nan" from the first epoch, so the run is not usable as-is.
model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='relu')) # Options: sigmoid, tanh, softmax, relu

In [267]:
# Configure training: categorical cross-entropy loss, RMSprop optimizer, accuracy as the reported metric.
# NOTE(review): categorical_crossentropy expects probability-like predictions; this model's
# output layer uses relu and the saved log shows a nan loss — TODO revisit the pairing.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [269]:
# Train model #4. validation_data is the held-out test split, so the val_* metrics are
# computed on the same data the confusion matrix uses later.
# NOTE(review): the saved log shows "loss: nan" in every epoch and val_accuracy fixed at 0.6417;
# the confusion matrix after this cell shows every sample predicted as BASEL (class 0).
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          validation_data=(X_test, y_test),
          epochs=epochs)

Epoch 1/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.3006 - loss: nan - val_accuracy: 0.6417 - val_loss: nan
Epoch 2/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6450 - loss: nan - val_accuracy: 0.6417 - val_loss: nan
Epoch 3/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6473 - loss: nan - val_accuracy: 0.6417 - val_loss: nan
Epoch 4/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6461 - loss: nan - val_accuracy: 0.6417 - val_loss: nan
Epoch 5/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6379 - loss: nan - val_accuracy: 0.6417 - val_loss: nan
Epoch 6/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6413 - loss: nan - val_accuracy: 0.6417 - val_loss: nan
Epoch 7/30
[1m2152/2152[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1f95f08fd10>

In [271]:
# Creating confusion matrix

def confusion_matrix(y_true, y_pred, labels=None):
    """Cross-tabulate true versus predicted class names.

    Every row of ``y_true`` and ``y_pred`` is reduced to one class index with
    ``np.argmax(..., axis=1)``, translated to a name, and counted with
    ``pd.crosstab``.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like
        One row per sample, one column per class.
    labels : mapping of int -> str, optional
        Column index to display name. When omitted, the notebook-level
        ``stations`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        Counts with true classes on the rows (axis named 'True') and predicted
        classes on the columns (axis named 'Pred'). Only classes that actually
        occur appear, so the table is not necessarily square.

    Notes
    -----
    ``argmax`` returns the first maximal index, so a row whose entries are all
    equal (for example all zeros) is counted as class 0.
    """
    if labels is None:
        labels = stations

    true_names = pd.Series([labels[idx] for idx in np.argmax(y_true, axis=1)])
    pred_names = pd.Series([labels[idx] for idx in np.argmax(y_pred, axis=1)])

    return pd.crosstab(true_names, pred_names, rownames=['True'], colnames=['Pred'])

print(confusion_matrix(y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 973us/step
Pred        BASEL
True             
BASEL        3682
BELGRADE     1092
BUDAPEST      214
DEBILT         82
DUSSELDORF     29
HEATHROW       82
KASSEL         11
LJUBLJANA      61
MAASTRICHT      9
MADRID        458
MUNCHENB        8
OSLO            5
STOCKHOLM       4
VALENTIA        1


In [11]:
# Dropping DATE and MONTH from df
# NOTE(review): this cell carries execution count 11 but sits after the modelling sections,
# so a top-to-bottom run only reaches it once X has already been built. It presumably belongs
# in the data-wrangling section, before the position-based column inserts — confirm and move.
# Reassigning with errors='ignore' keeps the cell safe to re-run once the columns are gone.

df = df.drop(columns=['DATE', 'MONTH'], errors='ignore')