# 1. Importing data and libraries

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from numpy import unique
from numpy import reshape
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Dense, BatchNormalization, Flatten, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.layers import LSTM
from keras.layers import Dense, Dropout

In [7]:
# Folder that holds the project's CSV and pickle files.
# Set the WEATHER_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location below is used unchanged.
path = os.environ.get(
    'WEATHER_DATA_DIR',
    r'/Users/marcela/Library/CloudStorage/OneDrive-Personal/CF/Machine Learning/Achievement/Data Sets',
)

In [8]:
# Show every column when a DataFrame is displayed (None removes the column cap)
pd.set_option('display.max_columns', None)

In [9]:
# Load the pleasant-weather answers data set.
# index_col=False tells pandas not to use the first CSV column as the index.
pleasant_weather_file = os.path.join(path, 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv')
df_pleasant_weather = pd.read_csv(pleasant_weather_file, index_col=False)

In [10]:
df_pleasant_weather.shape

(22950, 16)

In [11]:
# Load the unscaled weather observations data set.
# index_col=False tells pandas not to use the first CSV column as the index.
unscaled_weather_file = os.path.join(path, 'Dataset-weather-prediction-dataset-processed.csv')
df_unscaled = pd.read_csv(unscaled_weather_file, index_col=False)

In [12]:
df_unscaled.shape

(22950, 170)

# 2. Data Wrangling

In [13]:
# Gdansk, Roma and Tours are not included in the pleasant-weather answers,
# so the feature columns listed for those stations are removed.
gdansk_columns = ['GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation',
                  'GDANSK_snow_depth', 'GDANSK_temp_mean', 'GDANSK_temp_min',
                  'GDANSK_temp_max']
roma_columns = ['ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity',
                'ROMA_pressure', 'ROMA_sunshine', 'ROMA_temp_mean']
tours_columns = ['TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure',
                 'TOURS_global_radiation', 'TOURS_precipitation', 'TOURS_temp_mean',
                 'TOURS_temp_min', 'TOURS_temp_max']
columns_to_drop = gdansk_columns + roma_columns + tours_columns

# Remove the listed columns from the feature frame in place
df_unscaled.drop(columns_to_drop, axis=1, inplace=True)

In [14]:
df_unscaled.shape 

(22950, 149)

In [15]:
# Remove the DATE and MONTH columns from the feature frame in place
df_unscaled.drop(['DATE', 'MONTH'], axis=1, inplace=True)

In [16]:
# Remove the DATE column from the label frame in place
df_pleasant_weather.drop('DATE', axis=1, inplace=True)

In [17]:
df_pleasant_weather.shape

(22950, 15)

In [18]:
# Snow-depth and wind-speed columns removed from the feature frame,
# grouped by measurement type.
snow_depth_columns = ['BASEL_snow_depth', 'DUSSELDORF_snow_depth', 'HEATHROW_snow_depth',
                      'MUNCHENB_snow_depth', 'OSLO_snow_depth', 'VALENTIA_snow_depth']
wind_speed_columns = ['BASEL_wind_speed', 'DEBILT_wind_speed', 'DUSSELDORF_wind_speed',
                      'KASSEL_wind_speed', 'LJUBLJANA_wind_speed', 'MAASTRICHT_wind_speed',
                      'MADRID_wind_speed', 'OSLO_wind_speed', 'SONNBLICK_wind_speed']
further_drops = snow_depth_columns + wind_speed_columns

df_unscaled.drop(further_drops, axis=1, inplace=True)

In [19]:
# Three stations each lack one measurement column. The gap is filled by copying
# the same measurement from a station the author considers to have similar weather
# (source -> station that receives the copy):
#   Ljubljana -> Kassel
#   Sonnblick -> Munchen
#   Oslo      -> Stockholm
# The lookups in this and the next two cells report the positions of the columns
# that the copied columns will be placed next to.

df_unscaled.columns.get_loc('HEATHROW_temp_max')

53

In [20]:
df_unscaled.columns.get_loc('MUNCHENB_humidity')

90

In [21]:
df_unscaled.columns.get_loc('STOCKHOLM_cloud_cover')

115

In [22]:
# Fill each station's missing measurement with a copy of the same measurement
# from another station. Every new column is placed immediately after a named
# anchor column, so the position no longer depends on hard-coded indexes copied
# from earlier cell outputs. With the current column layout the resulting
# positions are the same as the original 54, 92 and 118.
borrowed_columns = [
    # (new column, column copied from, anchor column the new one follows)
    ('KASSEL_cloud_cover', 'LJUBLJANA_cloud_cover', 'HEATHROW_temp_max'),
    ('MUNCHENB_pressure', 'SONNBLICK_pressure', 'MUNCHENB_humidity'),
    ('STOCKHOLM_humidity', 'OSLO_humidity', 'STOCKHOLM_cloud_cover'),
]
for new_column, source_column, anchor_column in borrowed_columns:
    # DataFrame.insert raises if the column already exists; skipping it keeps
    # the cell safe to re-run.
    if new_column not in df_unscaled.columns:
        insert_at = df_unscaled.columns.get_loc(anchor_column) + 1
        df_unscaled.insert(insert_at, new_column, df_unscaled[source_column])
df_unscaled.columns.tolist()

['BASEL_cloud_cover',
 'BASEL_humidity',
 'BASEL_pressure',
 'BASEL_global_radiation',
 'BASEL_precipitation',
 'BASEL_sunshine',
 'BASEL_temp_mean',
 'BASEL_temp_min',
 'BASEL_temp_max',
 'BELGRADE_cloud_cover',
 'BELGRADE_humidity',
 'BELGRADE_pressure',
 'BELGRADE_global_radiation',
 'BELGRADE_precipitation',
 'BELGRADE_sunshine',
 'BELGRADE_temp_mean',
 'BELGRADE_temp_min',
 'BELGRADE_temp_max',
 'BUDAPEST_cloud_cover',
 'BUDAPEST_humidity',
 'BUDAPEST_pressure',
 'BUDAPEST_global_radiation',
 'BUDAPEST_precipitation',
 'BUDAPEST_sunshine',
 'BUDAPEST_temp_mean',
 'BUDAPEST_temp_min',
 'BUDAPEST_temp_max',
 'DEBILT_cloud_cover',
 'DEBILT_humidity',
 'DEBILT_pressure',
 'DEBILT_global_radiation',
 'DEBILT_precipitation',
 'DEBILT_sunshine',
 'DEBILT_temp_mean',
 'DEBILT_temp_min',
 'DEBILT_temp_max',
 'DUSSELDORF_cloud_cover',
 'DUSSELDORF_humidity',
 'DUSSELDORF_pressure',
 'DUSSELDORF_global_radiation',
 'DUSSELDORF_precipitation',
 'DUSSELDORF_sunshine',
 'DUSSELDORF_temp_mean',


In [23]:
df_unscaled.shape

(22950, 135)

In [24]:
# Persist the cleaned feature frame to the data folder
cleaned_features_file = os.path.join(path, 'X_cleaned.pkl')
df_unscaled.to_pickle(cleaned_features_file)

# 3. Reshaping for ML modeling

In [25]:
# Reload the cleaned feature frame from the data folder
features_pickle = os.path.join(path, 'X_cleaned.pkl')
X = pd.read_pickle(features_pickle)

In [26]:
X.shape

(22950, 135)

In [27]:
y = df_pleasant_weather

In [28]:
y.shape

(22950, 15)

In [29]:
# Convert the feature and label frames into NumPy arrays
X, y = (np.array(frame) for frame in (X, y))

In [30]:
# Regroup the 135 flat feature columns into 15 groups of 9 values per row,
# giving an array of shape (rows, 15, 9).
X = np.reshape(X, (-1, 15, 9))

In [31]:
# Verifying array shape
X

array([[[  7.    ,   0.85  ,   1.018 , ...,   6.5   ,   0.8   ,
          10.9   ],
        [  1.    ,   0.81  ,   1.0195, ...,   3.7   ,  -0.9   ,
           7.9   ],
        [  4.    ,   0.67  ,   1.017 , ...,   2.4   ,  -0.4   ,
           5.1   ],
        ...,
        [  4.    ,   0.73  ,   1.0304, ...,  -5.9   ,  -8.5   ,
          -3.2   ],
        [  5.    ,   0.98  ,   1.0114, ...,   4.2   ,   2.2   ,
           4.9   ],
        [  5.    ,   0.88  ,   1.0003, ...,   8.5   ,   6.    ,
          10.9   ]],

       [[  6.    ,   0.84  ,   1.018 , ...,   6.1   ,   3.3   ,
          10.1   ],
        [  6.    ,   0.84  ,   1.0172, ...,   2.9   ,   2.2   ,
           4.4   ],
        [  4.    ,   0.67  ,   1.017 , ...,   2.3   ,   1.4   ,
           3.1   ],
        ...,
        [  6.    ,   0.97  ,   1.0292, ...,  -9.5   , -10.5   ,
          -8.5   ],
        [  5.    ,   0.62  ,   1.0114, ...,   4.    ,   3.    ,
           5.    ],
        [  7.    ,   0.91  ,   1.0007, ...,   8.

# 4. Splitting data (training and test sets)

In [32]:
# Hold out a test set; 0.25 is scikit-learn's default test fraction, written out
# here so the split size is visible. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=39,
)

# 5. Keras Model

In [33]:
def _read_csv(filename):
    return pd.read_csv(filename, delim_whitespace=True, header=None)

def load_signals(subset):
    """Load one file per signal for ``subset`` and stack them on the last axis.

    For every name in the module-level ``signals`` sequence, the file
    ``{path}/{subset}/Inertial Signals/{signal}_{subset}.txt`` is read with
    ``_read_csv`` and converted to a NumPy array.

    NOTE(review): ``signals`` is not defined in the visible part of this
    notebook, so calling this function as-is would raise NameError; confirm
    whether it is still needed.

    Returns
    -------
    numpy.ndarray
        The per-signal arrays with the signal axis moved from first to last,
        i.e. (rows, columns, signals). The original author's note gives this as
        (7352 train / 2947 test samples, 128 timesteps, 9 signals).
    """
    per_signal = [
        _read_csv(f'{path}/{subset}/Inertial Signals/{signal}_{subset}.txt').to_numpy()
        for signal in signals
    ]

    # (signals, rows, columns) -> (rows, columns, signals)
    return np.transpose(per_signal, (1, 2, 0))

def load_y(subset):
    """Load the labels for ``subset`` and one-hot encode them.

    Reads column 0 of ``{path}/{subset}/y_{subset}.txt`` via ``_read_csv`` and
    expands it with ``pandas.get_dummies`` into one indicator column per
    distinct label value.

    The original author's note describes the label as an integer from 1 to 6
    representing a human activity; that file is not visible here, so treat
    the range as unconfirmed.

    Returns
    -------
    numpy.ndarray
        The indicator matrix, one row per sample.
    """
    labels_file = f'{path}/{subset}/y_{subset}.txt'
    labels = _read_csv(labels_file)[0]
    indicators = pd.get_dummies(labels)

    return indicators.to_numpy()

def load_data():
    """Load the 'train' and 'test' subsets from their separate files.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``: signals from ``load_signals``
        and one-hot labels from ``load_y`` for each subset.
    """
    X_train = load_signals('train')
    X_test = load_signals('test')
    y_train = load_y('train')
    y_test = load_y('test')

    return X_train, X_test, y_train, y_test

def _count_classes(y):
    return len(set([tuple(category) for category in y]))

In [34]:
epochs = 30
batch_size = 16
n_hidden = 32

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = _count_classes(y_train)

model = Sequential()
model.add(LSTM(n_hidden, input_shape=(timesteps, input_dim)))
model.add(Dropout(0.5))
model.add(Dense(15, activation='softmax')) # Don't use relu here!

  super().__init__(**kwargs)


In [35]:
model.summary()

In [36]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [37]:
# Train the model; the held-out split is also used as per-epoch validation data
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
)

Epoch 1/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.0526 - loss: 10.5317 - val_accuracy: 0.0324 - val_loss: 9.0341
Epoch 2/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0892 - loss: 10.5263 - val_accuracy: 0.0335 - val_loss: 9.2174
Epoch 3/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0941 - loss: 10.3690 - val_accuracy: 0.0328 - val_loss: 9.6002
Epoch 4/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0870 - loss: 10.6659 - val_accuracy: 0.0368 - val_loss: 10.0036
Epoch 5/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0868 - loss: 10.7717 - val_accuracy: 0.0288 - val_loss: 10.3952
Epoch 6/30
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0886 - loss: 11.1240 - val_accuracy: 0.0462 - val_loss: 10.6843
Epoch 7

<keras.src.callbacks.history.History at 0x30fe30620>

In [38]:
# Maps a label-column index to a station name.
# TODO (from the original author): change this to Weather true/false.
# NOTE(review): presumably this order matches the columns of the label matrix; confirm.
station_names = [
    'BASEL',
    'BELGRADE',
    'BUDAPEST',
    'DEBILT',
    'DUSSELDORF',
    'HEATHROW',
    'KASSEL',
    'LJUBLJANA',
    'MAASTRICHT',
    'MADRID',
    'MUNCHENB',
    'OSLO',
    'SONNBLICK',
    'STOCKHOLM',
    'VALENTIA',
]
activities = dict(enumerate(station_names))

In [39]:
def confusion_matrix(Y_true, Y_pred):
    """Cross-tabulate the highest-scoring column of each row, true vs. predicted.

    Every row of ``Y_true`` and ``Y_pred`` is reduced to the index of its
    largest value (``np.argmax`` along axis 1), and that index is translated
    to a name through the module-level ``activities`` mapping.

    Caveat: ``np.argmax`` returns 0 for a row whose values are all equal, so a
    row with no positive flag is counted under ``activities[0]``.

    Returns
    -------
    pandas.DataFrame
        Counts with rows labelled 'True' and columns labelled 'Pred'.
    """
    def _top_column_names(scores):
        # Name of the arg-max column for every row
        return pd.Series([activities[idx] for idx in np.argmax(scores, axis=1)])

    true_names = _top_column_names(Y_true)
    pred_names = _top_column_names(Y_pred)

    return pd.crosstab(true_names, pred_names, rownames=['True'], colnames=['Pred'])

In [40]:
# Evaluate: predict on the held-out set and print the true-vs-predicted table
test_predictions = model.predict(X_test)
print(confusion_matrix(y_test, test_predictions))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 815us/step
Pred        MADRID  OSLO  VALENTIA
True                              
BASEL         1485    59      2163
BELGRADE       965    35        81
BUDAPEST       183     9         4
DEBILT          89     0         0
DUSSELDORF      32     1         0
HEATHROW        96     3         8
KASSEL          14     1         0
LJUBLJANA       54     7         8
MAASTRICHT       6     1         3
MADRID         141    22       246
MUNCHENB         5     2         3
OSLO             5     0         2
STOCKHOLM        1     0         0
VALENTIA         0     1         3


In [41]:
# Score the model on the held-out set; evaluate returns the loss followed by the compiled metric
test_scores = model.evaluate(X_test, y_test, verbose=2)
loss, accuracy = test_scores
print("Test Accuracy:", accuracy)

180/180 - 0s - 580us/step - accuracy: 0.0251 - loss: 15.2413
Test Accuracy: 0.025095852091908455
