In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from numpy import unique
from numpy import reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv2D, Dense, BatchNormalization, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [21]:
path = r'D:\WORK\Bootcamp\Machine Learning\Achievement 1\Project\Data'

In [22]:
# Read the CSV file
df = pd.read_csv(os.path.join(path, "Original", 'Dataset-weather-prediction-dataset-processed.csv'))
df_ans= pd.read_csv(os.path.join(path, "Original", 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv'))

# Wrangling

## Find bases that arent on answer sheet

In [23]:
# Extract bases from df (columns ending with '_humidity')
bases_in_df = {col.split('_')[0] for col in df.columns if col.endswith('_humidity')}

# Extract bases from df_ans (columns ending with '_pleasant_weather')
bases_in_df_ans = {col.split('_')[0] for col in df_ans.columns if col.endswith('_pleasant_weather')}

# Find the missing "bases" in df_ans
missing_bases = bases_in_df - bases_in_df_ans

# Print the result
print("Missing bases in df_ans:", missing_bases)

Missing bases in df_ans: {'ROMA', 'TOURS', 'GDANSK'}


In [24]:
# Drop columns in df_ans that start with any missing base
columns_to_drop = [col for col in df.columns if any(col.startswith(base) for base in missing_bases)]

# Drop these columns from df_ans
df = df.drop(columns=columns_to_drop)

# Print confirmation
print(f"Columns dropped: {columns_to_drop}")

columns_to_drop

Columns dropped: ['GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation', 'GDANSK_snow_depth', 'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max', 'ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity', 'ROMA_pressure', 'ROMA_sunshine', 'ROMA_temp_mean', 'TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure', 'TOURS_global_radiation', 'TOURS_precipitation', 'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max']


['GDANSK_cloud_cover',
 'GDANSK_humidity',
 'GDANSK_precipitation',
 'GDANSK_snow_depth',
 'GDANSK_temp_mean',
 'GDANSK_temp_min',
 'GDANSK_temp_max',
 'ROMA_cloud_cover',
 'ROMA_wind_speed',
 'ROMA_humidity',
 'ROMA_pressure',
 'ROMA_sunshine',
 'ROMA_temp_mean',
 'TOURS_wind_speed',
 'TOURS_humidity',
 'TOURS_pressure',
 'TOURS_global_radiation',
 'TOURS_precipitation',
 'TOURS_temp_mean',
 'TOURS_temp_min',
 'TOURS_temp_max']

In [25]:
df

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.0180,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.0180,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.90,1.0180,0.18,0.30,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.0180,0.58,0.00,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.0180,0.65,0.14,0,5.4,...,3,0.80,1.0328,0.46,0.00,0,5.7,5.7,3.0,8.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,20221027,10,1,2.1,0.79,1.0248,1.34,0.22,0,7.7,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22946,20221028,10,6,2.1,0.77,1.0244,1.34,0.22,0,5.4,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22947,20221029,10,4,2.1,0.76,1.0227,1.34,0.22,0,6.1,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22948,20221030,10,5,2.1,0.80,1.0212,1.34,0.22,0,5.8,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5


In [26]:
df_ans

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,20221027,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22946,20221028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22947,20221029,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22948,20221030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Find years with less missing data

In [27]:
# Ensure DATE column is of string type
df['Year'] = df['DATE'].astype(str).str[:4]  # Extract the year from the DATE column

# Count the number of entries for each year
year_counts = df.groupby('Year').size().reset_index(name='Count')

# Display the result
print(year_counts)

    Year  Count
0   1960    366
1   1961    365
2   1962    365
3   1963    365
4   1964    366
..   ...    ...
58  2018    365
59  2019    365
60  2020    366
61  2021    365
62  2022    304

[63 rows x 2 columns]


In [28]:
# Ensure DATE column is of string type and extract the year
df['Year'] = df['DATE'].astype(str).str[:4]  # Extract the year

# Filter out rows where the year is 2022
df_cleaned = df[df['Year'] != '2022']

# Drop the temporary 'Year' column if it's no longer needed
df_cleaned = df_cleaned.drop(columns=['Year'])

# Display the updated DataFrame
print(df.head())

       DATE  MONTH  BASEL_cloud_cover  BASEL_wind_speed  BASEL_humidity  \
0  19600101      1                  7               2.1            0.85   
1  19600102      1                  6               2.1            0.84   
2  19600103      1                  8               2.1            0.90   
3  19600104      1                  3               2.1            0.92   
4  19600105      1                  6               2.1            0.95   

   BASEL_pressure  BASEL_global_radiation  BASEL_precipitation  \
0           1.018                    0.32                 0.09   
1           1.018                    0.36                 1.05   
2           1.018                    0.18                 0.30   
3           1.018                    0.58                 0.00   
4           1.018                    0.65                 0.14   

   BASEL_snow_depth  BASEL_sunshine  ...  VALENTIA_humidity  \
0                 0             0.7  ...               0.88   
1                 0       

In [29]:
# Ensure DATE column is of string type
df['Year'] = df['DATE'].astype(str).str[:4]  # Extract the year from the DATE column

# Count the number of entries for each year
year_counts = df.groupby('Year').size().reset_index(name='Count')

# Display the result
print(year_counts)

    Year  Count
0   1960    366
1   1961    365
2   1962    365
3   1963    365
4   1964    366
..   ...    ...
58  2018    365
59  2019    365
60  2020    366
61  2021    365
62  2022    304

[63 rows x 2 columns]


In [30]:
df_cleaned=df.drop(columns=['DATE', 'MONTH' , 'Year'], inplace=True)
df_ans=df_ans.drop(columns=['DATE'])

In [31]:
df_cleaned

In [32]:
total_missing = df.isnull().sum().sum()

In [33]:
total_missing

0

## Removing Unnessary Observations

In [34]:
columns_list = df.columns.tolist()
print(columns_list)


['BASEL_cloud_cover', 'BASEL_wind_speed', 'BASEL_humidity', 'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation', 'BASEL_snow_depth', 'BASEL_sunshine', 'BASEL_temp_mean', 'BASEL_temp_min', 'BASEL_temp_max', 'BELGRADE_cloud_cover', 'BELGRADE_humidity', 'BELGRADE_pressure', 'BELGRADE_global_radiation', 'BELGRADE_precipitation', 'BELGRADE_sunshine', 'BELGRADE_temp_mean', 'BELGRADE_temp_min', 'BELGRADE_temp_max', 'BUDAPEST_cloud_cover', 'BUDAPEST_humidity', 'BUDAPEST_pressure', 'BUDAPEST_global_radiation', 'BUDAPEST_precipitation', 'BUDAPEST_sunshine', 'BUDAPEST_temp_mean', 'BUDAPEST_temp_min', 'BUDAPEST_temp_max', 'DEBILT_cloud_cover', 'DEBILT_wind_speed', 'DEBILT_humidity', 'DEBILT_pressure', 'DEBILT_global_radiation', 'DEBILT_precipitation', 'DEBILT_sunshine', 'DEBILT_temp_mean', 'DEBILT_temp_min', 'DEBILT_temp_max', 'DUSSELDORF_cloud_cover', 'DUSSELDORF_wind_speed', 'DUSSELDORF_humidity', 'DUSSELDORF_pressure', 'DUSSELDORF_global_radiation', 'DUSSELDORF_precipitation', 'DU

In [35]:
# Extract observations (second part after '_')
observations = [col.split('_', 1)[1] for col in columns_list if '_' in col]

# Find unique observations
unique_observations = sorted(set(observations))

# Display unique observations
print(unique_observations)

['cloud_cover', 'global_radiation', 'humidity', 'precipitation', 'pressure', 'snow_depth', 'sunshine', 'temp_max', 'temp_mean', 'temp_min', 'wind_speed']


In [36]:
from collections import defaultdict

# Count the observations for each base
base_observations = defaultdict(set)

for col in columns_list:
    if '_' in col:
        base, observation = col.split('_', 1)
        base_observations[base].add(observation)

# Find bases with differing observation counts
all_observation_counts = [len(observations) for observations in base_observations.values()]
expected_count = max(all_observation_counts)  # Assume max is the expected count

# Identify bases with missing/extra observations
inconsistent_bases = {base: observations for base, observations in base_observations.items() if len(observations) != expected_count}

# Display results
if inconsistent_bases:
    print("Bases with inconsistent observation counts:")
    for base, observations in inconsistent_bases.items():
        print(f"{base}: {len(observations)} observations")
else:
    print("All bases have consistent observation counts.")

Bases with inconsistent observation counts:
BELGRADE: 9 observations
BUDAPEST: 9 observations
DEBILT: 10 observations
HEATHROW: 10 observations
KASSEL: 9 observations
LJUBLJANA: 10 observations
MAASTRICHT: 10 observations
MADRID: 10 observations
MUNCHENB: 9 observations
SONNBLICK: 10 observations
STOCKHOLM: 8 observations
VALENTIA: 10 observations


In [37]:
# Extract all unique observations, including the second and third parts of the column names
all_observations = {tuple(col.split('_')[1:]) for col in columns_list if '_' in col}

# Create a dictionary to track observations per base
base_observations = defaultdict(set)

# Populate the dictionary
for col in columns_list:
    if '_' in col:
        parts = col.split('_')
        base = parts[0]
        observation = tuple(parts[1:])  # Include both the second and third parts
        base_observations[base].add(observation)

# Identify missing observations for each base
missing_observations = {
    base: all_observations - observations
    for base, observations in base_observations.items()
    if all_observations - observations  # Only include bases with missing observations
}

# Display results
if missing_observations:
    print("Bases with missing observations:")
    for base, missing in missing_observations.items():
        print(f"{base}: {missing}")
else:
    print("All bases have all observations.")

Bases with missing observations:
BELGRADE: {('wind', 'speed'), ('snow', 'depth')}
BUDAPEST: {('wind', 'speed'), ('snow', 'depth')}
DEBILT: {('snow', 'depth')}
HEATHROW: {('wind', 'speed')}
KASSEL: {('snow', 'depth'), ('cloud', 'cover')}
LJUBLJANA: {('snow', 'depth')}
MAASTRICHT: {('snow', 'depth')}
MADRID: {('snow', 'depth')}
MUNCHENB: {('wind', 'speed'), ('pressure',)}
SONNBLICK: {('snow', 'depth')}
STOCKHOLM: {('wind', 'speed'), ('humidity',), ('snow', 'depth')}
VALENTIA: {('wind', 'speed')}


In [38]:
# Define the substrings to remove
substrings_to_remove = ['_snow_depth','_wind_speed']

# Filter columns that do not contain the substrings
filtered_columns = [col for col in df.columns if not any(sub in col for sub in substrings_to_remove)]

# Keep only the filtered columns in the DataFrame
df_cleaned = df[filtered_columns]

In [39]:
df_cleaned.shape

(22950, 132)

# 

In [40]:
# Initialize a dictionary to count observations for each base
base_counts = defaultdict(lambda: defaultdict(int))

# Assuming `columns_list` is already defined and contains the column names
for col in columns_list:
    if '_' in col:  # Ensure column has the format BASE_observation
        base, observation = col.split('_', 1)
        base_counts[base][observation] += 1

# Prepare the data to calculate the total count for each base
total_counts = {}

# Calculate total counts for each base
for base, observations in base_counts.items():
    total_counts[base] = sum(observations.values())  # Total number of observations for each base

# Convert the total counts to a DataFrame for easy readability
df_total_counts = pd.DataFrame(list(total_counts.items()), columns=['Base', 'Total Observation Count'])

# Display the DataFrame
print(df_total_counts)

          Base  Total Observation Count
0        BASEL                       11
1     BELGRADE                        9
2     BUDAPEST                        9
3       DEBILT                       10
4   DUSSELDORF                       11
5     HEATHROW                       10
6       KASSEL                        9
7    LJUBLJANA                       10
8   MAASTRICHT                       10
9       MADRID                       10
10    MUNCHENB                        9
11        OSLO                       11
12   SONNBLICK                       10
13   STOCKHOLM                        8
14    VALENTIA                       10


# Add missing oberservation

In [41]:
# Create a list of all unique station names in the dataset

all_stations = set([col.split('_')[0] for col in df_cleaned.columns if '_' in col])
all_stations

{'BASEL',
 'BELGRADE',
 'BUDAPEST',
 'DEBILT',
 'DUSSELDORF',
 'HEATHROW',
 'KASSEL',
 'LJUBLJANA',
 'MAASTRICHT',
 'MADRID',
 'MUNCHENB',
 'OSLO',
 'SONNBLICK',
 'STOCKHOLM',
 'VALENTIA'}

In [42]:
observation_types = ['cloud_cover', 'humidity', 'pressure']

missing_stations_by_observation = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in df_cleaned.columns if col.endswith(obs)]
    
    # Extract station names by removing the observation type from the column names
    station_names = set([col.replace(f'_{obs}', '') for col in columns])
    
    # Identify stations that are in all_stations but missing from the current observation type
    missing_stations = all_stations - station_names
    
    # Store the missing station names in the dictionary
    missing_stations_by_observation[obs] = missing_stations

# Print the missing station names for each observation type
for obs, missing_stations in missing_stations_by_observation.items():
    print(f"\nStations missing from {obs}:")
    if missing_stations:
        for station in missing_stations:
            print(station)
    else:
        print("None")


Stations missing from cloud_cover:
KASSEL

Stations missing from humidity:
STOCKHOLM

Stations missing from pressure:
MUNCHENB


In [43]:
ljubljana_columns = df_cleaned.filter(like='LJUBLJANA')

print(ljubljana_columns)

       LJUBLJANA_cloud_cover  LJUBLJANA_humidity  LJUBLJANA_pressure  \
0                          8                1.00              1.0173   
1                          6                0.94              1.0173   
2                          8                0.96              1.0173   
3                          6                0.94              1.0173   
4                          7                0.94              1.0173   
...                      ...                 ...                 ...   
22945                      4                0.80              1.0289   
22946                      3                0.82              1.0291   
22947                      3                0.81              1.0270   
22948                      3                0.77              1.0238   
22949                      3                0.77              1.0114   

       LJUBLJANA_global_radiation  LJUBLJANA_precipitation  \
0                            0.20                     0.00   
1          

In [44]:
kassel_columns = df_cleaned.filter(like='KASSEL')

print(kassel_columns)

       KASSEL_humidity  KASSEL_pressure  KASSEL_global_radiation  \
0                 0.82           1.0094                     0.28   
1                 0.86           1.0086                     0.12   
2                 0.91           1.0129                     0.12   
3                 0.87           1.0290                     0.12   
4                 0.86           1.0262                     0.13   
...                ...              ...                      ...   
22945             0.77           1.0161                     1.14   
22946             0.77           1.0161                     1.14   
22947             0.77           1.0161                     1.14   
22948             0.77           1.0161                     1.14   
22949             0.77           1.0161                     1.14   

       KASSEL_precipitation  KASSEL_sunshine  KASSEL_temp_mean  \
0                      0.48              1.6               7.9   
1                      0.27              0.0       

In [45]:
df_cleaned['OSLO_humidity']

0        0.98
1        0.62
2        0.69
3        0.98
4        0.96
         ... 
22945    0.98
22946    1.00
22947    0.85
22948    0.94
22949    0.97
Name: OSLO_humidity, Length: 22950, dtype: float64

In [46]:
# If df_cleaned is a slice of another DataFrame, ensure it's a copy first
df_cleaned = df_cleaned.copy()

# Now perform the operations
df_cleaned['KASSEL_cloud_cover'] = df_cleaned['LJUBLJANA_cloud_cover']
df_cleaned['MUNCHENB_pressure'] = df_cleaned['SONNBLICK_pressure']
df_cleaned['STOCKHOLM_humidity'] = df_cleaned['OSLO_humidity']

In [47]:
df_cleaned

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max,KASSEL_cloud_cover,MUNCHENB_pressure,STOCKHOLM_humidity
0,7,0.85,1.0180,0.32,0.09,0.7,6.5,0.8,10.9,1,...,1.0003,0.45,0.34,4.7,8.5,6.0,10.9,8,1.0304,0.98
1,6,0.84,1.0180,0.36,1.05,1.1,6.1,3.3,10.1,6,...,1.0007,0.25,0.84,0.7,8.9,5.6,12.1,6,1.0292,0.62
2,8,0.90,1.0180,0.18,0.30,0.0,8.5,5.1,9.9,6,...,1.0096,0.17,0.08,0.1,10.5,8.1,12.9,8,1.0320,0.69
3,3,0.92,1.0180,0.58,0.00,4.1,6.3,3.8,10.6,8,...,1.0184,0.13,0.98,0.0,7.4,7.3,10.6,6,1.0443,0.98
4,6,0.95,1.0180,0.65,0.14,5.4,3.0,-0.7,6.0,8,...,1.0328,0.46,0.00,5.7,5.7,3.0,8.4,7,1.0430,0.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,1,0.79,1.0248,1.34,0.22,7.7,15.9,11.4,21.4,2,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,4,1.0263,0.98
22946,6,0.77,1.0244,1.34,0.22,5.4,16.7,14.3,21.9,0,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,3,1.0263,1.00
22947,4,0.76,1.0227,1.34,0.22,6.1,16.7,13.1,22.4,2,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,3,1.0263,0.85
22948,5,0.80,1.0212,1.34,0.22,5.8,15.4,11.6,21.1,1,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,3,1.0263,0.94


In [48]:
# Create a list of all unique station names in the dataset

all_stations = set([col.split('_')[0] for col in df_cleaned.columns if '_' in col])
all_stations

{'BASEL',
 'BELGRADE',
 'BUDAPEST',
 'DEBILT',
 'DUSSELDORF',
 'HEATHROW',
 'KASSEL',
 'LJUBLJANA',
 'MAASTRICHT',
 'MADRID',
 'MUNCHENB',
 'OSLO',
 'SONNBLICK',
 'STOCKHOLM',
 'VALENTIA'}

In [49]:
df_cleaned.to_csv(os.path.join(path, 'Prepared', 'df_cleaned.csv'))

# Reshape Data

In [50]:
#Create an 'X' matrix by dropping the unneeded columns.
X = df_cleaned
y = df_ans

In [51]:
X.shape

(22950, 135)

In [52]:
# Turn X and y from a df to arrays

X = np.array(X)
y = np.array(y)

In [53]:
# reshape
X = X.reshape(-1,15,9)

In [54]:
# Check Shape

X

array([[[ 7.0000e+00,  8.5000e-01,  1.0180e+00, ...,  6.5000e+00,
          8.0000e-01,  1.0900e+01],
        [ 1.0000e+00,  8.1000e-01,  1.0195e+00, ...,  3.7000e+00,
         -9.0000e-01,  7.9000e+00],
        [ 4.0000e+00,  6.7000e-01,  1.0170e+00, ...,  2.4000e+00,
         -4.0000e-01,  5.1000e+00],
        ...,
        [ 1.0304e+00,  4.8000e-01,  1.0000e-02, ..., -3.2000e+00,
          5.0000e+00,  1.0114e+00],
        [ 5.0000e-02,  3.2000e-01,  0.0000e+00, ...,  5.0000e+00,
          8.8000e-01,  1.0003e+00],
        [ 4.5000e-01,  3.4000e-01,  4.7000e+00, ...,  8.0000e+00,
          1.0304e+00,  9.8000e-01]],

       [[ 6.0000e+00,  8.4000e-01,  1.0180e+00, ...,  6.1000e+00,
          3.3000e+00,  1.0100e+01],
        [ 6.0000e+00,  8.4000e-01,  1.0172e+00, ...,  2.9000e+00,
          2.2000e+00,  4.4000e+00],
        [ 4.0000e+00,  6.7000e-01,  1.0170e+00, ...,  2.3000e+00,
          1.4000e+00,  3.1000e+00],
        ...,
        [ 1.0292e+00,  2.1000e-01,  6.1000e-01, ..., -

# Data Splite

In [55]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X,y,random_state = 42)

In [56]:
print(X_train.shape, "|", Y_train.shape)
print(X_test.shape, "|", Y_test.shape)

(17212, 15, 9) | (17212, 15)
(5738, 15, 9) | (5738, 15)


In [57]:
tf.random.set_seed(42)

In [58]:
X_train

array([[[  2.    ,   0.69  ,   1.0131, ...,  20.4   ,  13.6   ,
          26.9   ],
        [  3.    ,   0.53  ,   1.0132, ...,  22.5   ,  16.2   ,
          28.6   ],
        [  5.    ,   0.54  ,   1.0125, ...,  22.5   ,  18.6   ,
          27.1   ],
        ...,
        [  1.029 ,   2.49  ,   0.18  , ...,   3.8   ,   5.    ,
           1.0192],
        [  2.19  ,   0.    ,   7.2   , ...,   6.    ,   0.83  ,
           1.0082],
        [  1.13  ,   0.28  ,   2.1   , ...,   6.    ,   1.029 ,
           0.48  ]],

       [[  8.    ,   0.82  ,   1.0166, ...,  -1.8   ,  -3.2   ,
          -0.5   ],
        [  1.    ,   0.66  ,   1.0209, ...,  -3.6   ,  -6.    ,
           0.1   ],
        [  4.    ,   0.67  ,   1.017 , ...,  -3.3   ,  -7.2   ,
          -0.2   ],
        ...,
        [  1.0327,   0.96  ,   0.11  , ..., -15.6   ,   5.    ,
           1.0076],
        [  0.17  ,   0.    ,   1.4   , ...,   5.    ,   0.82  ,
           1.0142],
        [  0.23  ,   0.41  ,   3.4   , ...,   2.

In [59]:
#len(X_train[0])

In [60]:
#len(X_train[0][0])

# RNN

In [61]:
epochs = 30
batch_size = 32
n_hidden = 256

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(Y_train[0])

model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='sigmoid')) 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [62]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [63]:
model.fit(X_train,
          Y_train,
          batch_size=batch_size,
          validation_data=(X_test, Y_test),
          epochs=epochs)

Epoch 1/30
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.0933 - loss: 1119.8735 - val_accuracy: 0.1441 - val_loss: 9450.6162
Epoch 2/30
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.1419 - loss: 15915.4863 - val_accuracy: 0.1664 - val_loss: 41877.7852
Epoch 3/30
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.1310 - loss: 58449.5156 - val_accuracy: 0.1044 - val_loss: 108567.4375
Epoch 4/30
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.1278 - loss: 140537.2344 - val_accuracy: 0.1860 - val_loss: 217134.6406
Epoch 5/30
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.1257 - loss: 272657.9688 - val_accuracy: 0.1520 - val_loss: 379254.0625
Epoch 6/30
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.1184 - loss: 466015.1875 - val_accuracy: 0.

<keras.src.callbacks.history.History at 0x2344a927b00>

# Compiling and Running

In [64]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [65]:
model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=2)

Epoch 1/30
538/538 - 7s - 14ms/step - accuracy: 0.1420 - loss: 27418038.0000
Epoch 2/30
538/538 - 4s - 7ms/step - accuracy: 0.1294 - loss: 30670732.0000
Epoch 3/30
538/538 - 5s - 9ms/step - accuracy: 0.1279 - loss: 34420416.0000
Epoch 4/30
538/538 - 3s - 5ms/step - accuracy: 0.1275 - loss: 38293948.0000
Epoch 5/30
538/538 - 3s - 5ms/step - accuracy: 0.1262 - loss: 42294608.0000
Epoch 6/30
538/538 - 3s - 5ms/step - accuracy: 0.1232 - loss: 46635484.0000
Epoch 7/30
538/538 - 3s - 5ms/step - accuracy: 0.1255 - loss: 51148672.0000
Epoch 8/30
538/538 - 3s - 5ms/step - accuracy: 0.1285 - loss: 55626676.0000
Epoch 9/30
538/538 - 6s - 11ms/step - accuracy: 0.1230 - loss: 60502844.0000
Epoch 10/30
538/538 - 3s - 5ms/step - accuracy: 0.1212 - loss: 65581712.0000
Epoch 11/30
538/538 - 3s - 5ms/step - accuracy: 0.1227 - loss: 70954160.0000
Epoch 12/30
538/538 - 3s - 5ms/step - accuracy: 0.1204 - loss: 76534568.0000
Epoch 13/30
538/538 - 3s - 5ms/step - accuracy: 0.1217 - loss: 82246248.0000
Epoch 

<keras.src.callbacks.history.History at 0x2344c2f12e0>

In [66]:
# Define list of stations names

stations = {
0: 'BASEL',
1: 'BELGRADE',
2: 'BUDAPEST',
3: 'DEBILT',
4: 'DUSSELDORF',
5: 'HEATHROW',
6: 'KASSEL',
7: 'LJUBLJANA',
8: 'MAASTRICHT',
9: 'MADRID',
10: 'MUNCHENB',
11: 'OSLO',
12: 'SONNBLICK',
13: 'STOCKHOLM',
14: 'VALENTIA'

}

In [67]:
def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([stations[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([stations[y] for y in np.argmax(Y_pred, axis=1)])

    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])

In [68]:
station_names

{'BASEL',
 'BELGRADE',
 'BUDAPEST',
 'DEBILT',
 'DUSSELDORF',
 'HEATHROW',
 'KASSEL',
 'LJUBLJANA',
 'MAASTRICHT',
 'MADRID',
 'OSLO',
 'SONNBLICK',
 'STOCKHOLM',
 'VALENTIA'}

In [69]:
# Evaluate
print(confusion_matrix(Y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Pred        BASEL  VALENTIA
True                       
BASEL        3679         3
BELGRADE     1092         0
BUDAPEST      214         0
DEBILT         82         0
DUSSELDORF     29         0
HEATHROW       82         0
KASSEL         11         0
LJUBLJANA      61         0
MAASTRICHT      9         0
MADRID        458         0
MUNCHENB        8         0
OSLO            5         0
STOCKHOLM       4         0
VALENTIA        1         0


In [72]:
# Evaluate
print(confusion_matrix(Y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Pred        BASEL  BELGRADE  DEBILT  DUSSELDORF  LJUBLJANA  MADRID  MUNCHENB  \
True                                                                           
BASEL          48        14     302         479       1211      72       645   
BELGRADE        3         9       0           9        698      10       218   
BUDAPEST        0         0       0           0        112       1        69   
DEBILT          0         0       0           0         50       0        26   
DUSSELDORF      0         0       0           0         14       0        13   
HEATHROW        0         2       0           0         24       0        45   
KASSEL          0         0       0           0          4       0         2   
LJUBLJANA       0         1       0           0          6       3        37   
MAASTRICHT      0         0       0           0          5       0         3   
MADRID          0         5       6          