In [21]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [11]:
DATA_PATH = r"C:/Users/ZIRA/Desktop/MASTER/RP/mahir"
CITY = "New York"
WINDOW_SIZE = 24

In [12]:
csv_files = [
    "humidity.csv",
    "pressure.csv",
    "temperature.csv",
    "wind_direction.csv",
    "wind_speed.csv",
    "weather_description.csv"
]

def load_feature(file):
    df = pd.read_csv(os.path.join(DATA_PATH, file))
    df = df[["datetime", CITY]]
    df["datetime"] = pd.to_datetime(df["datetime"])
    feature_name = file.replace(".csv", "")
    return df.rename(columns={CITY: feature_name}).set_index("datetime")

#učitavanje svih csv-ova
dfs = [load_feature(f) for f in csv_files]

In [13]:
print(load_feature('humidity.csv'));

                     humidity
datetime                     
2012-10-01 12:00:00       NaN
2012-10-01 13:00:00      58.0
2012-10-01 14:00:00      57.0
2012-10-01 15:00:00      57.0
2012-10-01 16:00:00      57.0
...                       ...
2017-11-29 20:00:00       NaN
2017-11-29 21:00:00       NaN
2017-11-29 22:00:00       NaN
2017-11-29 23:00:00       NaN
2017-11-30 00:00:00       NaN

[45253 rows x 1 columns]


In [14]:
data = pd.concat(dfs, axis=1)

In [5]:
print(data)

                     humidity  pressure  temperature  wind_direction  \
datetime                                                               
2012-10-01 12:00:00       NaN       NaN          NaN             NaN   
2012-10-01 13:00:00      58.0    1012.0   288.220000           260.0   
2012-10-01 14:00:00      57.0    1012.0   288.247676           260.0   
2012-10-01 15:00:00      57.0    1012.0   288.326940           260.0   
2012-10-01 16:00:00      57.0    1012.0   288.406203           260.0   
...                       ...       ...          ...             ...   
2017-11-29 20:00:00       NaN       NaN          NaN             NaN   
2017-11-29 21:00:00       NaN       NaN          NaN             NaN   
2017-11-29 22:00:00       NaN       NaN          NaN             NaN   
2017-11-29 23:00:00       NaN       NaN          NaN             NaN   
2017-11-30 00:00:00       NaN       NaN          NaN             NaN   

                     wind_speed weather_description  
datetime 

In [16]:
data = data.ffill().bfill()

In [18]:
data = pd.get_dummies(data, columns=["weather_description"])

In [19]:
data = data.fillna(0)

In [22]:
data = data.astype(np.float32)

In [23]:
print(data)

                     humidity  pressure  temperature  wind_direction  \
datetime                                                               
2012-10-01 12:00:00      58.0    1012.0   288.220001           260.0   
2012-10-01 13:00:00      58.0    1012.0   288.220001           260.0   
2012-10-01 14:00:00      57.0    1012.0   288.247681           260.0   
2012-10-01 15:00:00      57.0    1012.0   288.326935           260.0   
2012-10-01 16:00:00      57.0    1012.0   288.406189           260.0   
...                       ...       ...          ...             ...   
2017-11-29 20:00:00      58.0    1020.0   284.980011             0.0   
2017-11-29 21:00:00      58.0    1020.0   284.980011             0.0   
2017-11-29 22:00:00      58.0    1020.0   284.980011             0.0   
2017-11-29 23:00:00      58.0    1020.0   284.980011             0.0   
2017-11-30 00:00:00      58.0    1020.0   284.980011             0.0   

                     wind_speed  weather_description_broken clo

In [27]:
numerical_cols = [col for col in data.columns if not col.startswith("weather_description_")]
scaler = MinMaxScaler()

In [25]:
print(numerical_cols)

['humidity', 'pressure', 'temperature', 'wind_direction', 'wind_speed']


In [28]:
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [29]:
print(data)

                     humidity  pressure  temperature  wind_direction  \
datetime                                                               
2012-10-01 12:00:00  0.533333  0.569893     0.629704        0.722222   
2012-10-01 13:00:00  0.533333  0.569893     0.629704        0.722222   
2012-10-01 14:00:00  0.522222  0.569893     0.630170        0.722222   
2012-10-01 15:00:00  0.522222  0.569893     0.631503        0.722222   
2012-10-01 16:00:00  0.522222  0.569893     0.632835        0.722222   
...                       ...       ...          ...             ...   
2017-11-29 20:00:00  0.533333  0.655914     0.575220        0.000000   
2017-11-29 21:00:00  0.533333  0.655914     0.575220        0.000000   
2017-11-29 22:00:00  0.533333  0.655914     0.575220        0.000000   
2017-11-29 23:00:00  0.533333  0.655914     0.575220        0.000000   
2017-11-30 00:00:00  0.533333  0.655914     0.575220        0.000000   

                     wind_speed  weather_description_broken clo

In [40]:
def create_sequences(data, target_col, window):
    X, y = [], []
    target_idx = data.columns.get_loc(target_col)
    print(data.columns)
    print(target_idx)
    print(len(data))
    print(len(data)-window)
    for i in range(len(data) - window):
        X.append(data.iloc[i:i+window].values)
        y.append(data.iloc[i+window, target_idx])
    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)
    return X, y

In [41]:
target_column = "temperature"

X, y = create_sequences(data, target_column, WINDOW_SIZE)

Index(['humidity', 'pressure', 'temperature', 'wind_direction', 'wind_speed',
       'weather_description_broken clouds', 'weather_description_drizzle',
       'weather_description_dust', 'weather_description_few clouds',
       'weather_description_fog', 'weather_description_freezing rain',
       'weather_description_haze',
       'weather_description_heavy intensity drizzle',
       'weather_description_heavy intensity rain',
       'weather_description_heavy snow',
       'weather_description_heavy thunderstorm',
       'weather_description_light intensity drizzle',
       'weather_description_light intensity shower rain',
       'weather_description_light rain',
       'weather_description_light rain and snow',
       'weather_description_light snow', 'weather_description_mist',
       'weather_description_moderate rain',
       'weather_description_overcast clouds',
       'weather_description_proximity thunderstorm',
       'weather_description_proximity thunderstorm with drizzl

In [46]:
print(X[1][1])
print(y[1])

[0.5222222  0.5698929  0.63016987 0.72222227 0.28       0.
 0.         0.         1.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        ]
0.66082716


In [33]:
print(y)

[0.6594944  0.66082716 0.6621599  ... 0.57521963 0.57521963 0.57521963]


In [51]:
print("X dtype:", X.dtype, "shape:", X.shape)
print("y dtype:", y.dtype, "shape:", y.shape)
print(len(data))

X dtype: float32 shape: (45229, 24, 41)
y dtype: float32 shape: (45229,)
45253
