In [7]:
import datetime
import definitions as d
import geopy.distance
import neural_network as nn
import numpy as np
import os
import pandas as pd

from sklearn.preprocessing import OneHotEncoder


def load_raw_data(path, type):
    """Load every ``<name>_<type>.csv`` file found in ``<path>/<type>``.

    Parameters
    ----------
    path : str
        Directory that holds one sub-directory per data split.
    type : str
        Split name; used both as the sub-directory name and as the file-name
        suffix (``_<type>.csv``).

    Returns
    -------
    dict
        Maps ``<name>`` (file name without the ``_<type>.csv`` suffix) to the
        DataFrame read from that file with ``;`` as the separator. Keys are
        inserted in sorted order.
    """
    directory = os.path.join(path, type)
    suffix = f"_{type}.csv"
    # Keep only files that really carry the suffix: entries such as
    # ".ipynb_checkpoints" or ".DS_Store" used to be loaded under a truncated
    # key because str.find() returns -1 when the suffix is absent.
    # Sort by table name: os.listdir() gives no ordering guarantee, and the
    # insertion order of this dict decides the column order of the per-city
    # frames built later, so it has to be identical for every split.
    names = sorted(
        file[:-len(suffix)]
        for file in os.listdir(directory)
        if file.endswith(suffix)
    )
    return {
        name: pd.read_csv(os.path.join(directory, name + suffix), sep=";")
        for name in names
    }

def get_nearest_cities(city_attributes, count=3):
    """Find the closest other cities for every city.

    Parameters
    ----------
    city_attributes : pandas.DataFrame
        Must provide ``City``, ``Latitude`` and ``Longitude`` columns. The
        coordinates are handed to ``geopy.distance.geodesic`` unchanged, so
        they must not have been standardised yet.
    count : int, optional
        How many neighbours to keep per city (default 3).

    Returns
    -------
    dict
        Maps each city name to a list of up to ``count`` other city names,
        ordered by increasing geodesic distance in kilometres.
    """
    # Extract the columns once instead of running iterrows() inside a nested loop.
    names = city_attributes["City"].tolist()
    coords = list(zip(city_attributes["Latitude"], city_attributes["Longitude"]))

    nearest_cities = {}
    for name, source in zip(names, coords):
        city_dist = [
            (other, geopy.distance.geodesic(source, destination).km)
            for other, destination in zip(names, coords)
            # Compare names by value: `is` tests object identity and only
            # happened to work when both rows shared the same str object.
            if other != name
        ]
        city_dist.sort(key=lambda pair: pair[1])
        nearest_cities[name] = [other for other, _ in city_dist[:count]]
    return nearest_cities
    
def load_train():
    """Load the training tables from ``data/train``.

    The ``weather_description`` table is removed and the first 12 rows of
    every remaining table are dropped.

    Returns
    -------
    dict
        Table name -> DataFrame without its first 12 rows.
    """
    tables = load_raw_data("data", "train")
    tables.pop("weather_description")
    # NOTE(review): skipping 12 rows presumably aligns the series with a day
    # boundary for the 24-row windows built later — confirm against the data.
    return {name: frame.iloc[12:, :] for name, frame in tables.items()}

def load_test():
    """Load the test tables from ``data/test``.

    The ``weather_description`` table is removed and the last row of every
    remaining table is dropped.

    Returns
    -------
    dict
        Table name -> DataFrame without its last row.
    """
    tables = load_raw_data("data", "test")
    tables.pop("weather_description")
    # NOTE(review): dropping the final row presumably makes the row count a
    # multiple of 24 — confirm against the data.
    return {name: frame.iloc[:-1, :] for name, frame in tables.items()}

def get_normalization_params(raw):
    """Compute one (mean, std) pair per table, ignoring NaNs.

    Parameters
    ----------
    raw : dict
        Table name -> DataFrame. The first column of each frame is skipped;
        all remaining columns are pooled into a single flat array.

    Returns
    -------
    dict
        Table name -> ``(nanmean, nanstd)`` over the pooled values.
    """
    def _pooled_stats(frame):
        # Drop the first column, then flatten the remaining columns together.
        values = frame.to_numpy()[:, 1:].reshape(-1)
        return (np.nanmean(values), np.nanstd(values))

    return {key: _pooled_stats(frame) for key, frame in raw.items()}

def to_city_time_vect(raw):
    """Regroup per-table frames into one frame per city.

    Parameters
    ----------
    raw : dict
        Table name -> DataFrame with a ``datetime`` column followed by one
        column per city. The city list and the ``datetime`` column are taken
        from the first table in the dict.

    Returns
    -------
    dict
        City name -> DataFrame holding ``datetime`` plus one column per table
        (named after the table), added in the iteration order of ``raw``.
    """
    first_table = next(iter(raw.values()))
    cities = first_table.columns[1:]
    hours = first_table[["datetime"]]

    ctvs = {}
    for city in cities:
        city_frame = hours.copy()
        for key, table in raw.items():
            city_frame[key] = table[[city]]
        ctvs[city] = city_frame
    return ctvs

def normalize(ctv, params):
    """Standardise the per-city frames in place.

    Parameters
    ----------
    ctv : dict
        City name -> DataFrame containing one column per key of ``params``.
    params : dict
        Column name -> ``(mean, std)``; each listed column is replaced by
        ``(column - mean) / std``.

    Returns
    -------
    None
        The frames in ``ctv`` are modified directly.
    """
    for frame in ctv.values():
        for column, (mean, std) in params.items():
            centered = frame[column] - mean
            frame[column] = centered / std

def normalize_city_attributes(city_attributes):
    """Standardise ``Latitude`` and ``Longitude`` in place.

    Each of the two columns is replaced by ``(value - mean) / std`` using that
    column's own ``Series.mean()`` and ``Series.std()``. The frame passed in
    is modified directly and nothing is returned, so calling this twice on the
    same frame standardises already-standardised values.
    """
    for column in ("Latitude", "Longitude"):
        values = city_attributes[column]
        city_attributes[column] = (values - values.mean()) / values.std()

def to_city_day_vect(ctv, wind_treshold):
    """Cut every per-city frame into consecutive 24-row records.

    Parameters
    ----------
    ctv : dict
        City name -> DataFrame whose first column is the timestamp and which
        has ``temperature`` and ``wind_speed`` columns.
    wind_treshold : float
        Value that ``wind_speed`` is compared against (same scale as the
        frame's ``wind_speed`` column).

    Returns
    -------
    dict
        City name -> list of ``(date, vec, temp_mean, wind_cat)`` tuples:
        ``date`` is the first-column value of the window's first row, ``vec``
        is every column but the first flattened row by row, ``temp_mean`` is
        the window's mean temperature and ``wind_cat`` is 1 when any
        ``wind_speed`` in the window exceeds the threshold, otherwise 0.

    If a frame's length is not a multiple of 24, the final record covers fewer
    rows and its ``vec`` is correspondingly shorter.
    """
    rows_per_day = 24
    cdv = {}
    for city, df in ctv.items():
        records = []
        for start in range(0, len(df), rows_per_day):
            window = df.iloc[start : start + rows_per_day]
            features = window.iloc[:, 1:].to_numpy().reshape(-1)
            exceeds = window["wind_speed"].to_numpy() > wind_treshold
            records.append(
                (
                    window.iloc[0, 0],
                    features,
                    window["temperature"].mean(),
                    int(np.any(exceeds)),
                )
            )
        cdv[city] = records
    return cdv

def get_city_encoder(cities_attr):
    """Fit a one-hot encoder on the city names.

    Parameters
    ----------
    cities_attr : pandas.DataFrame
        Frame with a ``City`` column; its values are what the encoder is
        fitted on.

    Returns
    -------
    OneHotEncoder
        The fitted ``sklearn.preprocessing.OneHotEncoder`` instance.
    """
    # Read from the argument: the previous body ignored `cities_attr` and used
    # the notebook-global `city_attributes_raw`, so it only worked when that
    # global existed and silently encoded it whatever the caller passed.
    cities = cities_attr["City"].to_numpy().reshape(-1, 1)
    cohe = OneHotEncoder()
    cohe.fit(cities)
    return cohe

def get_wind_treshold(souce, params):
    """Express a raw wind-speed threshold on the standardised scale.

    Parameters
    ----------
    souce : float
        Threshold in the original ``wind_speed`` units.
    params : dict
        Must contain ``"wind_speed"`` -> ``(mean, std)``.

    Returns
    -------
    float
        ``(souce - mean) / std``.
    """
    wind_mean, wind_std = params["wind_speed"]
    centered = souce - wind_mean
    return centered / wind_std

def drop_nan_records(data_set):
    """Remove every column that holds a NaN in any of the arrays.

    Parameters
    ----------
    data_set : dict
        Name -> 2-D array. All arrays must have the same number of columns,
        because one shared column mask is applied to each of them.

    Returns
    -------
    dict
        Same keys, each array restricted to the columns that are NaN-free in
        every array of ``data_set``.
    """
    # One boolean row per array: True where that array has a NaN in the column.
    per_array = np.vstack([np.isnan(values).any(axis=0) for values in data_set.values()])
    keep = ~per_array.any(axis=0)
    return {name: values[:, keep] for name, values in data_set.items()}

def get_set1(cdv, city_encoder, city_attributes_raw):
    """Assemble model inputs and targets from the per-city day records.

    For every city, the feature vectors of days ``i``, ``i + 1`` and ``i + 2``
    become inputs ``d1``, ``d2`` and ``d3``; the targets and the date come from
    day ``i + 4``.

    Parameters
    ----------
    cdv : dict
        City name -> list of ``(date, vec, temp_mean, wind_cat)`` tuples, with
        ``date`` formatted as ``"%d.%m.%Y %H:%M"``.
    city_encoder
        Object whose ``transform([[city]]).toarray()[0]`` gives the city's
        one-hot vector.
    city_attributes_raw : pandas.DataFrame
        Frame with ``City``, ``Latitude`` and ``Longitude`` columns.

    Returns
    -------
    dict
        Keys ``d1``, ``d2``, ``d3``, ``output_temp``, ``output_wind``,
        ``date``, ``city_one_hot`` and ``cord``. Each value is the stacked
        per-sample data transposed, so there is one column per sample.
        ``date`` is day-of-year / 365 - 0.5.
    """
    columns = {
        "d1": [],
        "d2": [],
        "d3": [],
        "output_temp": [],
        "output_wind": [],
        "date": [],
        "city_one_hot": [],
        "cord": [],
    }

    for city, days in cdv.items():
        targets = days[4:]
        sample_count = len(targets)

        columns["d1"].extend(day[1] for day in days[:-4])
        columns["d2"].extend(day[1] for day in days[1:-3])
        columns["d3"].extend(day[1] for day in days[2:-2])
        columns["output_temp"].extend(day[2] for day in targets)
        columns["output_wind"].extend(day[3] for day in targets)

        for day in targets:
            parsed = datetime.datetime.strptime(day[0], "%d.%m.%Y %H:%M")
            columns["date"].append(parsed.timetuple().tm_yday / 365 - 0.5)

        # The city-level features are identical for every sample of this city.
        one_hot = city_encoder.transform([[city]]).toarray()[0]
        columns["city_one_hot"].extend([one_hot] * sample_count)

        city_rows = city_attributes_raw.loc[city_attributes_raw["City"] == city]
        coordinates = city_rows[["Latitude", "Longitude"]].to_numpy()
        columns["cord"].extend([coordinates] * sample_count)

    return {key: np.vstack(values).T for key, values in columns.items()}

# City metadata; the helpers above read its City, Latitude and Longitude columns.
city_attributes_raw = pd.read_csv("data/city_attributes.csv", sep=";")

# --- Training split ---
train_raw = load_train()
# Must run before normalize_city_attributes() below, which overwrites
# Latitude/Longitude in place with standardised values.
nearest_cities = get_nearest_cities(city_attributes_raw)
# Per-table mean/std, computed from the training tables only.
normalization_params = get_normalization_params(train_raw)
train_ctv = to_city_time_vect(train_raw)
normalize(train_ctv, normalization_params)
normalize_city_attributes(city_attributes_raw)
# Threshold of 6 in raw wind_speed units, converted to the standardised scale.
wind_treshold = get_wind_treshold(6, normalization_params)
train_cdv = to_city_day_vect(train_ctv, wind_treshold)
city_encoder = get_city_encoder(city_attributes_raw)
train_set = get_set1(train_cdv, city_encoder, city_attributes_raw)
# NaN filtering is currently disabled for the training set.
# train_set = drop_nan_records(train_set)

# --- Test split ---
# Reuses the training normalisation parameters, wind threshold and city encoder.
test_raw = load_test()
test_ctv = to_city_time_vect(test_raw)
normalize(test_ctv, normalization_params)
test_cdv = to_city_day_vect(test_ctv, wind_treshold)
test_set = get_set1(test_cdv, city_encoder, city_attributes_raw)
# NaN filtering is currently disabled for the test set.
# test_set = drop_nan_records(test_set)

In [8]:
import neural_network as nn
import definitions as d

# Shared NumPy random generator with a fixed seed (0); it is handed to every
# FullConnectLayer built below and to NeuralNetwork.train().
rng = np.random.default_rng(0)

def get_day_layer(num):
    """Build the branch for one input day.

    Creates ``nn.InputLayer(120, "d<num>")`` and wraps it in
    ``nn.FullConnectLayer(..., 60, d.relu, rng)``.

    NOTE(review): 120 presumably equals 24 hourly rows times 5 features per
    row, and 60 is presumably the layer's output width — confirm against
    the data and ``neural_network``.
    """
    day_input = nn.InputLayer(120, f"d{num}")
    return nn.FullConnectLayer(day_input, 60, d.relu, rng)

def get_days_layer():
    """Build the three day branches (``d1``..``d3``) and join them.

    The branches are combined with ``nn.MergeLayer`` and followed by
    ``nn.FullConnectLayer(..., 60, d.relu, rng)``.
    """
    day_branches = [get_day_layer(day) for day in (1, 2, 3)]
    merged = nn.MergeLayer(day_branches)
    return nn.FullConnectLayer(merged, 60, d.relu, rng)

def get_city_layer():
    """Merge the city-level inputs into one layer.

    Inputs, in order: ``city_one_hot`` (size 36), ``date`` (size 1) and
    ``cord`` (size 2). The names match the keys produced by ``get_set1``.
    """
    city_inputs = [
        nn.InputLayer(36, "city_one_hot"),
        nn.InputLayer(1, "date"),
        nn.InputLayer(2, "cord"),
    ]
    return nn.MergeLayer(city_inputs)

def get_nn(layer_sizes, activations, loss):
    """Assemble the full network on top of the day and city branches.

    Parameters
    ----------
    layer_sizes : sequence of int
        Second argument passed to each stacked ``nn.FullConnectLayer``.
    activations : sequence
        Activation passed to each stacked layer; must have the same length as
        ``layer_sizes`` (checked with ``assert``).
    loss
        Loss object handed to ``nn.NeuralNetwork``.

    Returns
    -------
    nn.NeuralNetwork
        Network whose final layer is the last stacked ``FullConnectLayer``.
    """
    assert len(layer_sizes) == len(activations)

    days_branch = get_days_layer()
    city_branch = get_city_layer()
    head = nn.MergeLayer([days_branch, city_branch])
    for size, activation in zip(layer_sizes, activations):
        head = nn.FullConnectLayer(head, size, activation, rng)
    return nn.NeuralNetwork(head, loss)

In [10]:
# Wind model: stacked layers of size 60 (ReLU), 20 (sigmoid) and 2 (softmax),
# trained on the "output_wind" target with cross-entropy loss.
# NOTE(review): 1024 is presumably the batch size — confirm against
# neural_network.NeuralNetwork.train.
net2 = get_nn([60, 20, 2], [d.relu, d.sigmoid, d.softmax], d.cross_entropy_loss)
net2.train(train_set, test_set, 1024, "output_wind", rng)

KeyboardInterrupt: 

In [3]:
# Temperature model: stacked layers of size 60 (ReLU) and 1 (linear), trained
# on the "output_temp" target with L2 loss.
# NOTE(review): 1024 is presumably the batch size — confirm against
# neural_network.NeuralNetwork.train.
net = get_nn([60, 1], [d.relu, d.linear], d.l2_loss)
net.train(train_set, test_set, 1024, "output_temp", rng)

Train: 0.23989644624107975, test: 1.549964456313118
Train: 0.11328532739660888, test: 1.6732014095030603
Train: 0.10447389469839531, test: 1.7348990708800631
Train: 0.09915099681070151, test: 1.7099880592021324
Train: 0.0953802111296878, test: 1.6726657026078102
Train: 0.09187527024395371, test: 1.636069154311085
Train: 0.08869845809289927, test: 1.57824380989809
Train: 0.08590923306149792, test: 1.5451232939103718
Train: 0.08380313094703273, test: 1.5019675628669582
Train: 0.08273224193283998, test: 1.4599433875430916
Train: 0.08091403524347734, test: 1.4703741700289505
Train: 0.07984315748641627, test: 1.496271700265284
Train: 0.07768734791819104, test: 1.45642805118814
Train: 0.0766122089205214, test: 1.4766793432173422
Train: 0.07594758653617394, test: 1.418715028507999
Train: 0.0748376553635245, test: 1.4305429581089846
Train: 0.07374439199246278, test: 1.3987005072175998
Train: 0.07316565872409382, test: 1.3659230405843878
Train: 0.07208331507284216, test: 1.397997137563094
Train

KeyboardInterrupt: 