## Model prosty

Model prosty opiera się na średnim czasie dostawy dla danego miasta.

In [1]:
import numpy as np
import pandas as pd

In [2]:
def loadData(file_name):
    return pd.read_csv("../data/processed/" + file_name)

train_data = loadData("train.csv")
test_data = loadData("test.csv")

### Obliczenie średniego czasu dostawy do poszczególnych miast

In [3]:
cities = []

for col in train_data.columns:
    if col.find("city_") != -1:
        cities.append(col)

In [4]:
city_mean_time = []

for city in cities:
    city_data = train_data[train_data[city] == 1]
    mean = city_data["time"].mean()
    city_mean_time.append({"city": city, "mean": mean})

In [5]:
city_mean_time

[{'city': 'city_Gdynia', 'mean': 64.26805205119678},
 {'city': 'city_Kraków', 'mean': 62.62076919196445},
 {'city': 'city_Poznań', 'mean': 34.08967932140434},
 {'city': 'city_Radom', 'mean': 33.88207843723295},
 {'city': 'city_Szczecin', 'mean': 45.3281002498343},
 {'city': 'city_Warszawa', 'mean': 67.42875904248407},
 {'city': 'city_Wrocław', 'mean': 36.56925459679495}]

### Zapisanie prostego modelu ze średnim czasem dostawy do poszczególnych miast

In [6]:
city_mean_time_df = pd.DataFrame(city_mean_time)

In [7]:
city_mean_time_df.to_json("../models/simple_model.jsonl", orient = "records", lines = True)

### Obliczenie błędu średniokwadratowego dla modelu prostego na zbiorze testowym

In [8]:
def calcSE(expected_values, predicted_value):
    squared_errors = []
    for expected_value in expected_values:
        squared_errors.append(np.square(expected_value - predicted_value))

    return squared_errors

In [9]:
cities_MSE = []
all_cities_SE = []

for i in city_mean_time:
    city = i["city"]
    predicted_value = i["mean"]
    expected_values = test_data[test_data[city] == 1]["time"]
    SE = calcSE(expected_values, predicted_value)
    cities_MSE.append({"city": city, "MSE": np.mean(SE)})
    all_cities_SE += SE
    
MSE = np.mean(all_cities_SE)

In [10]:
for i in cities_MSE:
    print(i["city"] + ": " + str(i["MSE"]))

city_Gdynia: 233.55553585217558
city_Kraków: 237.8301371682691
city_Poznań: 193.39858162645535
city_Radom: 281.588009546685
city_Szczecin: 161.68071110843812
city_Warszawa: 196.210032624335
city_Wrocław: 259.67779589676746


In [11]:
MSE

218.92768308594898

### Zapisanie obliczonych błędów średniokwadratowych jako dane statystyczne

In [12]:
cities_MSE_df = pd.DataFrame(cities_MSE)
cities_MSE_df.to_json("../statistics/" + "cities_MSE_test_simple_model.jsonl", orient = "records", lines = True)

In [13]:
MSE_df.to_json("../statistics/" + "MSE_test_simple_model.jsonl", orient = "records", lines = True)

NameError: name 'MSE_df' is not defined