# Predykcja wypożyczania rowerów z użyciem PyTorch
+ Autorzy: Łukasz Staniszewski, Łukasz Topolski
+ Uczelnia: Politechnika Warszawska

## Hiperparametry

In [68]:
# Width of the first hidden layer (output size of the first nn.Linear).
NUM_HIDDEN_1 = 14
# Width of the second hidden layer (output size of the second nn.Linear).
NUM_HIDDEN_2 = 6
# Number of samples per mini-batch produced by the training DataLoader.
BATCH_SIZE = 128
# Learning rate handed to the Adam optimizer.
LEARN_RATE = 0.05
# Number of full passes over the training set.
EPOCHS = 500

## Biblioteki

In [69]:
import torch
from torch import nn as nn
import pandas as pd
import torch.utils.data as data
import numpy as np

## Przygotowanie PyTorch

In [70]:
# Seed the default (CPU) generator: it drives the weight initialisation of the
# model and the shuffling order of the DataLoader.
torch.manual_seed(42)

# GPU operations have a separate seed we also want to set
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# Additionally, some operations on a GPU are implemented stochastic for efficiency
# We want to ensure that all operations are deterministic on GPU (if used) for reproducibility
# NOTE: the flag is spelled `deterministic`; a misspelled attribute is silently
# accepted by Python and leaves cuDNN in its default, non-deterministic mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [71]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when tensors are moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()

True

## Przygotowanie danych treningowych i ewaluacyjnych

In [72]:
# Load the training data; its last column (`cnt`) is later used as the
# regression target and the remaining columns as features.
data_training = pd.read_csv('data/data.csv')
data_training

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,17089,2012-12-19,4,1,12,19,0,3,1,1,0.38,0.3939,0.50,0.3881,7,329,336
10882,17090,2012-12-19,4,1,12,20,0,3,1,1,0.36,0.3485,0.57,0.2239,10,231,241
10883,17091,2012-12-19,4,1,12,21,0,3,1,1,0.34,0.3182,0.61,0.2239,4,164,168
10884,17092,2012-12-19,4,1,12,22,0,3,1,1,0.34,0.3485,0.61,0.0896,12,117,129


In [73]:
# Load the evaluation data (no `cnt` column); the trained model's predictions
# for these rows are written to data/result.csv at the end of the notebook.
data_test = pd.read_csv('data/evaluation_data.csv')
data_test

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,2011-01-20,1,0,1,0,0,4,1,1,0.26,0.2273,0.56,0.3881
1,2011-01-20,1,0,1,1,0,4,1,1,0.26,0.2727,0.56,0.0000
2,2011-01-20,1,0,1,2,0,4,1,1,0.26,0.2727,0.56,0.0000
3,2011-01-20,1,0,1,3,0,4,1,1,0.26,0.2576,0.56,0.1642
4,2011-01-20,1,0,1,4,0,4,1,1,0.26,0.2576,0.56,0.1642
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642
6489,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642
6490,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642
6491,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343


+ Ze względu na to, że w zbiorze testowym nie ma kolumn 'instant', 'casual' i 'registered':

In [74]:
# These columns do not exist in the evaluation set, so the model must not
# be trained on them.
columns_missing_in_test = ['instant', 'casual', 'registered']
data_training = data_training.drop(columns=columns_missing_in_test)
data_training

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,16
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,40
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,32
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,13
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19,4,1,12,19,0,3,1,1,0.38,0.3939,0.50,0.3881,336
10882,2012-12-19,4,1,12,20,0,3,1,1,0.36,0.3485,0.57,0.2239,241
10883,2012-12-19,4,1,12,21,0,3,1,1,0.34,0.3182,0.61,0.2239,168
10884,2012-12-19,4,1,12,22,0,3,1,1,0.34,0.3485,0.61,0.0896,129


+ Ze względu na to, że 'dteday' zawiera informacje zawarte w kolejnych kolumnach, można się go pozbyć z obu zbiorów:

In [75]:
# The date is already encoded by the season / yr / mnth / weekday columns,
# so the raw date string is removed from both frames.
data_training = data_training.drop(columns=['dteday'])
data_test = data_test.drop(columns=['dteday'])

In [76]:
# Display the training frame after the column drops.
data_training

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,4,1,12,19,0,3,1,1,0.38,0.3939,0.50,0.3881,336
10882,4,1,12,20,0,3,1,1,0.36,0.3485,0.57,0.2239,241
10883,4,1,12,21,0,3,1,1,0.34,0.3182,0.61,0.2239,168
10884,4,1,12,22,0,3,1,1,0.34,0.3485,0.61,0.0896,129


In [77]:
# Display the evaluation frame after the column drop.
data_test

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,1,0,1,0,0,4,1,1,0.26,0.2273,0.56,0.3881
1,1,0,1,1,0,4,1,1,0.26,0.2727,0.56,0.0000
2,1,0,1,2,0,4,1,1,0.26,0.2727,0.56,0.0000
3,1,0,1,3,0,4,1,1,0.26,0.2576,0.56,0.1642
4,1,0,1,4,0,4,1,1,0.26,0.2576,0.56,0.1642
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642
6489,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642
6490,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642
6491,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343


+ Sprawdźmy dane:

In [78]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
data_training.describe()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.501929,6.521495,11.541613,0.028569,2.998622,0.680875,1.418427,0.493436,0.473102,0.618865,0.191036,191.574132
std,1.116174,0.500019,3.444373,6.915838,0.166599,2.00777,0.466159,0.633839,0.190039,0.169492,0.19245,0.121859,181.144454
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0152,0.0,0.0,1.0
25%,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.47,0.1045,42.0
50%,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.62,0.194,145.0
75%,4.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.64,0.6212,0.77,0.2537,284.0
max,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,0.9091,1.0,0.8507,977.0


## Model sieci neuronowej

In [79]:
class NeuralNetwork(nn.Module):
    """Fully connected network with two hidden layers.

    Every linear layer, including the output layer, is followed by a ReLU,
    so the network never returns negative values.
    """

    def __init__(self, num_inputs, num_hidden_1, num_hidden_2, num_outputs):
        """Create the three linear layers and their activations.

        Args:
            num_inputs: Size of the input feature vector.
            num_hidden_1: Number of units in the first hidden layer.
            num_hidden_2: Number of units in the second hidden layer.
            num_outputs: Number of values produced per sample.
        """
        super().__init__()
        # Input -> first hidden layer.
        self.linear1 = nn.Linear(in_features=num_inputs,
                                 out_features=num_hidden_1)
        self.act_fn_1 = nn.ReLU()
        # First -> second hidden layer.
        self.linear2 = nn.Linear(in_features=num_hidden_1,
                                 out_features=num_hidden_2)
        self.act_fn_2 = nn.ReLU()
        # Second hidden layer -> output.
        self.linear3 = nn.Linear(in_features=num_hidden_2,
                                 out_features=num_outputs)
        # ReLU on the output clamps predictions at zero.
        self.fn_out = nn.ReLU()

    def forward(self, x):
        """Run `x` through the three linear + ReLU stages and return the result."""
        hidden_1 = self.act_fn_1(self.linear1(x))
        hidden_2 = self.act_fn_2(self.linear2(hidden_1))
        return self.fn_out(self.linear3(hidden_2))

+ Model wraz z optymalizacją i kryterium jakości:

In [80]:
# Build the network and place it on the selected device. The 12 inputs match
# the feature columns of `data_training` (every column except the last, `cnt`).
model = NeuralNetwork(
    num_inputs=12,
    num_hidden_1=NUM_HIDDEN_1,
    num_hidden_2=NUM_HIDDEN_2,
    num_outputs=1,
).to(device)

# Adam updates all model parameters; mean squared error is the training loss.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARN_RATE)
loss_module = nn.MSELoss()

## Wydzielenie zbioru treningowego i walidacyjnego

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
# Convert the frame to one NumPy matrix: the last column is the target,
# everything before it is a feature.
training_matrix = data_training.values
training_set_args = training_matrix[:, :-1]
training_set_vals = training_matrix[:, -1]

# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
x_train, x_val, y_train, y_val = train_test_split(
    training_set_args,
    training_set_vals,
    test_size=0.25,
    random_state=42,
)

In [83]:
# from_numpy keeps the NumPy dtype, so these tensors are not cast here; the
# training loop converts each batch to float32 itself.
train_features = torch.from_numpy(x_train)
train_targets = torch.from_numpy(y_train)
train_dataset = data.TensorDataset(train_features, train_targets)

In [84]:
# Peek at the first (features, target) pair to check values and dtypes.
next(iter(train_dataset))

(tensor([3.0000, 0.0000, 7.0000, 0.0000, 0.0000, 1.0000, 1.0000, 1.0000, 0.7000,
         0.6515, 0.6500, 0.1940], dtype=torch.float64),
 tensor(35., dtype=torch.float64))

In [85]:
# Serve the training set in shuffled mini-batches of BATCH_SIZE samples.
train_data_loader = data.DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

## Trenowanie

In [86]:
# Put the model into training mode before optimising.
model.train()
for epoch in range(EPOCHS):
    for data_inputs, data_labels in train_data_loader:
        # Move the batch to the same device as the model.
        data_inputs = data_inputs.to(device)
        data_labels = data_labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # The model returns [batch size, 1]; drop the trailing dimension so
        # the predictions line up with the [batch size] label tensor.
        preds = model(data_inputs.float()).squeeze(dim=1)
        loss = loss_module(preds, data_labels.float())

        # Backpropagate and apply one optimizer update.
        loss.backward()
        optimizer.step()

    # Report on epochs 1, 11, 21, ...; the printed value is the loss of the
    # last mini-batch of that epoch, not an epoch average.
    if epoch % 10 == 1:
        print(f"Epoch: {epoch}, loss: {loss.item():.3}")

Epoch: 1, loss: 1.81e+04
Epoch: 11, loss: 1.62e+04
Epoch: 21, loss: 1.11e+04
Epoch: 31, loss: 1.67e+04
Epoch: 41, loss: 1.04e+04
Epoch: 51, loss: 1.5e+04
Epoch: 61, loss: 9.9e+03
Epoch: 71, loss: 1.14e+04
Epoch: 81, loss: 1.45e+04
Epoch: 91, loss: 1.07e+04
Epoch: 101, loss: 1.55e+04
Epoch: 111, loss: 1.8e+04
Epoch: 121, loss: 1.39e+04
Epoch: 131, loss: 6.97e+03
Epoch: 141, loss: 8.29e+03
Epoch: 151, loss: 6.11e+03
Epoch: 161, loss: 6.44e+03
Epoch: 171, loss: 6.04e+03
Epoch: 181, loss: 4.36e+03
Epoch: 191, loss: 4.76e+03
Epoch: 201, loss: 2.2e+03
Epoch: 211, loss: 2.48e+03
Epoch: 221, loss: 6e+03
Epoch: 231, loss: 2.74e+03
Epoch: 241, loss: 3.2e+03
Epoch: 251, loss: 3.12e+03
Epoch: 261, loss: 4.78e+03
Epoch: 271, loss: 3.62e+03
Epoch: 281, loss: 3.35e+03
Epoch: 291, loss: 2.52e+03
Epoch: 301, loss: 5.47e+03
Epoch: 311, loss: 5.54e+03
Epoch: 321, loss: 4.42e+03
Epoch: 331, loss: 3.64e+03
Epoch: 341, loss: 3.41e+03
Epoch: 351, loss: 3.93e+03
Epoch: 361, loss: 2.4e+03
Epoch: 371, loss: 2.5

## Obliczenie błędu na zbiorze walidacyjnym

+ Funkcja oceniająca - rmsle:

In [87]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of the predictions.

    Computes ``sqrt(mean((log(1 + max(pred, 0)) - log(1 + true)) ** 2))``.
    Negative predictions are clamped to 0 before the logarithm is taken.

    Args:
        y_true: Sequence or array of reference values.
        y_pred: Sequence or array of predicted values, one per reference
            value.

    Returns:
        The RMSLE as a NumPy float. An empty input yields ``nan`` (the mean
        of an empty array).

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` hold a different number of
            values.
    """
    true_values = np.asarray(y_true, dtype=np.float64).ravel()
    pred_values = np.asarray(y_pred, dtype=np.float64).ravel()
    if true_values.shape != pred_values.shape:
        # A silent truncation here would score only part of the predictions.
        raise ValueError(
            f"y_true has {true_values.size} values but y_pred has "
            f"{pred_values.size}")

    # log1p(x) == log(1 + x), evaluated on whole arrays instead of a
    # Python-level loop over indices.
    log_diff = np.log1p(np.maximum(pred_values, 0.0)) - np.log1p(true_values)
    return np.sqrt(np.mean(log_diff ** 2))

In [88]:
# Validation tensors are cast to float32 up front, the dtype the model is fed.
x_val_tensor = torch.from_numpy(x_val).float()
y_val_tensor = torch.from_numpy(y_val).float()
val_dataset = data.TensorDataset(x_val_tensor, y_val_tensor)
# Peek at the first (features, target) pair.
next(iter(val_dataset))

(tensor([ 3.0000,  0.0000,  7.0000, 11.0000,  0.0000,  2.0000,  1.0000,  1.0000,
          0.8200,  0.8030,  0.5900,  0.0000]),
 tensor(127.))

+ Wynik na zbiorze walidacyjnym:

In [89]:
# Switch to evaluation mode and collect one prediction per validation row.
model.eval()
preds_list = []

# Predict in mini-batches instead of one forward pass (and one device
# transfer) per sample. shuffle=False keeps the predictions in the same
# order as y_val.
val_data_loader = data.DataLoader(val_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False)
with torch.no_grad():
    for data_inputs, _ in val_data_loader:
        preds = model(data_inputs.to(device).float())
        # [batch size, 1] -> [batch size]; store plain Python floats.
        preds_list.extend(preds.squeeze(dim=1).cpu().tolist())

rmsle(y_val, preds_list)

0.7711444460096335

## Predykcja na zbiorze testowym i zapisanie do csv

In [90]:
# The evaluation frame has no target column, so the dataset wraps a single
# float32 feature tensor and yields 1-tuples.
test_features = torch.from_numpy(data_test.values).float()
test_dataset = data.TensorDataset(test_features)
next(iter(test_dataset))

(tensor([1.0000, 0.0000, 1.0000, 0.0000, 0.0000, 4.0000, 1.0000, 1.0000, 0.2600,
         0.2273, 0.5600, 0.3881]),)

In [91]:
# Predict the evaluation rows and write one prediction per line to a CSV file.
model.eval()
preds_list_test = []

# Predict in mini-batches instead of one forward pass (and one device
# transfer) per sample. shuffle=False keeps the output rows in the same
# order as data_test.
test_data_loader = data.DataLoader(test_dataset,
                                   batch_size=BATCH_SIZE,
                                   shuffle=False)
with torch.no_grad():
    # The dataset wraps a single tensor, so every batch is a 1-element sequence.
    for (data_input_test,) in test_data_loader:
        preds_val = model(data_input_test.to(device).float())
        # [batch size, 1] -> [batch size]; store plain Python floats.
        preds_list_test.extend(preds_val.squeeze(dim=1).cpu().tolist())

df_output = pd.DataFrame(preds_list_test)
# No index and no header: the file contains only the predicted values.
csv = df_output.to_csv(index=False, header=False)
with open('data/result.csv', 'w', newline="") as f:
    f.write(csv)

In [92]:
# Sanity check: smallest predicted value written to the result file.
df_output.min()

0    4.625254
dtype: float64