In [1]:
import torch
from torch import nn as nn
import pandas as pd
import torch.utils.data as data
import numpy as np

In [2]:
# GPU operations have a separate seed we also want to set
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# Additionally, some operations on a GPU are implemented stochastic for efficiency
# We want to ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

In [3]:
device = torch.device("cpu")
device

device(type='cpu')

## Data preprocessing

In [4]:
data_training = pd.read_csv('data.csv')
data_training.head(5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
data_test = pd.read_csv('evaluation_data.csv')
data_test.head(5)

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,2011-01-20,1,0,1,0,0,4,1,1,0.26,0.2273,0.56,0.3881
1,2011-01-20,1,0,1,1,0,4,1,1,0.26,0.2727,0.56,0.0
2,2011-01-20,1,0,1,2,0,4,1,1,0.26,0.2727,0.56,0.0
3,2011-01-20,1,0,1,3,0,4,1,1,0.26,0.2576,0.56,0.1642
4,2011-01-20,1,0,1,4,0,4,1,1,0.26,0.2576,0.56,0.1642


Pozbywamy się następujących kolumn:
- _dteday_ ze zbioru treningowego i ewaluacyjnego, ponieważ dni tygodnia są zawarte w innej kolumnie
- _instant_, _casual_, _registered_ ze zbioru treningowego, ponieważ te kolumny nie występują w zbiorze ewaluacyjnym

In [6]:
# zakodowac miesiace

In [7]:
# dropping unnecessary data
data_training.drop(columns=['instant', 'casual', 'registered', 'dteday'], inplace=True)
data_test.drop(columns=['dteday'], inplace=True)

In [8]:
data_training.describe()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.501929,6.521495,11.541613,0.028569,2.998622,0.680875,1.418427,0.493436,0.473102,0.618865,0.191036,191.574132
std,1.116174,0.500019,3.444373,6.915838,0.166599,2.00777,0.466159,0.633839,0.190039,0.169492,0.19245,0.121859,181.144454
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0152,0.0,0.0,1.0
25%,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.47,0.1045,42.0
50%,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.62,0.194,145.0
75%,4.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.64,0.6212,0.77,0.2537,284.0
max,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,0.9091,1.0,0.8507,977.0


In [9]:
data_test.describe()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
count,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0
mean,2.4933,0.503619,6.56507,11.555367,0.029108,3.012167,0.685815,1.436778,0.502942,0.480257,0.641252,0.188525
std,1.091258,0.500025,3.429462,6.912526,0.168123,2.002541,0.464226,0.64839,0.196575,0.175655,0.192934,0.123137
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.16,0.0
25%,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.49,0.1045
50%,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.52,0.5,0.65,0.1642
75%,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.81,0.2537
max,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,0.98,1.0,1.0,0.8358


In [10]:
def normalize_columns(df):
    for column in df.columns:
        df[column] = (df[column] - df[column].mean())/df[column].var()
    return df

## Splitting dataset into training and testing

In [11]:
X = data_training.iloc[:, :-1]
y = data_training.iloc[:, -1]

In [12]:
X_norm = normalize_columns(X)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X_norm, y,
                                                                test_size=0.20,
                                                                random_state=13)

# Architektura sieci

In [14]:
class NeuralNetwork(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hidden_1, num_hidden_2):
        super().__init__()
        self.linear1 = nn.Linear(num_inputs, num_hidden_1)
        self.act_fn_1 = nn.ReLU()
        self.linear2 = nn.Linear(num_hidden_1, num_hidden_2)
        self.act_fn_2 = nn.ReLU()
        self.linear3 = nn.Linear(num_hidden_2, num_outputs)
        self.out = nn.ReLU()

    def forward(self, x):
        x = self.linear1(x)
        x = self.act_fn_1(x)
        x = self.linear2(x)
        x = self.act_fn_2(x)
        x = self.linear3(x)
        x = self.out(x)
        return x

### Model definition

In [15]:
N_INPUTS = len(X_train.columns)
N_OUTPUTS = 1
N_HIDDEN_1 = 64
N_HIDDEN_2 = 100
BATCH_SIZE = 512
model = NeuralNetwork(N_INPUTS, N_OUTPUTS, N_HIDDEN_1, N_HIDDEN_2)

In [16]:
learning_rate = 0.05
EPOCHS = 50

In [17]:
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_module = nn.MSELoss()

In [18]:
from torch.utils.data import TensorDataset, DataLoader

tensor_x = torch.Tensor(X_train.values)
tensor_y = torch.Tensor(y_train.values)

my_dataset = TensorDataset(tensor_x,tensor_y)
train_data_loader = DataLoader(my_dataset)

In [None]:
model.train()
for epoch in range(EPOCHS):
    for data_inputs, data_labels in train_data_loader:
        data_inputs = data_inputs.to(device)
        data_labels = data_labels.to(device)

        preds = model(data_inputs.float())
        preds = preds.squeeze(dim=1)

        loss = loss_module(preds, data_labels.float())

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
    if epoch % 2 == 1:
        print(f"Epoch: {epoch}, loss: {loss.item():.4}")

### Validation dataset

In [None]:
validation_data_loader = data.TensorDataset(
    torch.from_numpy(X_validation.values).float(),
    torch.from_numpy(y_validation.values).float())

In [None]:
def rmsle(y_true,y_pred):
    n = len(y_true)
    msle = np.mean([(np.log(max(y_pred[i],0) + 1) - np.log(y_true[i] + 1)) ** 2.0 for i in range(n)])
    return np.sqrt(msle)

In [None]:
model.eval()
preds_list = []
with torch.no_grad():
    for data_inputs, data_target in validation_data_loader:
        data_inputs = data_inputs.to(device)
        preds = model(data_inputs.float())
        preds_cp = preds.cpu()
        preds_list.append(float(preds_cp))

In [None]:
rmsle(y_validation.values, preds_list)

# Producing predictions

In [24]:
df_eval = pd.read_csv("evaluation_data.csv")

In [25]:
df_eval.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,2011-01-20,1,0,1,0,0,4,1,1,0.26,0.2273,0.56,0.3881
1,2011-01-20,1,0,1,1,0,4,1,1,0.26,0.2727,0.56,0.0
2,2011-01-20,1,0,1,2,0,4,1,1,0.26,0.2727,0.56,0.0
3,2011-01-20,1,0,1,3,0,4,1,1,0.26,0.2576,0.56,0.1642
4,2011-01-20,1,0,1,4,0,4,1,1,0.26,0.2576,0.56,0.1642


In [26]:
df_eval.drop(columns=['dteday'], inplace=True)

In [27]:
eval_dataset = data.TensorDataset(torch.from_numpy(df_eval.values).float())
next(iter(eval_dataset))

(tensor([1.0000, 0.0000, 1.0000, 0.0000, 0.0000, 4.0000, 1.0000, 1.0000, 0.2600,
         0.2273, 0.5600, 0.3881]),)

In [28]:
model.eval()

eval_list = []
with torch.no_grad():
    for data_input_test in eval_dataset:
        data_input_test = data_input_test[0].to(device)
        preds_val = model(data_input_test.float())
        preds_val = preds_val.cpu()
        eval_list.append(float(preds_val))

In [29]:
df_pred = pd.DataFrame(eval_list)

In [32]:
df_pred.to_csv("piatek_Bedkowski_Sulkowski.csv", index=False)