## Projekt 1

In [65]:
import pandas as pd
import numpy as np
import torch

In [66]:
data_df = pd.read_csv('data.csv')
data_df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### Prep the data

In [67]:
data_df.drop('instant', axis=1, inplace=True)
data_df.drop('dteday', axis=1, inplace=True)
data_df.drop('casual', axis=1, inplace=True)
data_df.drop('registered', axis=1, inplace=True)
data_df.columns

Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

In [68]:
data_df.dtypes

season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object

In [69]:
X = data_df.iloc[:, :-1]
y = data_df.iloc[:, -1]

In [70]:
print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (10886, 12)
y: (10886,)


### Normalize the data

In [71]:
from sklearn.preprocessing import MinMaxScaler

In [72]:
scaler = MinMaxScaler()
normalized_X = scaler.fit_transform(X)
normalized_X.shape

(10886, 12)

### Split the data

In [73]:
import torch.utils.data as data

In [74]:
dataset = data.TensorDataset(
    torch.from_numpy(normalized_X).float(),
    torch.from_numpy(y.to_numpy()).float()
)


In [75]:
train_dataset, val_dataset = data.random_split(dataset, [0.8, 0.2])


In [76]:
print(f'train: {len(train_dataset)}')
print(f'val_dataset: {len(val_dataset)}')

train: 8709
val_dataset: 2177


### Create dataloaders

In [77]:
train_dataloader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = data.DataLoader(val_dataset, batch_size=128)

### Create model class

In [78]:
import torch.nn as nn

In [79]:
class RegressionModel(nn.Module):
    
    def __init__(self, num_inputs, num_hidden1, num_outputs):
        super().__init__()
        self.linear1 = nn.Linear(num_inputs, num_hidden1)
        self.act_fn1 = nn.Tanh()
        self.linear2 = nn.Linear(num_hidden1, num_outputs)

    def forward(self, x):
        x = self.linear1(x)
        x = self.act_fn1(x)
        x = self.linear2(x)
        return x

In [80]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [81]:
model = RegressionModel(12, 20, 1)
model.to(device)

RegressionModel(
  (linear1): Linear(in_features=12, out_features=20, bias=True)
  (act_fn1): Tanh()
  (linear2): Linear(in_features=20, out_features=1, bias=True)
)

In [82]:
optimizer = torch.optim.Adam(model.parameters(), lr=.1)
loss_module = nn.MSELoss()

### Train the model

In [83]:
model.train()

for epoch in range(100):
    for data_inputs, data_labels in train_dataloader:
        data_inputs = data_inputs.to(device)
        data_labels = data_labels.to(device)

        preds = model(data_inputs)
        preds = preds.squeeze(dim=1)

        loss = loss_module(preds, data_labels.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch}, loss: {loss.item():.3}")

Epoch: 0, loss: 6.85e+04
Epoch: 0, loss: 6.11e+04
Epoch: 0, loss: 5.64e+04
Epoch: 0, loss: 5.72e+04
Epoch: 0, loss: 5.4e+04
Epoch: 0, loss: 7.68e+04
Epoch: 0, loss: 4.75e+04
Epoch: 0, loss: 7.08e+04
Epoch: 0, loss: 6.82e+04
Epoch: 0, loss: 4.89e+04
Epoch: 0, loss: 7.11e+04
Epoch: 0, loss: 5.73e+04
Epoch: 0, loss: 6.95e+04
Epoch: 0, loss: 6.32e+04
Epoch: 0, loss: 6.21e+04
Epoch: 0, loss: 5.39e+04
Epoch: 0, loss: 5.71e+04
Epoch: 0, loss: 6.28e+04
Epoch: 0, loss: 6.7e+04
Epoch: 0, loss: 5.2e+04
Epoch: 0, loss: 5.96e+04
Epoch: 0, loss: 7.21e+04
Epoch: 0, loss: 6.34e+04
Epoch: 0, loss: 5.44e+04
Epoch: 0, loss: 4.1e+04
Epoch: 0, loss: 4.21e+04
Epoch: 0, loss: 5.05e+04
Epoch: 0, loss: 5.69e+04
Epoch: 0, loss: 5.84e+04
Epoch: 0, loss: 6.16e+04
Epoch: 0, loss: 4.51e+04
Epoch: 0, loss: 4.37e+04
Epoch: 0, loss: 4.92e+04
Epoch: 0, loss: 4.52e+04
Epoch: 0, loss: 3.67e+04
Epoch: 0, loss: 4e+04
Epoch: 0, loss: 4.1e+04
Epoch: 0, loss: 4.5e+04
Epoch: 0, loss: 3.76e+04
Epoch: 0, loss: 3.96e+04
Epoch: 0,

### Evaluate set

In [92]:
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for data_inputs, data_labels in val_dataloader:
        data_inputs = data_inputs.to(device)
        data_labels = data_labels.to(device)
        
        preds = model(data_inputs)
        preds = preds.squeeze(dim=1)
        
        all_preds.append(preds.cpu())
        all_targets.append(data_labels.cpu())

all_preds = torch.cat(all_preds, dim=0).numpy()
all_targets = torch.cat(all_targets, dim=0).numpy()


In [93]:
def rmsle(y_true,y_pred):
    n = len(y_true)
    msle = np.mean([(np.log(max(y_pred[i],0) + 1) - np.log(y_true[i] + 1)) ** 2.0 for i in range(n)])
    return np.sqrt(msle)

In [94]:
rmsle_value = rmsle(all_targets, all_preds)
print("RMSLE:", rmsle_value)

RMSLE: 0.7024933223752428


In [95]:
torch.save(model.state_dict(), 'model_state.pth')

### Predictions on evaluation data

In [96]:
eval_df = pd.read_csv('evaluation_data.csv')
eval_df.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,2011-01-20,1,0,1,0,0,4,1,1,0.26,0.2273,0.56,0.3881
1,2011-01-20,1,0,1,1,0,4,1,1,0.26,0.2727,0.56,0.0
2,2011-01-20,1,0,1,2,0,4,1,1,0.26,0.2727,0.56,0.0
3,2011-01-20,1,0,1,3,0,4,1,1,0.26,0.2576,0.56,0.1642
4,2011-01-20,1,0,1,4,0,4,1,1,0.26,0.2576,0.56,0.1642


In [98]:
eval_df.drop('dteday', axis=1, inplace=True)
eval_df.shape

(6493, 12)

### Normalize eval data

In [99]:
normalized_eval_X = scaler.transform(eval_df)
normalized_eval_X

array([[0.        , 0.        , 0.        , ..., 0.23727486, 0.56      ,
        0.45621253],
       [0.        , 0.        , 0.        , ..., 0.28806354, 0.56      ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.28806354, 0.56      ,
        0.        ],
       ...,
       [0.        , 1.        , 1.        , ..., 0.27117127, 0.6       ,
        0.19301751],
       [0.        , 1.        , 1.        , ..., 0.28806354, 0.56      ,
        0.15786999],
       [0.        , 1.        , 1.        , ..., 0.28806354, 0.65      ,
        0.15786999]], shape=(6493, 12))

In [100]:
class InferenceDataset(data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample

In [101]:
eval_dataset = InferenceDataset(
    torch.from_numpy(normalized_eval_X).float()
)
eval_dataloader = data.DataLoader(eval_dataset, batch_size=128)

In [112]:
model.eval()
all_preds = []

with torch.no_grad():
    for data_inputs in eval_dataloader:
        data_inputs = data_inputs.to(device)

        preds = model(data_inputs)
        all_preds.append(preds)

all_preds = torch.cat(all_preds, dim=0)

In [118]:
all_preds = torch.clamp(all_preds, min=0)
preds_np = all_preds.cpu().detach().numpy()
preds_df = pd.DataFrame(preds_np, columns=['predictions'])
preds_df.to_csv('my_predictions.csv', index=False, header=False)
