In [49]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.utils.data as data
import torch.nn.functional as F
import torch.optim as optim

import pickle

In [50]:
# Prefer the GPU when CUDA is actually usable; otherwise fall back to the CPU
# so the notebook still runs end-to-end on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [51]:
try:
    # Presumably only importable inside Google Colab; any failure here (import
    # or mount) is caught below and triggers the local fallback.
    from google.colab import drive
    drive.mount('/content/drive')

    file_path, test_path = (
        '/content/drive/My Drive/ssne/train.pkl',
        '/content/drive/My Drive/ssne/test_no_target.pkl',
    )
    # Nothing has been read yet: the message only reports that the mount worked.
    print('Successfully loaded from Google Drive')

except Exception:
    # Fall back to relative paths (resolved against the working directory).
    file_path, test_path = 'train.pkl', 'test_no_target.pkl'
    print('Failed to load from Google Drive')

Failed to load from Google Drive


In [52]:
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only open pickles that come from a trusted source.

# Deserialize the training samples from disk.
with open(file_path, mode='rb') as train_file:
    train_data_raw = pickle.load(train_file)

# Deserialize the test samples (the file name suggests they carry no targets).
with open(test_path, mode='rb') as test_file:
    test_data = pickle.load(test_file)

# Prepare the data

In [53]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

def pad_collate(batch, pad_value=0):
    """Collate ``(sequence, target)`` pairs into one padded batch.

    Args:
        batch: iterable of ``(x, y)`` tensor pairs. The ``x`` tensors may differ
            in length along their first dimension; all ``y`` tensors must share
            one shape so they can be stacked.
        pad_value: value written into the padded positions of the inputs.

    Returns:
        A 4-tuple ``(padded_inputs, stacked_targets, input_lengths,
        target_lengths)``: the inputs padded to the longest sequence with the
        batch dimension first, the targets stacked along a new leading
        dimension, and two plain lists holding ``len()`` of every input and
        every target.
    """
    sequences, targets = zip(*batch)

    # Lengths are taken before padding so the true sequence sizes are kept.
    input_lengths = list(map(len, sequences))
    target_lengths = list(map(len, targets))

    padded_inputs = pad_sequence(sequences, batch_first=True, padding_value=pad_value)
    stacked_targets = torch.stack(targets)

    return padded_inputs, stacked_targets, input_lengths, target_lengths

In [54]:
batch_size = 64
split = 0.8      # fraction of the loaded training data kept for training
split_seed = 42  # fixes the train/validation partition across kernel restarts

train_set_size = int(split * len(train_data_raw))
# The remainder goes to validation, so the two sizes always sum to the total.
val_set_size = len(train_data_raw) - train_set_size

# A dedicated, seeded generator makes the split reproducible without touching
# the global RNG state used elsewhere (e.g. weight init, sampling).
train_set_raw, val_set_raw = data.random_split(
    train_data_raw,
    [train_set_size, val_set_size],
    generator=torch.Generator().manual_seed(split_seed),
)

In [55]:
# Number of target classes; used as the width of the one-hot encoded labels.
class_count = 5

## Train data

In [56]:
# Convert every raw training sample to tensors: the sequence becomes a tensor
# and the label a one-hot vector of width `class_count`.
train_xx, train_yy = [], []

for sequence, label in train_set_raw:
    train_xx.append(torch.tensor(sequence))
    train_yy.append(F.one_hot(torch.tensor(label), class_count))

# Pair each input with its target so the list can be handed to a DataLoader.
train_set = list(zip(train_xx, train_yy))

## Validation data

In [57]:
# Same conversion as for the training split: sequence -> tensor,
# label -> one-hot vector of width `class_count`.
val_xx, val_yy = [], []

for sequence, label in val_set_raw:
    val_xx.append(torch.tensor(sequence))
    val_yy.append(F.one_hot(torch.tensor(label), class_count))

# Pair each input with its target so the list can be handed to a DataLoader.
val_set = list(zip(val_xx, val_yy))

## Test data

In [58]:
from torch.utils.data import DataLoader, Dataset

class TestDataset(Dataset):
    """Thin ``Dataset`` adapter around an indexable collection of samples."""

    def __init__(self, data):
        """Store a reference to ``data``; samples are not copied or converted."""
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx`` unchanged."""
        sample = self.data[idx]
        return sample

In [59]:
def pad_collate_for_test(batch, pad_value=0):
    """Collate unlabelled sequences into one padded batch.

    Mirrors ``pad_collate`` for data without targets.

    Args:
        batch: iterable of sequences (lists, arrays or tensors) that may differ
            in length.
        pad_value: value written into the padded positions; defaults to 0,
            matching the default used by ``pad_collate``.

    Returns:
        ``(padded_sequences, lengths)``: the sequences padded to the longest one
        with the batch dimension first, and a 1-D tensor holding the original
        length of every sequence.
    """
    # as_tensor accepts inputs that are already tensors without the
    # copy-construct warning torch.tensor() would emit for them.
    sequences = [torch.as_tensor(seq) for seq in batch]
    # Pad every sequence up to the longest one in the batch.
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=pad_value)
    # Original (unpadded) sequence lengths.
    lengths = torch.tensor([len(seq) for seq in sequences])
    return padded_sequences, lengths

# Take into account the imbalance of classes

In [60]:
# Class labels are the integers 0 .. class_count - 1.
classes = [*range(class_count)]

# Per-class sample counter, starting at zero for every label.
class_occurence_count = dict.fromkeys(classes, 0)

def one_hot_decode(encoding):
    """Return the index of the largest entry of ``encoding`` as a Python int.

    The argmax is taken over the flattened tensor, so for a one-hot vector this
    is the encoded class label.
    """
    hot_index = torch.argmax(encoding)
    return hot_index.item()

# Tally how many training samples fall into each class.
for encoded_label in train_yy:
    class_occurence_count[one_hot_decode(encoded_label)] += 1

print(class_occurence_count)

{0: 1309, 1: 381, 2: 122, 3: 344, 4: 195}


In [61]:
# Hand-tuned per-class sampling weights. The inverse-frequency variant is kept
# below for reference:
# weights = {cls : 1.0 / class_occurence_count[cls] for cls in class_occurence_count.keys()}
weights = {0: 75, 1: 150, 2: 275, 3: 245, 4: 275}
print(weights)

# One sampling weight per training example, looked up from its decoded label.
y_weights = torch.tensor([weights[one_hot_decode(target)] for target in train_yy])

# Each epoch draws as many weighted samples as there are training examples.
sampler = data.WeightedRandomSampler(weights=y_weights, num_samples=len(y_weights))

# Training batches are drawn through the weighted sampler.
train_dloader = data.DataLoader(
    dataset=train_set,
    batch_size=batch_size,
    sampler=sampler,
    collate_fn=pad_collate,
    num_workers=2,
)

# Validation batches use the loader's default ordering (no sampler).
val_dloader = data.DataLoader(
    dataset=val_set,
    batch_size=batch_size,
    collate_fn=pad_collate,
    num_workers=2,
)

# Test samples have no targets, hence the dedicated dataset and collate function.
test_dataset = TestDataset(test_data)

test_dloader = data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    collate_fn=pad_collate_for_test,
    num_workers=2,
)

{0: 75, 1: 150, 2: 275, 3: 245, 4: 275}


# Model

In [62]:
class LstmClassifier(nn.Module):
    """LSTM sequence classifier: an LSTM followed by a two-layer MLP head.

    Args:
        input_size: number of features per time step (default 1).
        hidden_size: size of the LSTM hidden state (default 64).
        num_layers: number of stacked LSTM layers (default 1).
        num_classes: number of output logits (default 5).

    The defaults reproduce the original hard-coded architecture.
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=1, num_classes=5):
        super().__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers
        )

        self.linear = nn.Sequential(
            nn.Linear(self.hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes),
        )

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(hidden, state)`` tensors for one batch.

        Both tensors have shape ``(num_layers, batch_size, hidden_size)`` and
        are allocated on the same device as the model parameters, so callers
        do not have to move them (an extra ``.to(device)`` is a no-op).
        """
        device = next(self.parameters()).device
        shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=device)
        state = torch.zeros(shape, device=device)
        return hidden, state

    def forward(self, x, hidden, lengths=None):
        """Classify a batch of (optionally padded) sequences.

        Args:
            x: input of shape ``(batch, seq_len, input_size)``.
            hidden: ``(h_0, c_0)`` tuple as returned by ``init_hidden``.
            lengths: optional true (unpadded) length of every sequence, as a
                list or 1-D tensor. When given, the classifier head reads the
                hidden state at each sequence's last *real* step instead of
                the last padded step. When omitted, the output at the final
                time step of the padded batch is used (original behaviour).

        Returns:
            ``(logits, (h_n, c_n))`` where ``logits`` has shape
            ``(batch, num_classes)``.
        """
        # The LSTM is not batch_first, so move time to the leading dimension.
        x = torch.transpose(x, 0, 1)

        if lengths is None:
            all_outputs, hidden = self.lstm(x, hidden)
            out = all_outputs[-1]
        else:
            # pack_padded_sequence requires the lengths on the CPU.
            lengths_cpu = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                x, lengths_cpu, enforce_sorted=False
            )
            _, hidden = self.lstm(packed, hidden)
            # h_n of the top layer is the output at each sequence's last real step.
            out = hidden[0][-1]

        x = self.linear(out)
        return x, hidden

In [63]:
# Build the network and move it to the selected device *before* creating the
# optimizer, so the optimizer tracks the parameters that are actually trained.
model = LstmClassifier()
model = model.to(device)

loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Training

In [64]:
num_epochs = 1500

# Training mode is set once up front; nothing inside the loop changes it.
model.train()

for epoch in range(num_epochs):

    # Sum (not mean) of the batch losses seen during this epoch.
    loss_sum = 0.0
    for x, y, _x_len, _y_len in train_dloader:

        # Add a trailing feature dimension of size 1 and cast to float for the
        # LSTM; the one-hot targets are cast to float before reaching loss_fun.
        x = x.to(device).unsqueeze(2).float()
        y = y.to(device).float()

        # Fresh zero state for every batch, sized to the actual batch.
        hidden, state = model.init_hidden(x.size(0))
        hidden, state = hidden.to(device), state.to(device)

        # Call the module itself (not .forward) so registered hooks run.
        preds, _ = model(x, (hidden, state))

        optimizer.zero_grad()
        loss = loss_fun(preds, y)
        loss_sum += loss.item()
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss: {loss_sum:.7f}")

Epoch: 0, loss: 57.6282749
Epoch: 10, loss: 56.5606825
Epoch: 20, loss: 56.7194697
Epoch: 30, loss: 56.4044673
Epoch: 40, loss: 56.1349300
Epoch: 50, loss: 56.7503422
Epoch: 60, loss: 56.0696146
Epoch: 70, loss: 56.1365284
Epoch: 80, loss: 56.4641567
Epoch: 90, loss: 55.9011843
Epoch: 100, loss: 55.3892013
Epoch: 110, loss: 54.5438774
Epoch: 120, loss: 55.6352043
Epoch: 130, loss: 54.8349533
Epoch: 140, loss: 54.3642347
Epoch: 150, loss: 54.9541868
Epoch: 160, loss: 54.7391346
Epoch: 170, loss: 54.4955227
Epoch: 180, loss: 54.5583125
Epoch: 190, loss: 53.9115940
Epoch: 200, loss: 54.1150939
Epoch: 210, loss: 54.8716518
Epoch: 220, loss: 54.2924843
Epoch: 230, loss: 54.0169945
Epoch: 240, loss: 55.1207857
Epoch: 250, loss: 53.6280262
Epoch: 260, loss: 52.7944185
Epoch: 270, loss: 53.2416140
Epoch: 280, loss: 53.1874624
Epoch: 290, loss: 52.2048815
Epoch: 300, loss: 53.1951106
Epoch: 310, loss: 52.3710210
Epoch: 320, loss: 52.4750799
Epoch: 330, loss: 51.5421333
Epoch: 340, loss: 52.7118

# Validate

In [67]:
# Per true class: number of correctly classified samples and number of samples
# seen. Their ratio is the fraction of each class that was recognised.
correct_preds = {cls: 0 for cls in classes}
all_preds = {cls: 0 for cls in classes}

model.eval()

# Gradients are not needed for evaluation; skipping them saves memory and time.
with torch.no_grad():
    for x, y, _x_len, _y_len in val_dloader:

        x = x.to(device).unsqueeze(2).float()
        y = y.to(device)

        hidden, state = model.init_hidden(x.size(0))
        hidden, state = hidden.to(device), state.to(device)

        preds, _ = model(x, (hidden, state))

        # Convert to plain Python ints once so they can index the dicts.
        predicted_classes = torch.argmax(preds, dim=1).tolist()
        actual_classes = torch.argmax(y, dim=1).tolist()

        for predicted, actual in zip(predicted_classes, actual_classes):
            all_preds[actual] += 1

            if predicted == actual:
                correct_preds[actual] += 1

for cls in classes:
    # A class can be missing from the validation split; avoid dividing by zero.
    if all_preds[cls] == 0:
        print(f'Class {cls} accuracy: n/a (no validation samples)')
        continue
    accuracy = correct_preds[cls] / all_preds[cls]
    print(f'Class {cls} accuracy: {accuracy:.5f}')

total_seen = sum(all_preds.values())
total_accuracy = sum(correct_preds.values()) / total_seen if total_seen else float('nan')
print(f'Total accuracy: {total_accuracy:.5f}')

Class 0 accuracy: 0.87539
Class 1 accuracy: 0.52577
Class 2 accuracy: 0.25000
Class 3 accuracy: 0.69072
Class 4 accuracy: 0.78049
Total accuracy: 0.74660


Dla wag wyliczanych wzorem "1 / liczba wystąpień klasy" wyniki były niezadowalające, accuracy oscylowało w okolicach 55%. Dlatego postanowiliśmy poeksperymentować z ręcznie ustawianymi wagami klas. Staraliśmy się dobierać je w oparciu o accuracy dla poszczególnych klas. Poniżej kilka ostatnich rezultatów:

Batch size: 64, liczba epok: 1500

 - dla wag: {0: 70, 1: 130, 2: 270, 3: 230, 4: 260} total accuracy: 0.71599
 - dla wag: {0: 70, 1: 140, 2: 280, 3: 250, 4: 270} total accuracy: 0.71769
 - dla wag: {0: 90, 1: 150, 2: 275, 3: 245, 4: 275} total accuracy: 0.73200
 - dla wag: {0: 75, 1: 150, 2: 275, 3: 245, 4: 275} total accuracy: 0.74660

In [70]:
print(len(test_dloader))

model.eval()

# Predictions are collected per batch and concatenated once at the end instead
# of re-allocating the full array on every iteration.
batch_preds = []

# Inference only: no gradients needed.
with torch.no_grad():
    for features, _ in test_dloader:
        features = features.to(device).unsqueeze(2).float()

        hidden, state = model.init_hidden(features.size(0))
        hidden, state = hidden.to(device), state.to(device)

        preds, _ = model(features, (hidden, state))

        preds_np = torch.argmax(preds, dim=1).cpu().numpy()
        print(preds_np)
        batch_preds.append(preds_np)

if batch_preds:
    combined_preds = np.concatenate(batch_preds).astype(int)
else:
    # Empty loader: keep an integer array so the '%d' format below stays valid.
    combined_preds = np.empty((0,), dtype=int)

# One predicted class index per line.
np.savetxt('piatek_Kubiszyn_Sobiech.csv', combined_preds, delimiter=',', fmt='%d')

18
[2 2 0 0 0 0 3 0 0 2 0 0 0 1 0 0 0 0 3 0 0 1 0 0 1 0 0 0 1 0 0 4 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 3 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 0 3 0 1 3 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 3 0 0 0 0 0 0 0 0 0 0 0 1 0 3 0 3 0 2 0 0 3 0 0 3 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 3 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 3 3 0 0 0
 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 3 4 0 0 0 0 0 0 0]
[0 0 1 0 0 0 3 1 0 3 0 0 0 0 0 0 0 0 1 0 0 0 3 1 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 4 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 3 0 0 0 0 0 0 0 3 0 0 3 0 3 3 0 0 0 0 0 0 0
 0 0 

In [69]:
# Serializes the whole module object (not only its weights) to 'model.pth';
# loading it later requires the LstmClassifier class to be importable.
# NOTE(review): saving model.state_dict() is usually the more portable option —
# confirm what consumers of this file expect before switching.
torch.save(model, 'model.pth')