In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import re

In [None]:
WORD2VEC_PATH = "./word2vec/word2vec_100_3_polish.bin"
#nie ma tego pliku na gicie bo jest za duży i nie chciał przejść
TRAIN_DATA_PATH = "./sport-text-classification-ball-isi-public/train/train.tsv"
EMBEDDING_DIM = 100

In [24]:
df = pd.read_csv(TRAIN_DATA_PATH, sep="\t", header=None, names=["label", "text"])
texts = df["text"].astype(str).tolist()
labels = df["label"].astype(int).tolist()
# labels:
# 1 - piłka
# 0 - nie piłka


In [25]:
print(df["text"][0])
print(df["label"][0])

Mindaugas Budzinauskas wierzy w odbudowę formy Kevina Johnsona. Czy ktoś opuści Polpharmę? Mindaugas Budzinauskas w rozmowie z WP SportoweFakty opowiada o transferze Kevina Johnsona, ewentualnych odejściach z Polpharmy i kolejnym meczu PLK z Anwilem. - Potrzebowaliśmy takiego gracza, jak Johnson - podkreśla szkoleniowiec starogardzian.
1


In [26]:
word2vec = KeyedVectors.load(WORD2VEC_PATH)

In [27]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-ząćęłńóśźż ]", "", text)
    return text.split()

def document_vector(doc):
    words = preprocess(doc)
    vectors = [word2vec[word] for word in words if word in word2vec]
    if len(vectors) == 0:
        return np.zeros(EMBEDDING_DIM)
    return np.mean(vectors, axis=0)

In [28]:
X = np.array([document_vector(text) for text in tqdm(texts)])
y = np.array(labels)

100%|██████████| 98132/98132 [00:07<00:00, 12585.47it/s]


In [29]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)
epochs = 20
batch_size = 32
learning_rate = 0.0001
device = "cpu"

In [31]:
model = SimpleNN(EMBEDDING_DIM).to("cpu")
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)

for epoch in range(epochs):
    model.train()
    permutation = torch.randperm(X_train_tensor.size()[0])
    epoch_loss = 0
    for i in range(0, X_train_tensor.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train_tensor[indices], y_train_tensor[indices]

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")


Epoch 1/20, Loss: 790.2599
Epoch 2/20, Loss: 502.3510
Epoch 3/20, Loss: 472.4102
Epoch 4/20, Loss: 456.6169
Epoch 5/20, Loss: 443.7824
Epoch 6/20, Loss: 432.2923
Epoch 7/20, Loss: 421.3313
Epoch 8/20, Loss: 410.6806
Epoch 9/20, Loss: 400.7026
Epoch 10/20, Loss: 390.4054
Epoch 11/20, Loss: 381.5728
Epoch 12/20, Loss: 372.8536
Epoch 13/20, Loss: 364.8642
Epoch 14/20, Loss: 358.6896
Epoch 15/20, Loss: 352.4240
Epoch 16/20, Loss: 346.4127
Epoch 17/20, Loss: 340.4470
Epoch 18/20, Loss: 336.3832
Epoch 19/20, Loss: 331.3942
Epoch 20/20, Loss: 326.8809


In [32]:
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device)

    outputs = model(X_test_tensor)
    predicted = (outputs > 0.5).float()
    acc = accuracy_score(y_test_tensor.cpu(), predicted.cpu())
    print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.9446
