### Imports

In [1]:
import sys
import pandas as pd
import numpy as np
import gc

import scipy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

sys.path.append("scripts")
from scripts.review_dataloader import *

### GPU acceleration

In [2]:
if torch.cuda.is_available():
    print(f"Using CUDA version {torch.version.cuda}")
elif torch.backends.mps.is_available():
    print("Using MPS backend")
else:
    print("No backend detected, using CPU")

Using CUDA version 12.1


### Load dataset

In [None]:
# review_dl = SteamReviewDataset("../data/reviews_100k.csv.bz2", shuffle=True)

reviews_df = pd.read_csv("../data/reviews_100k.csv.bz2", low_memory=False)
reviews_df["review"] = reviews_df["review"].astype(str)
reviews_df 

# optionally shuffle (games are in order!)
reviews_df = reviews_df.sample(frac=1).reset_index(drop=True)

# fewer data for testing
# 100k full corpus takes 5h+
# 10k takes 3 min
# 1k takes 1s
reviews_df = reviews_df[:20_000] 

print(f"loaded {len(reviews_df):,} reviews")





loaded 20000 reviews


In [4]:
# Daten aufsplitten
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(reviews_df["review"], reviews_df["voted_up"], test_size=.33, random_state=42)   

# Kontrolle
print(f"y_train true/false ratio is {len(y_train_raw[y_train_raw == True])/len(y_train_raw):.2f}")
print(f"y_test true/false ratio is {len(y_test_raw[y_test_raw == True])/len(y_test_raw):.2f}")      


y_train true/false ratio is 0.68
y_test true/false ratio is 0.69


### Build feature representation

In [5]:
tfid_vectorizer = TfidfVectorizer(ngram_range=(1,3))

# ca. 45s
x_train_tfidf = tfid_vectorizer.fit_transform(x_train_raw) # document term matrix
x_test_tfidf = tfid_vectorizer.transform(x_test_raw)

print(tfid_vectorizer.get_feature_names_out()[:10]) # Vorschau

['aah' 'aah man' 'aah man that' 'aah what' 'aah what lovely' 'ab' 'ab rb'
 'ab rb or' 'ab system' 'ab system this']


In [6]:
# How the data looks
# can't load whole dataset als dense matrix (1.5 TB!)
test: np.matrix = scipy.sparse.csr_matrix.todense(x_train_tfidf[2]) # from scipy.sparse.csr_matrix.todense(x_train)
print(type(test))
print(test.shape) # (1, 3317704)
np.count_nonzero(test)

<class 'numpy.matrix'>
(1, 972118)


281

### Configure PyTorch dataloader

In [7]:
class MinimalDataLoader(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = np.array(y_train_raw)
        print(self.reviews.shape)
    
    def __len__(self):
        return self.reviews.shape[0]
    
    def __getitem__(self, idx):
        review = torch.tensor(scipy.sparse.csr_matrix.todense(self.reviews[idx]), dtype=torch.float32, device=torch.device('cuda:0')).squeeze(0)
        label = torch.tensor(self.labels[idx], dtype=torch.float32, device=torch.device('cuda:0'))
        return review, label   

In [8]:
train_data_provider = MinimalDataLoader(x_train_tfidf, y_train_raw)
print(len(train_data_provider))

(13400, 972118)
13400


In [9]:
train_data_provider.__getitem__(0)

(tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0'),
 tensor(1., device='cuda:0'))

### Defining the model

In [10]:
class BinaryClassifier(nn.Module): # could also do this with nn.Sequential()
    def __init__(self, input_dim): # input shape depends on tfidf matrix shape
        super(BinaryClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid() 
        )

    def forward(self, x):
        return self.model(x)

### Training

In [11]:

def train_model(train_loader, input_dim, num_epochs=10, lr=0.001, device="cuda:0"):
    model = BinaryClassifier(input_dim=input_dim).to(device)
    
    # Loss function
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # iterating
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        for batch_X, batch_y in progress_bar:
            optimizer.zero_grad() # reset gradients
            outputs = model(batch_X).squeeze() # fit weights
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs} completed. Average Loss: {total_loss / len(train_loader):.4f}")
        
    return model

In [12]:
batch_size = 32
input_dim = x_train_tfidf.shape[1]

# Create dataset and DataLoader
train_loader = DataLoader(train_data_provider, batch_size=batch_size, shuffle=True)

torch.cuda.empty_cache()
gc.collect()

# Train the model
trained_model = train_model(train_loader, input_dim=input_dim, num_epochs=5, lr=0.001)

Epoch 1/5: 100%|██████████| 419/419 [01:27<00:00,  4.78batch/s]


Epoch 1/5 completed. Average Loss: 0.3627


Epoch 2/5: 100%|██████████| 419/419 [01:26<00:00,  4.86batch/s]


Epoch 2/5 completed. Average Loss: 0.0320


Epoch 3/5: 100%|██████████| 419/419 [01:34<00:00,  4.45batch/s]


Epoch 3/5 completed. Average Loss: 0.0033


Epoch 4/5: 100%|██████████| 419/419 [01:36<00:00,  4.32batch/s]


Epoch 4/5 completed. Average Loss: 0.0017


Epoch 5/5: 100%|██████████| 419/419 [01:26<00:00,  4.87batch/s]

Epoch 5/5 completed. Average Loss: 0.0011





### Evaluate model

In [13]:
# generated by gpt
def evaluate_model(model, test_loader, device="cuda:0"):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            # Move data to device (GPU or CPU)
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            
            # Get model predictions
            outputs = model(batch_X).squeeze()
            predictions = (outputs >= 0.5).float()  # Apply a threshold of 0.5
            
            all_preds.append(predictions.cpu().numpy())  # Collect predictions
            all_targets.append(batch_y.cpu().numpy())   # Collect true labels

    # Flatten the lists to evaluate metrics
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    # Calculate metrics
    accuracy = accuracy_score(all_targets, all_preds)
    precision = precision_score(all_targets, all_preds)
    recall = recall_score(all_targets, all_preds)
    f1 = f1_score(all_targets, all_preds)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


In [None]:
# Prepare the test dataset and DataLoader
test_dataset = MinimalDataLoader(x_test_tfidf, y_test_raw)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Evaluate the model
evaluate_model(trained_model, test_loader)


# Current Highscores

# 10k samples batch 32
# Accuracy: 0.5545
# Precision: 0.6795
# Recall: 0.6615
# F1 Score: 0.6704

# 20k samples batch 32
# Accuracy: 0.5676
# Precision: 0.6948
# Recall: 0.6671
# F1 Score: 0.6807

(6600, 972118)
Accuracy: 0.5676
Precision: 0.6948
Recall: 0.6671
F1 Score: 0.6807
