### Imports

In [21]:
import sys
import pandas as pd
import numpy as np
import scipy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


import torch
import torch.nn as nn

sys.path.append("scripts")
from scripts.review_dataloader import *

In [50]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


### Load dataset

In [2]:
# review_dl = SteamReviewDataset("../data/reviews_100k.csv.bz2", shuffle=True)

reviews_df = pd.read_csv("../data/reviews_100k.csv.bz2", low_memory=False)
reviews_df["review"] = reviews_df["review"].astype(str)

# optionally shuffle (games are in order!)
reviews_df = reviews_df.sample(frac=1).reset_index(drop=True)

# Daten aufsplitten
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(reviews_df["review"], reviews_df["voted_up"], test_size=.33, random_state=42)   

# Kontrolle
print(f"y_train true/false ratio is {len(y_train_raw[y_train_raw == True])/len(y_train_raw):.2f}")
print(f"y_test true/false ratio is {len(y_test_raw[y_test_raw == True])/len(y_test_raw):.2f}")      
# # Example
# print(review_dl.__getitem__(0))
# print(review_dl.__getitems__([1,2,3]))




y_train true/false ratio is 0.68
y_test true/false ratio is 0.68


### Build feature representation

In [3]:
tfid_vectorizer = TfidfVectorizer(ngram_range=(1,3))

# ca. 45s
x_train_tfidf = tfid_vectorizer.fit_transform(x_train_raw) # document term matrix
x_test_tfidf = tfid_vectorizer.transform(x_test_raw)

print(tfid_vectorizer.get_feature_names_out()[:10]) # Vorschau

['aah' 'aah hey' 'aah hey little' 'aah man' 'aah man that' 'ab' 'ab able'
 'ab able to' 'ab helped' 'ab helped could']


In [46]:
# How the data looks
# can't load whole dataset als dense matrix (1.5 TB!)
test: np.matrix = scipy.sparse.csr_matrix.todense(x_train_tfidf[2]) # from scipy.sparse.csr_matrix.todense(x_train)
print(type(test))
print(test.shape) # (1, 3317704)
np.count_nonzero(test)

<class 'numpy.matrix'>
(1, 3317704)


152

### Configure PyTorch dataloader

In [48]:
class MinimalDataLoader(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        review = scipy.sparse.csr_matrix.todense(self.reviews[idx])
        label = int(self.labels[idx])
        return review, label   

In [49]:
train_data_provider = MinimalDataLoader(x_train_tfidf, y_train_raw)

### Defining the model

In [52]:
model = nn.Sequential(
    nn.Linear(train_data_provider.__getitem__(0)[0].shape[1], 1024),
    nn.Dropout(0.1),
    nn.Linear(1024, 512),
    nn.Dropout(0.1),                                     
    nn.Linear(512, 2),
    nn.Sigmoid() # 0 .. 1
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

### Training

In [None]:
num_epochs = 1

data_it = iter(train_data_provider)

for epoch in range(num_epochs):
    data, label = next(data_it)
    data = torch.tensor(data).float()
    label = torch.tensor(label)
    
    optimizer.zero_grad()
    output = model.forward(data)
    loss = criterion(output, label)
    loss.backward()
    optimizer.step()
    
    # WIP

ValueError: Expected input batch_size (1) to match target batch_size (0).