# Assignment <span style="color:red">option Four</span> - News Categorization  using PyTorch 
Download the dataset from https://www.kaggle.com/uciml/news-aggregator-dataset and develop a news classification or categorization model. The dataset contain only titles of a news item and some metadata. The categories of the news items include one of: –<span  style="color:red"> b</span> : business – <span  style="color:red">t</span> : science and technology – <span  style="color:red">e</span> : entertainment and –<span  style="color:red">m</span> : health. 

1. Prepare training and test dataset: Split the data into training and test set (80% train and 20% test). Make sure they are balanced, otherwise if all <span  style="color:red">b</span> files are on training, your model fails to predict <span  style="color:red">t</span> files in test.
2. Binary classification: produce training data for each two categories, such as <span  style="color:red">b </span> and <span  style="color:red"> t</span>, <span  style="color:red">b</span> and <span  style="color:red"> m</span>, <span  style="color:red">e</span> and <span  style="color:red">t</span> and so on. Evaluate the performance and report which categories are easier for the models.
3. Adapt the Text Categorization PyTorch code (see above) and evaluate the performance of the system for these task
4. Use a pre-trained embeddings and compare your result. When you use pre-trained embeddings, you have to average the word embeddings of each tokens in ach document to get the unique representation of the document. DOC_EMBEDDING = (TOKEN1_EMBEDDING + ... + TOKENn_EMBEDDING). You can also use some of the <span  style="color:red">spacy/FLAIR </span>document embedding methods
6. Report the recall, precision, and F1 scores for both binary and multi-class classification.
 

1. Prepare training and test dataset: Split the data into training and test set (80% train and 20% test). Make sure they are balanced, otherwise if all <span  style="color:red">b</span> files are on training, your model fails to predict <span  style="color:red">t</span> files in test.

In [15]:
%pip install torch torchvision


Defaulting to user installation because normal site-packages is not writeable
Collecting torchvision
  Downloading torchvision-0.16.1-cp311-cp311-manylinux1_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torchvision
Successfully installed torchvision-0.16.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Daten einlesen
# data = pd.read_csv("./data/uci-news-aggregator.csv")
data = pd.read_csv("./uci-news-aggregator.csv")
# Daten in Features (X) und Labels (y) aufteilen
X = data[['TITLE', 'PUBLISHER', 'CATEGORY']]
X = data[['TITLE', 'CATEGORY']]
y = data['CATEGORY']

TEST_SIZE = 0.2

# Aufteilung in Trainings- und Testdaten. Der Parameter stratify führt dazu, dass das feature "Category" gleich aufgeteilt sind
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42, stratify=data['CATEGORY'])

print("Trainingsdaten: ", len(X_train))
print("Testdaten: ", len(X_test))

Trainingsdaten:  337935
Testdaten:  84484


2. Binary classification: produce training data for each two categories, such as <span  style="color:red">b </span> and <span  style="color:red"> t</span>, <span  style="color:red">b</span> and <span  style="color:red"> m</span>, <span  style="color:red">e</span> and <span  style="color:red">t</span> and so on. Evaluate the performance and report which categories are easier for the models.

–<span  style="color:red"> b</span> : business – <span  style="color:red">t</span> : science and technology – <span  style="color:red">e</span> : entertainment and –<span  style="color:red">m</span> : health

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split

# Daten einlesen
data = pd.read_csv("./uci-news-aggregator.csv")

# Daten vorbereiten
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(data['TITLE'])

# Aufteilung in Trainings- und Testdaten
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, data['CATEGORY'], test_size=0.2, random_state=42, stratify=data['CATEGORY'])

# PyTorch Dataset und Dataloader erstellen
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.toarray(), dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = MyDataset(X_train_text, y_train)
test_dataset = MyDataset(X_test_text, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Modell erstellen
class SimpleClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(SimpleClassifier, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.fc(x)

model = SimpleClassifier(input_size=X_train_text.shape[1], output_size=len(data['CATEGORY'].unique()))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Modell trainieren
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Modell auswerten
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on test data: {accuracy}')


: 

: 

In [10]:
# from collections import Counter

# vocab = Counter()
# # Indexing words from the training data
# for text in X_train:
#     for word in text.split(" "):
#         vocab[word.lower()] += 1

# # Indexing words from the test data
# for text in X_test:
#     for word in text.split(" "):
#         vocab[word.lower()] += 1

# total_words = len(vocab)
# print(total_words)

6


In [11]:
from itertools import combinations
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter

In [12]:
# Parameters
LEARNING_RATE = 0.01
NUM_EPOCHS = 10
BATCH_SIZE = 150
DISPLAY_STEP = 1

# Network Parameters
HIDDEN_SIZE = 100  # 1st layer and 2nd layer number of features
INPUT_SIZE = total_words  # Words in vocab
NUM_CLASSES = 2  # b, t, e, m (only 2 of them each)

NUM_EPOCHS = 10

# Liste der Kategorien
categories = ['b', 't', 'e', 'm']

# Alle möglichen Paarkombinationen
pairs = list(combinations(categories, 2))

In [13]:
class News_Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(News_Net, self).__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

class Dataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [14]:
model = News_Net(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, num_classes=NUM_CLASSES)
criterion = nn.CrossEntropyLoss()  # Verlustfunktion für Klassifikationsprobleme
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam-Optimierer

loader = DataLoader(list(zip(X_train, y_train)), shuffle=True, batch_size=16)
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=64, shuffle=True)
test_loader = DataLoader(list(zip(X_test, y_test)), batch_size=64, shuffle=False)

# # Trainings- und Test-Datasets erstellen
# train_dataset = Dataset(X_train, y_train)
# test_dataset = Dataset(X_test, y_test)

# # DataLoader erstellen
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

model.eval()

correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total


TypeError: linear(): argument 'input' (position 1) must be Tensor, not tuple