In [1]:
# FastText-Style Text Classification with PyTorch

# ## 1. Import Required Libraries
import re
from collections import Counter
from itertools import chain

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


In [2]:

# ## 2. Dataset Preparation
class FastTextDataset(Dataset):
    """
    Dataset of character n-gram features for FastText-style classification.

    Every whitespace-separated token is wrapped as ``<token>`` and expanded
    into all character n-grams whose length lies in ``ngram_range``. Samples
    are kept as lists of n-gram strings; ``collate_fn`` converts a batch of
    them into a padded tensor of vocabulary indices.
    """

    PAD_TOKEN = "<pad>"  # vocabulary key reserved for padding
    PAD_INDEX = 0  # index of PAD_TOKEN; unknown n-grams also map here

    def __init__(self, texts, labels, vocab=None, ngram_range=(3, 6)):
        """
        Initialize the dataset with texts, labels, and optional vocabulary.

        Args:
            texts: Sized, re-iterable collection of strings (it is iterated
                more than once and passed to ``len``).
            labels: Iterable of hashable labels, aligned with ``texts``.
            vocab: Optional ``(vocab, label_map)`` pair to reuse, e.g. the
                mappings built from a training set. When ``None`` both
                mappings are built from ``texts`` and ``labels``.
            ngram_range: ``(min_n, max_n)`` n-gram lengths, both inclusive.

        Raises:
            KeyError: If a label is not present in ``label_map``.
        """
        self.texts = texts
        self.labels = labels
        self.ngram_range = ngram_range

        if vocab is None:
            self.vocab, self.label_map = self.build_vocab_and_labels(texts, labels)
        else:
            self.vocab, self.label_map = vocab

        self.encoded_texts = [self.text_to_ngrams(text) for text in texts]
        self.encoded_labels = [self.label_map[label] for label in labels]

    def build_vocab_and_labels(self, texts, labels):
        """
        Create vocab of n-grams and map labels to indices.

        Indices are assigned in first-occurrence order, so the mappings are
        identical on every run (iterating a ``set`` of strings is not stable
        across interpreter sessions).

        The padding token is registered first at index 0. An n-gram that is
        literally ``"<pad>"`` (produced by the word "pad") therefore shares
        index 0 instead of overwriting an already assigned slot; this keeps
        the indices contiguous in ``0 .. len(vocab) - 1`` so that
        ``len(vocab)`` is a valid embedding-table size.

        Returns:
            tuple: ``(vocab, label_map)`` where ``vocab`` maps n-gram -> index
            and ``label_map`` maps label -> class index.
        """
        vocab = {self.PAD_TOKEN: self.PAD_INDEX}
        for text in texts:
            for ngram in self.text_to_ngrams(text):
                if ngram not in vocab:
                    vocab[ngram] = len(vocab)

        label_map = {}
        for label in labels:
            if label not in label_map:
                label_map[label] = len(label_map)
        return vocab, label_map

    def text_to_ngrams(self, text):
        """
        Tokenize text into n-grams.

        The text is split on whitespace; each token is wrapped in ``<`` and
        ``>`` and every substring of length ``ngram_range[0]`` through
        ``ngram_range[1]`` (inclusive) is emitted. Lengths longer than the
        wrapped token contribute nothing.

        Returns:
            list[str]: N-grams in token order, then by increasing length,
            then by position. Empty for empty or whitespace-only text.
        """
        ngrams = []
        for token in text.split():
            wrapped = f"<{token}>"
            for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
                ngrams.extend(wrapped[i : i + n] for i in range(len(wrapped) - n + 1))
        return ngrams

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Get a single data sample.

        Returns:
            tuple: ``(ngrams, label_index)`` with ``ngrams`` a list of n-gram
            strings and ``label_index`` the integer class index.
        """
        text = self.encoded_texts[idx]
        label = self.encoded_labels[idx]
        return text, label

    def collate_fn(self, batch):
        """
        Collate function for padding and batching.

        N-grams are converted to indices (unknown n-grams map to 0) and every
        row is right-padded with the padding index to the longest row in the
        batch. Rows are at least one element wide so that a batch made only
        of empty texts still yields a valid integer tensor rather than an
        empty float tensor that an embedding layer would reject.

        Args:
            batch: List of ``(ngrams, label_index)`` samples.

        Returns:
            tuple: ``(text_indices, labels)``; ``text_indices`` is a long
            tensor of shape ``(batch_size, max_length)`` and ``labels`` a
            long tensor of shape ``(batch_size,)``.
        """
        texts, labels = zip(*batch)
        max_length = max(1, max(len(text) for text in texts))
        pad_index = self.vocab.get(self.PAD_TOKEN, self.PAD_INDEX)

        rows = []
        for text in texts:
            row = [self.vocab.get(ngram, self.PAD_INDEX) for ngram in text]
            row.extend([pad_index] * (max_length - len(row)))
            rows.append(row)

        text_indices = torch.tensor(rows, dtype=torch.long)
        labels = torch.tensor(labels, dtype=torch.long)
        return text_indices, labels


In [3]:

# ## 3. FastText Model
class FastTextClassifier(nn.Module):
    """
    FastText-style classifier: average the n-gram embeddings, then apply a
    single linear layer.

    Index 0 is the padding index; its embedding row is kept at zero by
    ``nn.Embedding(padding_idx=0)``.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        """
        Args:
            vocab_size: Number of rows in the embedding table (including the
                padding row at index 0).
            embed_dim: Size of each embedding vector.
            num_classes: Number of output classes.
        """
        super(FastTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x: Integer tensor of n-gram indices, shape (batch_size, seq_length),
                right-padded with the padding index.

        Returns:
            Tensor of unnormalized class scores, shape (batch_size, num_classes).
        """
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embed_dim)

        # Average over the real (non-padding) positions only. Padding rows
        # embed to zero, so they do not affect the sum; dividing by the padded
        # length instead would make a document's vector depend on the longest
        # document that happens to share its batch.
        non_pad = (x != self.embedding.padding_idx).sum(dim=1, keepdim=True)
        # clamp avoids 0/0 (NaN) for rows that contain no real n-gram.
        doc_vector = embedded.sum(dim=1) / non_pad.clamp(min=1).to(embedded.dtype)

        output = self.fc(doc_vector)  # Shape: (batch_size, num_classes)
        return output


In [10]:

# ## 4. Training the Model
# Sample data: six short reviews with alternating sentiment labels.
texts = [
    "I loved the movie",
    "The plot was dull",
    "Amazing direction and acting",
    "Waste of my time",
    "Outstanding performances",
    "Awful performance"
]
labels = ["positive", "negative", "positive", "negative", "positive", "negative"]

# Prepare dataset and DataLoader
dataset = FastTextDataset(texts, labels)
dataloader = DataLoader(
    dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn
)

# Model parameters
vocab_size = len(dataset.vocab)
embed_dim = 100
num_classes = len(dataset.label_map)

# Initialize model, loss function, and optimizer
model = FastTextClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0  # sum of the per-batch losses for this epoch
    for batch in dataloader:
        # Use distinct names so the raw `texts` / `labels` lists defined
        # above are not overwritten by batch tensors.
        batch_texts, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")


Epoch 1/20, Loss: 2.1027
Epoch 2/20, Loss: 2.0600
Epoch 3/20, Loss: 2.0535
Epoch 4/20, Loss: 2.0179
Epoch 5/20, Loss: 1.9871
Epoch 6/20, Loss: 1.9602
Epoch 7/20, Loss: 1.9044
Epoch 8/20, Loss: 1.9049
Epoch 9/20, Loss: 1.8511
Epoch 10/20, Loss: 1.8323
Epoch 11/20, Loss: 1.8591
Epoch 12/20, Loss: 1.8234
Epoch 13/20, Loss: 1.7356
Epoch 14/20, Loss: 1.7888
Epoch 15/20, Loss: 1.7113
Epoch 16/20, Loss: 1.6873
Epoch 17/20, Loss: 1.6423
Epoch 18/20, Loss: 1.6057
Epoch 19/20, Loss: 1.5728
Epoch 20/20, Loss: 1.6442


In [11]:
# ## 5. Testing and Predictions
def predict(text, model, dataset):
    """
    Predict the label of a single text.

    Args:
        text: Raw input string.
        model: Callable module mapping a (1, seq_length) index tensor to
            class scores of shape (1, num_classes). It is switched to eval
            mode and left in that mode.
        dataset: Object providing ``text_to_ngrams``, ``vocab`` (n-gram ->
            index) and ``label_map`` (label -> class index).

    Returns:
        The label whose class index has the highest score.

    Raises:
        KeyError: If the predicted class index has no entry in
            ``dataset.label_map``.
    """
    model.eval()

    # Unknown n-grams fall back to index 0 (padding).
    indices = [dataset.vocab.get(ngram, 0) for ngram in dataset.text_to_ngrams(text)]
    if not indices:
        # Empty / whitespace-only text: feed one padding position so the
        # model receives a valid integer tensor instead of an empty float one.
        indices = [0]
    text_indices = torch.tensor([indices], dtype=torch.long)

    with torch.no_grad():
        outputs = model(text_indices)
    predicted_index = outputs.argmax(dim=1).item()

    # Look the label up by its index value rather than by position in the
    # dict, which is only correct when keys happen to be ordered by index.
    index_to_label = {index: label for label, index in dataset.label_map.items()}
    return index_to_label[predicted_index]


In [13]:

# Example: classify one unseen sentence using the module-level `model` and
# `dataset`; this cell must be run after the cell that creates and trains them.
test_text = "The direction was dull"
predicted_sentiment = predict(test_text, model, dataset)
print(f"Predicted sentiment: {predicted_sentiment}")



Predicted sentiment: positive


In [4]:
# pandas is imported with the other libraries in the first cell.
# The CSV is read without a header row, so columns are addressed by
# position (0, 1, 2).
df_train = pd.read_csv('dbpedia_train.csv', header=None)
df_train.count()

0    560000
1    560000
2    560000
dtype: int64

In [None]:
# Column 2 is used as the input text and column 0 as the class label.
texts = df_train[2]
labels = df_train[0]

dataset = FastTextDataset(texts, labels)

# NOTE(review): collate_fn pads every document in a batch to the longest one,
# so a 50,000-document batch can require a very large index tensor (and an
# embedding output 100x larger) - confirm this fits in memory.
dataloader = DataLoader(
    dataset, batch_size=50000, shuffle=True, collate_fn=dataset.collate_fn
)


# Model parameters
vocab_size = len(dataset.vocab)
print(vocab_size)
embed_dim = 100
num_classes = len(dataset.label_map)

# Initialize model, loss function, and optimizer
model = FastTextClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0  # sum of the per-batch losses for this epoch
    for batch in dataloader:
        # Use distinct names so the `texts` / `labels` columns selected above
        # are not overwritten by batch tensors.
        batch_texts, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")

4008733
