In [2]:
# FastText-Style Text Classification with PyTorch

# ## 1. Import Required Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from itertools import chain
import re


In [3]:

# ## 2. Dataset Preparation
class FastTextDataset(Dataset):
    """Text-classification dataset that represents texts as character n-grams.

    Each whitespace-separated token is wrapped in ``<`` and ``>`` and expanded
    into every character n-gram whose length lies in ``ngram_range``
    (inclusive).  Samples are kept as lists of n-gram strings; ``collate_fn``
    turns a batch of samples into a padded tensor of vocabulary indices.
    """

    def __init__(self, texts, labels, vocab=None, ngram_range=(3, 6)):
        """
        Initialize the dataset with texts, labels, and optional vocabulary.

        Args:
            texts: Iterable of raw text strings.
            labels: Iterable of hashable labels, aligned with ``texts``.
            vocab: Optional ``(vocab, label_map)`` pair to reuse.  When
                ``None``, both mappings are built from ``texts``/``labels``.
            ngram_range: ``(min_n, max_n)`` inclusive n-gram lengths.

        Raises:
            KeyError: If a label is missing from the ``label_map`` in use.
        """
        self.texts = texts
        self.labels = labels
        self.ngram_range = ngram_range

        if vocab is None:
            self.vocab, self.label_map = self.build_vocab_and_labels(texts, labels)
        else:
            self.vocab, self.label_map = vocab

        self.encoded_texts = [self.text_to_ngrams(text) for text in texts]
        self.encoded_labels = [self.label_map[label] for label in labels]

    def build_vocab_and_labels(self, texts, labels):
        """
        Create vocab of n-grams and map labels to indices.

        Indices are assigned in first-seen order, so the mappings are the same
        on every run (iterating a ``set`` of strings is not).  Index 0 is
        reserved for ``"<pad>"``; the remaining n-grams get contiguous indices
        ``1 .. len(vocab) - 1``.

        Returns:
            ``(vocab, label_map)``: two dicts mapping n-gram -> index and
            label -> class index.
        """
        # Reserve the padding slot first.  A corpus token "pad" produces the
        # n-gram "<pad>"; because the key already exists it shares index 0
        # instead of leaving a hole in the index range, which would push the
        # largest index past len(vocab) - 1.
        vocab = {"<pad>": 0}
        for text in texts:
            for ngram in self.text_to_ngrams(text):
                vocab.setdefault(ngram, len(vocab))

        label_map = {}
        for label in labels:
            label_map.setdefault(label, len(label_map))
        return vocab, label_map

    def text_to_ngrams(self, text):
        """
        Tokenize text into n-grams.

        Tokens shorter than a given n contribute no n-grams of that length, so
        the result may be empty (e.g. for an empty string).
        """
        min_n, max_n = self.ngram_range[0], self.ngram_range[1]
        ngrams = []
        for token in text.split():
            token = f"<{token}>"
            for n in range(min_n, max_n + 1):
                ngrams.extend(token[i : i + n] for i in range(len(token) - n + 1))
        return ngrams

    def __len__(self):
        """Return the number of samples."""
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        """
        Get a single data sample as ``(list_of_ngram_strings, label_index)``.
        """
        return self.encoded_texts[idx], self.encoded_labels[idx]

    def collate_fn(self, batch):
        """
        Collate function for padding and batching.

        Args:
            batch: Sequence of ``(ngrams, label_index)`` samples.

        Returns:
            ``(text_indices, labels)``: int64 tensors of shape
            ``(batch, max_len)`` and ``(batch,)``.  N-grams missing from the
            vocabulary map to index 0, the same index used for padding.
        """
        texts, labels = zip(*batch)
        pad_index = self.vocab.get("<pad>", 0)
        # Keep at least one column: a batch whose texts all have zero n-grams
        # would otherwise become an empty float tensor that nn.Embedding
        # cannot index.
        max_length = max(1, max(len(text) for text in texts))
        rows = [
            [self.vocab.get(ngram, 0) for ngram in text]
            + [pad_index] * (max_length - len(text))
            for text in texts
        ]
        text_indices = torch.tensor(rows, dtype=torch.long)
        labels = torch.tensor(labels, dtype=torch.long)
        return text_indices, labels


In [4]:

# ## 3. FastText Model
class FastTextClassifier(nn.Module):
    """Bag-of-n-grams classifier: averaged n-gram embeddings + linear layer.

    Index 0 is treated as padding (``padding_idx=0``) and is excluded from the
    average, so a text's representation does not depend on how much padding
    its batch added.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        """
        Args:
            vocab_size: Number of rows in the embedding table (largest index + 1).
            embed_dim: Size of each n-gram embedding.
            num_classes: Number of output classes.
        """
        super(FastTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x: Integer tensor of n-gram indices, shape (batch_size, seq_length).

        Returns:
            Logits of shape (batch_size, num_classes).
        """
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embed_dim)

        # Average only over real (non-padding) positions. A plain mean over
        # dim=1 divides by the padded length, which shrinks the vectors of
        # short texts by an amount that depends on the longest text in the batch.
        mask = (x != self.embedding.padding_idx).unsqueeze(-1).to(embedded.dtype)
        summed = (embedded * mask).sum(dim=1)
        # clamp avoids 0/0 (NaN) for rows that contain no real n-gram at all
        counts = mask.sum(dim=1).clamp(min=1.0)
        doc_vector = summed / counts  # Shape: (batch_size, embed_dim)

        output = self.fc(doc_vector)  # Shape: (batch_size, num_classes)
        return output


In [10]:

# ## 4. Training the Model
# Sample data: six short reviews with a sentiment label each
texts = [
    "I loved the movie",
    "The plot was dull",
    "Amazing direction and acting",
    "Waste of my time",
    "Outstanding performances",
    "Awful performance"
]
labels = ["positive", "negative", "positive", "negative", "positive","negative"]

# Prepare dataset and DataLoader
dataset = FastTextDataset(texts, labels)
dataloader = DataLoader(
    dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn
)

# Model parameters
vocab_size = len(dataset.vocab)
embed_dim = 100
num_classes = len(dataset.label_map)

# Initialize model, loss function, and optimizer
model = FastTextClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0  # sum of the per-batch losses in this epoch (not an average)
    for batch in dataloader:
        # Batch-specific names: unpacking into `texts` / `labels` would replace
        # the raw sample lists defined above with the tensors of the last batch.
        batch_texts, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")


Epoch 1/500, Loss: 2.0608
Epoch 2/500, Loss: 2.0107
Epoch 3/500, Loss: 2.0038
Epoch 4/500, Loss: 1.9605
Epoch 5/500, Loss: 1.9611
Epoch 6/500, Loss: 1.9215
Epoch 7/500, Loss: 1.8812
Epoch 8/500, Loss: 1.8722
Epoch 9/500, Loss: 1.8484
Epoch 10/500, Loss: 1.8583
Epoch 11/500, Loss: 1.7928
Epoch 12/500, Loss: 1.8133
Epoch 13/500, Loss: 1.7903
Epoch 14/500, Loss: 1.7144
Epoch 15/500, Loss: 1.7422
Epoch 16/500, Loss: 1.7205
Epoch 17/500, Loss: 1.6958
Epoch 18/500, Loss: 1.6738
Epoch 19/500, Loss: 1.5814
Epoch 20/500, Loss: 1.5361
Epoch 21/500, Loss: 1.5999
Epoch 22/500, Loss: 1.5730
Epoch 23/500, Loss: 1.5455
Epoch 24/500, Loss: 1.5203
Epoch 25/500, Loss: 1.3417
Epoch 26/500, Loss: 1.4635
Epoch 27/500, Loss: 1.3451
Epoch 28/500, Loss: 1.2587
Epoch 29/500, Loss: 1.2043
Epoch 30/500, Loss: 1.2315
Epoch 31/500, Loss: 1.2003
Epoch 32/500, Loss: 1.2895
Epoch 33/500, Loss: 1.2600
Epoch 34/500, Loss: 1.1194
Epoch 35/500, Loss: 1.1978
Epoch 36/500, Loss: 1.1692
Epoch 37/500, Loss: 1.1441
Epoch 38/5

In [11]:
# ## 5. Testing and Predictions
def predict(text, model, dataset):
    """
    Predict the label of a single text.

    Args:
        text: Raw input string.
        model: Callable model taking an int64 tensor of shape (1, seq_length)
            and returning scores of shape (1, num_classes).
        dataset: Object providing ``text_to_ngrams``, ``vocab`` (n-gram -> index)
            and ``label_map`` (label -> class index).

    Returns:
        The label whose class index has the highest score.
    """
    model.eval()
    # N-grams missing from the vocabulary map to index 0
    indices = [dataset.vocab.get(ngram, 0) for ngram in dataset.text_to_ngrams(text)]
    if not indices:
        # Text with no n-grams (e.g. an empty string): feed one padding index
        # rather than an empty tensor, which the embedding layer cannot index.
        indices = [dataset.vocab.get("<pad>", 0)]
    text_indices = torch.tensor([indices], dtype=torch.long)

    with torch.no_grad():
        outputs = model(text_indices)
    predicted_label = outputs.argmax(dim=1).item()

    # Invert label -> index explicitly instead of relying on the dict's key
    # order happening to match the class indices.
    index_to_label = {idx: label for label, idx in dataset.label_map.items()}
    return index_to_label[predicted_label]


In [15]:

# Example: classify one unseen sentence with the trained model
test_text = "The direction was greater than what I expected"
predicted_sentiment = predict(
    text=test_text,
    model=model,
    dataset=dataset,
)
print(f"Predicted sentiment: {predicted_sentiment}")



Predicted sentiment: positive


In [16]:
import pandas as pd

# Load the training CSV. header=None: the first row is data, so pandas
# labels the columns with integers instead of names.
df_train = pd.read_csv(
    'dbpedia_train.csv',
    header=None,
)
# Non-null row count per column
df_train.count()

0    560000
1    560000
2    560000
dtype: int64

In [None]:
%%time

# Draw a fixed-size random subset of the training frame. random_state pins
# the draw so the subset (and the vocabulary size printed below) is the same
# on every run.
sample = df_train.sample(10000, random_state=42)
texts = sample[2]   # column 2 is used as the input text
labels = sample[0]  # column 0 is used as the class label
print(sample.count())


dataset = FastTextDataset(texts, labels)


# NOTE(review): num_workers=16 starts 16 loader processes — confirm this suits
# the machine the notebook runs on.
dataloader = DataLoader(
    dataset, batch_size=128, shuffle=True, collate_fn=dataset.collate_fn, num_workers=16
)


# Model parameters
vocab_size = len(dataset.vocab)
print(vocab_size)
embed_dim = 100
num_classes = len(dataset.label_map)

# Initialize model, loss function, and optimizer
model = FastTextClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Halve the learning rate every 5 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0  # sum of the per-batch losses in this epoch (not an average)
    for batch in dataloader:
        # Batch-specific names: unpacking into `texts` / `labels` would replace
        # the sampled Series above with the tensors of the last batch.
        batch_texts, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        # Cap the global gradient norm before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        epoch_loss += loss.item()
    scheduler.step()  # Adjust learning rate
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")

0    10000
1    10000
2    10000
dtype: int64
392368
Epoch 1/30, Loss: 138.6678
