# **Training FFN Model**

In [None]:
!pip install gensim torch torchvision torchaudio --quiet

import torch
import torch.nn as nn
import torch.optim as optim
from gensim import downloader as api
from collections import defaultdict
import math
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# GLOVE
embedding_dim = 50
word2vec = api.load('glove-wiki-gigaword-50')

# Preprocesing 
def read_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def tokenize_sentences(sentences):
    return [['<s>'] + s.split() + ['</s>'] for s in sentences]

def load_and_tokenize(path):
    return tokenize_sentences(read_file(path))

def build_vocab(corpus):
    vocab = set(word for sentence in corpus for word in sentence)
    vocab = vocab.union({'<unk>'})
    word2idx = {word: i for i, word in enumerate(vocab)}
    idx2word = {i: word for word, i in word2idx.items()}
    return word2idx, idx2word

#  Embedding
def create_embedding_matrix(word2idx, word2vec, embedding_dim):
    matrix = torch.randn(len(word2idx), embedding_dim) * 0.01
    for word, idx in word2idx.items():
        if word in word2vec:
            matrix[idx] = torch.tensor(word2vec[word])
        elif word.lower() in word2vec:
            matrix[idx] = torch.tensor(word2vec[word.lower()])
    return matrix

#  Model
class FFNLanguageModel(nn.Module):
    def __init__(self, context_size, embedding_matrix, hidden_dim=128):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embeddings = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_idxs):
        embeds = self.embeddings(context_idxs).view(context_idxs.size(0), -1)
        out = self.linear1(embeds)
        out = self.relu(out)
        return self.linear2(out)

#  Dataset Building
def make_context_target_pairs(corpus, n, word2idx):
    data = []
    for sentence in corpus:
        sentence = ['<s>'] * (n - 1) + sentence + ['</s>']
        for i in range(n - 1, len(sentence)):
            context = sentence[i - n + 1:i]
            target = sentence[i]
            context_idxs = [word2idx.get(w, word2idx['<unk>']) for w in context]
            target_idx = word2idx.get(target, word2idx['<unk>'])
            data.append((torch.tensor(context_idxs, dtype=torch.long), target_idx))
    return data

#  Training 
def train_model(model, data, epochs=5, batch_size=128, lr=0.001):
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        random.shuffle(data)
        total_loss = 0
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            context_batch = torch.stack([x[0] for x in batch]).to(device)
            target_batch = torch.tensor([x[1] for x in batch]).to(device)

            optimizer.zero_grad()
            outputs = model(context_batch)
            loss = loss_fn(outputs, target_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(data):.4f}")

#  ppl 
def compute_perplexity(model, data):
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0
    with torch.no_grad():
        for context, target in data:
            context = context.unsqueeze(0).to(device)
            target = torch.tensor([target]).to(device)
            output = model(context)
            loss = loss_fn(output, target)
            total_loss += loss.item()
    avg_loss = total_loss / len(data)
    return math.exp(avg_loss)

train_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/train.csv')
val_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/val.csv')
test_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/test.csv')

word2idx, idx2word = build_vocab(train_corpus + val_corpus + test_corpus)
embedding_matrix = create_embedding_matrix(word2idx, word2vec, embedding_dim).to(device)

results = {}

for n in [1, 2, 3]:
    print(f"\n--- Training {n}-gram FFN Model ---")
    train_data = make_context_target_pairs(train_corpus, n, word2idx)
    test_data = make_context_target_pairs(test_corpus, n, word2idx)

    model = FFNLanguageModel(context_size=n-1, embedding_matrix=embedding_matrix).to(device)
    train_model(model, train_data, epochs=5)

    ppl = compute_perplexity(model, test_data)
    results[f"{n}-gram"] = ppl
    print(f"{n}-gram FFN Perplexity: {ppl:.2f}")

#  Results
print("\n--- Perplexity Comparison ---")
for model_name, perplexity in results.items():
    print(f"{model_name}: {perplexity:.2f}")


Using device: cuda

--- Training 1-gram FFN Model ---




Epoch 1, Loss: 0.0603
Epoch 2, Loss: 0.0504
Epoch 3, Loss: 0.0505
Epoch 4, Loss: 0.0506
Epoch 5, Loss: 0.0507
1-gram FFN Perplexity: 805.84

--- Training 2-gram FFN Model ---
Epoch 1, Loss: 0.0425
Epoch 2, Loss: 0.0372
Epoch 3, Loss: 0.0353
Epoch 4, Loss: 0.0343
Epoch 5, Loss: 0.0336
2-gram FFN Perplexity: 195.57

--- Training 3-gram FFN Model ---
Epoch 1, Loss: 0.0402
Epoch 2, Loss: 0.0351
Epoch 3, Loss: 0.0333
Epoch 4, Loss: 0.0322
Epoch 5, Loss: 0.0314
3-gram FFN Perplexity: 212.27

--- Perplexity Comparison ---
1-gram: 805.84
2-gram: 195.57
3-gram: 212.27



# **Hyperparameter Tuning**

In [None]:
# Function For LR Search 
def tune_learning_rate(n, train_corpus, val_corpus, word2idx, embedding_matrix, lr_list, epochs=5):
    print(f"\n Tuning learning rate for {n}-gram model...")
    best_lr = None
    best_val_ppl = float('inf')

    train_data = make_context_target_pairs(train_corpus, n, word2idx)
    val_data = make_context_target_pairs(val_corpus, n, word2idx)

    for lr in lr_list:
        print(f"   Trying LR={lr}...")
        model = FFNLanguageModel(context_size=n-1, embedding_matrix=embedding_matrix).to(device)
        train_model(model, train_data, epochs=epochs, lr=lr)
        val_ppl = compute_perplexity(model, val_data)
        print(f"    ‚Üí Validation PPL: {val_ppl:.2f}")

        if val_ppl < best_val_ppl:
            best_val_ppl = val_ppl
            best_lr = lr

    print(f"Best LR for {n}-gram is {best_lr} with Val PPL: {best_val_ppl:.2f}")
    return best_lr

# LR Tuning 
train_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/train.csv')
val_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/val.csv')
test_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/test.csv')

word2idx, idx2word = build_vocab(train_corpus + val_corpus + test_corpus)
embedding_matrix = create_embedding_matrix(word2idx, word2vec, embedding_dim).to(device)

learning_rates = [0.1, 0.01, 0.001]
results = {}

for n in [1, 2, 3]:
    print(f"\n--- {n}-gram FFN Model ---")
    best_lr = tune_learning_rate(n, train_corpus, val_corpus, word2idx, embedding_matrix, learning_rates)

    # Retrain with best LR on train data
    train_data = make_context_target_pairs(train_corpus, n, word2idx)
    test_data = make_context_target_pairs(test_corpus, n, word2idx)

    model = FFNLanguageModel(context_size=n-1, embedding_matrix=embedding_matrix).to(device)
    train_model(model, train_data, epochs=5, lr=best_lr)

    test_ppl = compute_perplexity(model, test_data)
    results[f"{n}-gram"] = test_ppl
    print(f"{n}-gram Test Perplexity: {test_ppl:.2f}")

#  Results
print("\n Final Perplexity Comparison (after LR tuning):")
for model_name, perplexity in results.items():
    print(f"{model_name}: {perplexity:.2f}")



--- 1-gram FFN Model ---

üîç Tuning learning rate for 1-gram model...
  ‚ñ∂ Trying LR=0.1...




Epoch 1, Loss: 0.0529
Epoch 2, Loss: 0.0522
Epoch 3, Loss: 0.0522
Epoch 4, Loss: 0.0522
Epoch 5, Loss: 0.0522
    ‚Üí Validation PPL: 941.52
  ‚ñ∂ Trying LR=0.01...
Epoch 1, Loss: 0.0522
Epoch 2, Loss: 0.0511
Epoch 3, Loss: 0.0511
Epoch 4, Loss: 0.0511
Epoch 5, Loss: 0.0511
    ‚Üí Validation PPL: 838.63
  ‚ñ∂ Trying LR=0.001...
Epoch 1, Loss: 0.0603
Epoch 2, Loss: 0.0504
Epoch 3, Loss: 0.0505
Epoch 4, Loss: 0.0506
Epoch 5, Loss: 0.0507
    ‚Üí Validation PPL: 806.11
‚úÖ Best LR for 1-gram is 0.001 with Val PPL: 806.11
Epoch 1, Loss: 0.0603
Epoch 2, Loss: 0.0504
Epoch 3, Loss: 0.0505
Epoch 4, Loss: 0.0506
Epoch 5, Loss: 0.0507
1-gram Test Perplexity: 805.94

--- 2-gram FFN Model ---

üîç Tuning learning rate for 2-gram model...
  ‚ñ∂ Trying LR=0.1...
Epoch 1, Loss: 0.0633
Epoch 2, Loss: 0.0581
Epoch 3, Loss: 0.0558
Epoch 4, Loss: 0.0549
Epoch 5, Loss: 0.0530
    ‚Üí Validation PPL: 985.26
  ‚ñ∂ Trying LR=0.01...
Epoch 1, Loss: 0.0470
Epoch 2, Loss: 0.0454
Epoch 3, Loss: 0.0455
Epoch 4

KeyboardInterrupt: 

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# GLOVE 
embedding_dim = 50
word2vec = api.load('glove-wiki-gigaword-50')

# Preproccesing 
def read_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def tokenize_sentences(sentences):
    return [['<s>'] + s.split() + ['</s>'] for s in sentences]

def load_and_tokenize(path):
    return tokenize_sentences(read_file(path))

def build_vocab(corpus):
    vocab = set(word for sentence in corpus for word in sentence)
    vocab = vocab.union({'<unk>'})
    word2idx = {word: i for i, word in enumerate(vocab)}
    idx2word = {i: word for word, i in word2idx.items()}
    return word2idx, idx2word

# Embedding 
def create_embedding_matrix(word2idx, word2vec, embedding_dim):
    matrix = torch.randn(len(word2idx), embedding_dim) * 0.01
    for word, idx in word2idx.items():
        if word in word2vec:
            matrix[idx] = torch.tensor(word2vec[word])
        elif word.lower() in word2vec:
            matrix[idx] = torch.tensor(word2vec[word.lower()])
    return matrix

# Model 
class FFNLanguageModel(nn.Module):
    def __init__(self, context_size, embedding_matrix, hidden_dim=128):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embeddings = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_idxs):
        embeds = self.embeddings(context_idxs).view(context_idxs.size(0), -1)
        out = self.linear1(embeds)
        out = self.relu(out)
        return self.linear2(out)

# Dataset Building 
def make_context_target_pairs(corpus, n, word2idx):
    data = []
    for sentence in corpus:
        sentence = ['<s>'] * (n - 1) + sentence + ['</s>']
        for i in range(n - 1, len(sentence)):
            context = sentence[i - n + 1:i]
            target = sentence[i]
            context_idxs = [word2idx.get(w, word2idx['<unk>']) for w in context]
            target_idx = word2idx.get(target, word2idx['<unk>'])
            data.append((torch.tensor(context_idxs, dtype=torch.long), target_idx))
    return data

# Training 
def train_model(model, data, epochs=5, batch_size=128, lr=0.001):
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        random.shuffle(data)
        total_loss = 0
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            context_batch = torch.stack([x[0] for x in batch]).to(device)
            target_batch = torch.tensor([x[1] for x in batch]).to(device)

            optimizer.zero_grad()
            outputs = model(context_batch)
            loss = loss_fn(outputs, target_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(data):.4f}")

# Ppl
def compute_perplexity(model, data):
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0
    with torch.no_grad():
        for context, target in data:
            context = context.unsqueeze(0).to(device)
            target = torch.tensor([target]).to(device)
            output = model(context)
            loss = loss_fn(output, target)
            total_loss += loss.item()
    avg_loss = total_loss / len(data)
    return math.exp(avg_loss)

# Prediction Func 
def predict_next_word(model, sentence, word2idx, idx2word, n=2):
    model.eval()
    tokens = ['<s>'] * (n - 1) + sentence.split()
    context = tokens[-(n - 1):]
    context_idxs = torch.tensor([[word2idx.get(w, word2idx['<unk>']) for w in context]], dtype=torch.long).to(device)
    with torch.no_grad():
        output = model(context_idxs)
        predicted_idx = torch.argmax(output, dim=1).item()
        return idx2word[predicted_idx]


train_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/train.csv')
val_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/val.csv')
test_corpus = load_and_tokenize('/content/drive/MyDrive/Colab Notebooks/test.csv')

word2idx, idx2word = build_vocab(train_corpus + val_corpus + test_corpus)
embedding_matrix = create_embedding_matrix(word2idx, word2vec, embedding_dim).to(device)

n = 2  # Bigram
train_data = make_context_target_pairs(train_corpus, n, word2idx)
test_data = make_context_target_pairs(test_corpus, n, word2idx)

model = FFNLanguageModel(context_size=n-1, embedding_matrix=embedding_matrix).to(device)
train_model(model, train_data, epochs=5, lr=0.001)

# Perplexity on Test Set
ppl = compute_perplexity(model, test_data)
print(f"\nBigram FFN Test Perplexity: {ppl:.2f}")



predicted_words = []
for sent in sample_df['Truncated Text']:
    next_word = predict_next_word(model, sent, word2idx, idx2word, n=2)
    predicted_words.append(next_word)

sample_df['predicted'] = predicted_words

sample_df.to_csv('/content/drive/MyDrive/Colab Notebooks/sample_with_predictions.csv', index=False)





Using device: cuda
Epoch 1, Loss: 0.0424
Epoch 2, Loss: 0.0371
Epoch 3, Loss: 0.0353
Epoch 4, Loss: 0.0342
Epoch 5, Loss: 0.0335

Bigram FFN Test Perplexity: 198.71

üîÆ Predictions for sample.csv:
Truncated Text ‚Üí </s>
"GROUP RAISES TEXSTYRENE &lt;FOAM> STAKE TO 11.7 PCT An investor group led by Dart Container Corp, a Sarasota, Fla., plastic utensil maker, told the Securities and Exchange Commission it raised its stake in Texstyrene Corp to 420,500 common shares or 11.7 of the total outstanding. The group said it bought the shares for investment and intends to continue to buy Texstyrene stock in the future. The group said its most recent purchases included 106,000 shares bought March 10-31" ‚Üí </s>
"&lt;BROAD> ACQUIRES &lt;VOGT AND CONANT> UNIT Broad Corp said it acquired the construction activities of Vogt and Conant Co of Cleveland. The combined companies, to be called Broad, Vogt and Conant INc, will be the largest structural steel erection company in the U.S. Combined sales of

In [None]:
import pandas as pd
sample_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sample.csv') 


In [9]:
sample_df

Unnamed: 0,Truncated Text
0,GROUP RAISES TEXSTYRENE &lt;FOAM> STAKE TO 11....
1,&lt;BROAD> ACQUIRES &lt;VOGT AND CONANT> UNIT ...
2,BANKERS TRUST CO RAISES BROKER LOAN RATE Banke...
3,SINGAPORE BANKS SAY DIVERSIFICATION KEY TO GRO...
4,UNION PACIFIC &lt;UNP> TO SELL PART OF REFINER...
...,...
94,DUTCH PARLIAMENT BACKS STAND AGAINST OILS TAX ...
95,&lt;GEMINI FOOD CORP> SIX MTHS JANUARY 31 NET ...
96,H.K. M3 MONEY SUPPLY RISES 1.4 PCT IN FEBRUARY...
97,AMERICAN MEDICAL INTERNATIONAL INC &lt;AMI> NE...


In [11]:
sample_df_new = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sample_with_predictions.csv')


In [12]:
sample_df_new

Unnamed: 0,Truncated Text,predicted
0,GROUP RAISES TEXSTYRENE &lt;FOAM> STAKE TO 11....,</s>
1,&lt;BROAD> ACQUIRES &lt;VOGT AND CONANT> UNIT ...,cts
2,BANKERS TRUST CO RAISES BROKER LOAN RATE Banke...,for
3,SINGAPORE BANKS SAY DIVERSIFICATION KEY TO GRO...,said.
4,UNION PACIFIC &lt;UNP> TO SELL PART OF REFINER...,</s>
...,...,...
94,DUTCH PARLIAMENT BACKS STAND AGAINST OILS TAX ...,</s>
95,&lt;GEMINI FOOD CORP> SIX MTHS JANUARY 31 NET ...,loss
96,H.K. M3 MONEY SUPPLY RISES 1.4 PCT IN FEBRUARY...,</s>
97,AMERICAN MEDICAL INTERNATIONAL INC &lt;AMI> NE...,the
