In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; later cells
# read the training data from, and save the model to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

# Replace the path with the actual path to your file in Google Drive.
# NOTE(review): the final cell of this notebook saves under
# '/content/drive/MyDrive/...' (no space) -- confirm both spellings resolve to
# the same Drive folder in this runtime.
file_path = '/content/drive/My Drive/NLP/train.jsonl'

# Parse the JSON Lines file into a list with one parsed object per record.
# The encoding is pinned to UTF-8 so decoding does not depend on the runtime's
# locale default.
data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise JSONDecodeError on them.
            continue
        data.append(json.loads(line))

In [None]:
import pandas as pd

# Turn the list of parsed records into a table (one row per record)
df = pd.DataFrame(data=data)

# Sanity-check the load by printing the first five rows
print(df.head(5))

In [None]:
import re
from urllib.parse import urlparse

# Category -> list of lowercase keyword fragments looked for in URL path segments.
# Insertion order matters: categorize_url walks this dict in order and returns
# the first category with a matching keyword.
categories = {
    'architecture': [
        'architect', 'building', 'construction', 'structure',
    ],
    'food': [
        'recipe', 'cuisine', 'cook', 'restaurant',
        'food', 'dish', 'chef', 'healthy',
    ],
    'entertainment': [
        'movie', 'show', 'music', 'celebrity', 'actor',
        'tv', 'film', 'hollywood', 'cinema', 'theatre',
    ],
    'sports': [
        'sport', 'athlete', 'league', 'tournament',
        'game', 'football', 'cricket', 'hockey',
    ],
    'technology': [
        'tech', 'software', 'hardware', 'programming', 'gadget', 'app',
        'phone', 'laptop', 'apple', 'microsoft', 'meta',
    ],
}

# Pull the lowercase path segments out of a URL
def extract_keywords_from_url(url):
    """Return the lowercased path segments of ``url``.

    Only the path component is inspected (scheme, host, query and fragment
    are ignored). A segment is the run of ASCII letters, digits and hyphens
    that immediately follows a '/'; anything after the first other character
    in that segment (e.g. '.html' or '_suffix') is dropped.
    """
    url_path = urlparse(url).path
    segments = []
    for hit in re.finditer(r'/([a-zA-Z0-9-]+)', url_path):
        segments.append(hit.group(1).lower())
    return segments

# Function to categorize URLs based on keywords
def categorize_url(url):
    """Return the first category whose keywords occur in the URL's path.

    The URL is lowercased and split into path segments with
    ``extract_keywords_from_url``; a category matches when any of its keywords
    is a substring of any segment. Categories are tried in the insertion order
    of ``categories`` and the first match wins.

    Args:
        url: URL string. Non-string values (e.g. None or NaN from a missing
            DataFrame cell) are accepted and classified as 'other'.

    Returns:
        A key of ``categories``, or 'other' when nothing matches or the URL
        cannot be parsed.
    """
    if not isinstance(url, str):
        return 'other'

    try:
        # Parse once per URL instead of once per category.
        segments = extract_keywords_from_url(url.lower())
    except ValueError:
        # urlparse raises ValueError on some malformed URLs; one bad row
        # should not abort categorising the whole column.
        return 'other'

    for category, keywords in categories.items():
        # Plain substring test: same result as matching '.*keyword.*' against
        # each segment, without rebuilding a regex for every call. (A category
        # with an empty keyword list now matches nothing rather than everything.)
        if any(keyword in segment for segment in segments for keyword in keywords):
            return category
    return 'other'

# Label every row by running its URL through categorize_url
df['category'] = df['url'].apply(categorize_url)

# Spot-check the result: show each URL next to the category it was assigned
print(df.loc[:, ['url', 'category']].head())

                                                 url category
0  http://www.nytimes.com/2006/06/04/sports/socce...   sports
1  http://www.nytimes.com/2005/12/24/politics/24s...    other
2  http://www.nytimes.com/2006/04/23/business/you...    other
3  http://www.nydailynews.com/archives/gossip/199...    other
4  http://www.nydailynews.com/archives/entertainm...    other


In [None]:
# Tally how many rows fell into each category and print the tally
category_counts = df.loc[:, 'category'].value_counts()
print(category_counts)

category
other            807420
entertainment     60980
sports            59724
technology        49978
food              13454
architecture       3485
Name: count, dtype: int64


In [None]:
# Keep only the rows that were labelled 'technology'
technology_df = df.loc[df['category'].eq('technology')]

# Report how many rows survived the filter
print(f"Number of technology samples: {technology_df.shape[0]}")

Number of technology samples: 49978


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

# Initialize tokenizer
# Loads the pretrained 't5-small' tokenizer. The same object is passed to the
# dataset for encoding and used during evaluation to decode token ids.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Torch Dataset serving tokenized (article, summary) pairs from a DataFrame
class SummaryDataset(Dataset):
    """Map-style dataset of tokenized article/summary pairs.

    Rows are addressed by position. For each row the 'text' column is encoded
    as the model input and the 'summary' column as the target; both are
    converted with ``str()``, truncated, and padded to a fixed length.

    Args:
        df: DataFrame with 'text' and 'summary' columns.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Fixed token length of the encoded article.
        summary_max_len: Fixed token length of the encoded summary.
    """

    def __init__(self, df, tokenizer, max_len=512, summary_max_len=150):
        self.data = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.summary_max_len = summary_max_len

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.data)

    def _tokenize(self, text, limit):
        """Encode ``text`` to exactly ``limit`` tokens (truncate, then pad)."""
        return self.tokenizer.encode_plus(
            text,
            max_length=limit,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the encoded pair at position ``index`` as a dict of 1-D tensors.

        Keys: 'input_ids' and 'attention_mask' for the article, 'labels' for
        the summary token ids. Padding positions in 'labels' are returned
        as-is (they are not replaced with an ignore index here).
        """
        source_text = str(self.data['text'].iloc[index])
        target_text = str(self.data['summary'].iloc[index])

        source = self._tokenize(source_text, self.max_len)
        target = self._tokenize(target_text, self.summary_max_len)

        # encode_plus returns tensors with a leading batch axis; flatten drops it
        return dict(
            input_ids=source['input_ids'].flatten(),
            attention_mask=source['attention_mask'].flatten(),
            labels=target['input_ids'].flatten(),
        )

# Wrap the technology subset in the dataset and serve it in shuffled batches of 8
train_dataset = SummaryDataset(df=technology_df, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

In [None]:
from transformers import AdamW
import torch
from tqdm import tqdm

# Initialize T5 model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer
# NOTE(review): `transformers.AdamW` has been deprecated upstream in favour of
# `torch.optim.AdamW`; confirm the installed transformers version still exports it.
optimizer = AdamW(model.parameters(), lr=3e-5)

# Training loop: one full pass over train_loader per epoch, one optimizer step
# per batch.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # The description is constant within an epoch, so set it once here
    # rather than on every batch.
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch}')
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The dataset pads every target to a fixed length with the pad token.
        # Replace those positions with -100, the index the model's loss
        # ignores, so the model is not trained (and the reported loss is not
        # diluted) on predicting padding.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Show the current batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 6248/6248 [13:34<00:00,  7.67it/s, loss=0.454]
Epoch 1: 100%|██████████| 6248/6248 [13:34<00:00,  7.67it/s, loss=0.119]
Epoch 2: 100%|██████████| 6248/6248 [13:34<00:00,  7.67it/s, loss=0.478]
Epoch 3: 100%|██████████| 6248/6248 [13:35<00:00,  7.66it/s, loss=0.333]
Epoch 4: 100%|██████████| 6248/6248 [13:35<00:00,  7.67it/s, loss=0.109]
Epoch 5: 100%|██████████| 6248/6248 [13:35<00:00,  7.66it/s, loss=0.352]
Epoch 6: 100%|██████████| 6248/6248 [13:35<00:00,  7.67it/s, loss=0.226]
Epoch 7: 100%|██████████| 6248/6248 [13:35<00:00,  7.66it/s, loss=0.406]
Epoch 8: 100%|██████████| 6248/6248 [13:35<00:00,  7.66it/s, loss=0.702]
Epoch 9: 100%|██████████| 6248/6248 [13:35<00:00,  7.66it/s, loss=0.18]


In [None]:
from rouge_score import rouge_scorer
import numpy as np

# Initialize ROUGE scorer
# Configured for the rouge1, rouge2 and rougeL metrics with stemming enabled;
# evaluate() reads the F-measure of each metric from this scorer's results.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate(model, dataloader, max_length=150, **generate_kwargs):
    """Generate summaries for every batch and return mean ROUGE F-measures.

    Uses the module-level ``tokenizer``, ``device`` and ``scorer``. The model
    is switched to eval mode and left in it.

    Args:
        model: Model exposing ``generate``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        max_length: Maximum generated length in tokens. Defaults to 150, the
            default target length of SummaryDataset. Without an explicit
            value, generation stops at the library default (20 tokens unless
            the model's generation config overrides it), which truncates
            predictions relative to the references and depresses ROUGE.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate`` (e.g. ``num_beams``).

    Returns:
        Dict with keys 'rouge1', 'rouge2', 'rougeL' mapping to the mean
        F-measure over all samples (NaN for each key if the dataloader
        yielded no samples).
    """
    model.eval()
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    pad_token_id = tokenizer.pad_token_id

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Labels are only decoded, never fed to the model, so they stay on
            # the CPU. Positions holding the loss ignore index (-100) are
            # mapped back to the pad token so they can be decoded and skipped.
            labels = batch['labels']
            labels = labels.masked_fill(labels == -100, pad_token_id)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                **generate_kwargs,
            )

            # Decode the predictions and labels
            pred_summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in outputs]
            real_summaries = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

            # Compute ROUGE scores for each sample (reference first, prediction second)
            for pred, real in zip(pred_summaries, real_summaries):
                score = scorer.score(real, pred)
                for metric in scores:
                    scores[metric].append(score[metric].fmeasure)

    # Return average scores; guard the empty case so np.mean does not warn
    avg_scores = {
        key: (np.mean(val) if val else np.float64('nan'))
        for key, val in scores.items()
    }
    return avg_scores

# Evaluate on a sample of the technology data.
# NOTE(review): technology_df is the same frame the model was trained on, so
# these ROUGE scores measure fit to the training data, not generalization.
# Hold rows out before training to get an unbiased estimate.
# The sample size is capped at the number of available rows (sample() raises
# when asked for more rows than exist) and the seed is fixed so reruns score
# the same rows.
eval_dataset = SummaryDataset(
    technology_df.sample(n=min(1000, len(technology_df)), random_state=42),
    tokenizer,
)
eval_loader = DataLoader(eval_dataset, batch_size=8)

rouge_scores = evaluate(model, eval_loader)
print(rouge_scores)

In [None]:
# Save the fine-tuned model and the tokenizer into the same Google Drive
# directory (requires Drive to be mounted at /content/drive).
model.save_pretrained('/content/drive/MyDrive/NLP/t5-final-model')
tokenizer.save_pretrained('/content/drive/MyDrive/NLP/t5-final-model')