In [None]:
#importing libraries

import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

import pandas as pd
import numpy as np
import warnings

from tqdm import tqdm
# Silence every Python warning for the rest of the session so cell output stays
# compact. NOTE(review): this presumably also hides deprecation notices from the
# libraries imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Read the technology-domain training CSV into a DataFrame
technology_train_csv = '/storage/ice1/6/4/tchavan3/technology_train.csv'
df = pd.read_csv(technology_train_csv)

In [None]:
# Load the tokenizer published with the BART-large checkpoint

tokenizer_checkpoint = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(tokenizer_checkpoint)

# Preprocess the input and output text
def preprocess_function(examples, tok=None):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with a 'text' list and a 'summary' list, as
            passed by `Dataset.map(..., batched=True)`.
        tok: Optional tokenizer to use instead of the notebook-level
            `tokenizer`. Defaults to None, which selects the notebook-level one.

    Returns:
        The encoding of the texts (truncated/padded to 1024 tokens) with an
        extra 'labels' entry: the summary token ids (truncated/padded to 128),
        where every padding position is replaced by -100.
    """
    tok = tokenizer if tok is None else tok

    inputs = tok(examples['text'], max_length=1024, truncation=True, padding='max_length')
    labels = tok(examples['summary'], max_length=128, truncation=True, padding='max_length')

    # Labels equal to -100 are skipped by the model's loss. Without this mask
    # the loss would also be computed on the padding that fills each summary
    # up to 128 tokens.
    ignore_index = -100
    pad_id = tok.pad_token_id
    inputs['labels'] = [
        [ignore_index if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return inputs

In [None]:
# Read the paraphrase-augmented CSV for each of the four domains

df_food, df_sports, df_entertainment, df_architecture = [
    pd.read_csv(csv_path)
    for csv_path in (
        './food-aug.csv',
        './sports-aug.csv',
        './ent-aug.csv',
        './arch-aug.csv',
    )
]

In [None]:
# Stack the four augmented domains and the technology data into one frame,
# keeping only the 'text' and 'summary' columns and renumbering the rows

training_columns = ['text', 'summary']
source_frames = [df_food, df_sports, df_entertainment, df_architecture, df]
result = pd.concat(
    [frame[training_columns] for frame in source_frames],
    ignore_index=True,
)
result

Unnamed: 0,text,summary
0,"**""Taste of the Past: Revival of Classic Briti...",Traditional British cuisine is experiencing a ...
1,**Federal Regulations Set to Revolutionize Ame...,Federal regulations aim to reduce food waste a...
2,**Sustainable Seafood: A Growing Concern for C...,Coastal communities face unprecedented pressur...
3,"""Foodies on the Move: The Rise of Plant-Based ...",Plant-based cuisine is revolutionizing the foo...
4,"**""Revolutionizing the World of Fruit: The Ris...",Apple's versatility is revolutionizing the cul...
...,...,...
53973,Perhaps Microsoft has finally gotten a little ...,...
53974,Microsoft today released updates to fix at lea...,...
53975,Microsoft today issued software updates to fix...,"The latest news on computer, technology and ne..."
53976,"Vote: Abolish Middle-East ""THEOCRACY & MONARCH...",Need to Know - PostGlobal on PostGlobal; blog ...


In [None]:
from datasets import Dataset

# Wrap the combined DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(result)

# Tokenize in batches, then drop the raw 'text' and 'summary' columns
raw_columns = ["text", "summary"]
tokenized_datasets = hf_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)


Map: 100%|██████████| 53978/53978 [02:25<00:00, 371.73 examples/s]


In [None]:
#splitting the dataset 80-20 train test split

from sklearn.model_selection import train_test_split
# Hold out 20% of the tokenized examples for evaluation.
# The result is bound to `split_datasets` (not `train_test_split`) so the
# sklearn function imported above is not overwritten, and the fixed seed makes
# the train/eval membership identical on every run.
split_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [None]:
# Load the pretrained BART-large model that will be fine-tuned

model_checkpoint = 'facebook/bart-large'
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

# Training configuration: evaluate and save a checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results_paraphrased/",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy='epoch',
)


In [None]:
# Build the Trainer from the model, its training arguments and the two splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,  # held-out 20% of the data
    train_dataset=train_dataset,
)

# Run fine-tuning; the Trainer shows its own progress bar while training
trainer.train()