In [None]:
# Mount Google Drive at /content/drive. The cells below read the dataset from,
# and write the data splits and the model to, paths under
# "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the prepared corpus (one JSON record per line) and discard every row
# that has a missing value in any column.
df = pd.read_json("/content/drive/My Drive/data/ready_to_feed_22-7-2025_eng.jsonl", lines=True)
df = df.dropna()

# Two-stage split with a fixed seed: hold out 20% of the rows, then cut that
# hold-out in half, giving an 80/10/10 train/validation/test partition.
split_options = dict(random_state=42, shuffle=True)
train_df, temp_df = train_test_split(df, test_size=0.2, **split_options)
val_df, test_df = train_test_split(temp_df, test_size=0.5, **split_options)

# Persist each partition as JSON Lines so later cells can reload it.
for split_frame, split_path in (
    (train_df, "/content/drive/My Drive/data/train.jsonl"),
    (val_df, "/content/drive/My Drive/data/val.jsonl"),
    (test_df, "/content/drive/My Drive/data/test.jsonl"),
):
    split_frame.to_json(split_path, orient="records", lines=True)


In [None]:
# Main Training Script - CONFIGURED FOR MAXIMUM STABLE SPEED

import pandas as pd
import os
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# --- Ensure a clean start on the GPU ---
# Release cached CUDA memory that earlier cells may have left behind.
torch.cuda.empty_cache()

# --- Configuration ---
TEXT_COLUMN = "full_text_minimal"    # column tokenized as the model input
SUMMARY_COLUMN = "Summary_minimal"   # column tokenized as the training target
MODEL_NAME = "facebook/bart-large-cnn"
OUTPUT_DIR = "bart-english-news-summarizer"
# --------------------

# 1. Load your prepared English data
# The train/validation files are produced by the split cell earlier in this
# notebook, which already drops incomplete rows and shuffles with a fixed
# seed, so nothing is re-filtered or re-shuffled here.
df_train = pd.read_json("/content/drive/My Drive/data/train.jsonl", lines=True)
df_val = pd.read_json("/content/drive/My Drive/data/val.jsonl", lines=True)

# Fail fast with a clear message if a split lacks one of the configured
# columns, instead of failing later inside the multiprocess tokenization.
for split_name, split_df in (("train", df_train), ("validation", df_val)):
    missing_columns = {TEXT_COLUMN, SUMMARY_COLUMN} - set(split_df.columns)
    if missing_columns:
        raise KeyError(
            f"{split_name} split is missing column(s): {sorted(missing_columns)}"
        )

train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
print(
    f"✅ Data loaded: {len(train_dataset)} training / "
    f"{len(val_dataset)} validation examples."
)

# 2. Load the pre-trained BART checkpoint together with its tokenizer
print(f"Loading model and tokenizer for '{MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.cuda()  # move the weights onto the GPU

# torch.compile is intentionally NOT applied to the model: it caused conflicts
# with downstream libraries, and leaving it out restores stability. The
# standard Trainer behaviour is therefore used from here on.

# 3. Preprocessing function
def preprocess(batch):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        batch: Mapping of column name to a list of values, as supplied by
            ``Dataset.map(..., batched=True)``. The ``TEXT_COLUMN`` and
            ``SUMMARY_COLUMN`` entries are read.

    Returns:
        The tokenizer output for the articles, with an added ``labels`` entry
        holding the token ids of the summaries.

    Articles are truncated to 1024 tokens and summaries to 128, but nothing is
    padded here. Padding the summaries to ``max_length`` at this stage would
    copy real pad-token ids into ``labels``; the loss would then also be
    computed on padding positions, which deflates the reported loss and trains
    the model to emit padding. Leaving the sequences unpadded lets
    ``DataCollatorForSeq2Seq`` pad every batch to its own longest sequence and
    fill the label padding with -100, which the loss ignores.
    """
    model_inputs = tokenizer(batch[TEXT_COLUMN], max_length=1024, truncation=True)
    # `text_target` tells the tokenizer these strings are decoder targets.
    targets = tokenizer(
        text_target=batch[SUMMARY_COLUMN], max_length=128, truncation=True
    )
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

# 4. Tokenize both splits, spreading the work across every available CPU core
print("Tokenizing the dataset using multiple cores...")

# Options shared by the training and validation passes.
map_options = dict(
    batched=True,
    num_proc=os.cpu_count(),
    desc="Running tokenizer on dataset",
)

# The raw columns are removed so only the output of `preprocess` remains.
tokenized_train = train_dataset.map(
    preprocess, remove_columns=train_dataset.column_names, **map_options
)
tokenized_val = val_dataset.map(
    preprocess, remove_columns=val_dataset.column_names, **map_options
)
print("✅ Dataset tokenized.")

# No manual set_format() call is made on the tokenized data; the Trainer
# handles the format.

# 5. Define Training Arguments for Maximum Stable Performance
training_args = Seq2SeqTrainingArguments(
    output_dir=f"./{OUTPUT_DIR}-checkpoints",

    per_device_train_batch_size=8,  # TUNE THIS FOR YOUR GPU (Try 16, 24, 32...)
    gradient_accumulation_steps=2,  # 8 * 2 = 16 examples per optimizer step per device
    fp16=True,  # mixed precision; essential for speed
    learning_rate=3e-5,
    num_train_epochs=2,

    # Evaluation is disabled by default, which would leave the validation
    # split handed to the trainer completely unused. Evaluate on the same
    # cadence as checkpointing so each saved checkpoint has a validation loss.
    eval_strategy="steps",
    eval_steps=2000,
    per_device_eval_batch_size=8,

    # Background worker processes fetch batches so the GPU doesn't wait.
    # os.cpu_count() may return None; fall back to loading in the main process.
    dataloader_num_workers=os.cpu_count() or 0,

    dataloader_pin_memory=True,
    logging_steps=200,
    save_steps=2000,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    report_to="none",
)

# 6. Initialize the Trainer
import inspect

# The collator pads each batch (inputs and labels) at collation time.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases deprecate the `tokenizer=` argument in favour of
# `processing_class=` (the old name emits a FutureWarning). Pick whichever
# name the installed Seq2SeqTrainer accepts so this cell runs without the
# warning on new versions and still works on older ones.
trainer_init_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_kwarg = (
    "processing_class" if "processing_class" in trainer_init_params else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

# 7. Start Fine-Tuning!
# Runs the training loop with the settings held in `training_args`.
print(f"\nStarting to fine-tune '{MODEL_NAME}' on your English dataset...")
trainer.train()
print("\n✅ Fine-tuning complete!")

# 8. Save the final model for easy use later
# The model is written to the OUTPUT_DIR folder relative to the current
# working directory; the Drive-copy cell further down expects this same
# folder name.
trainer.save_model(OUTPUT_DIR)
print(f"\n✅ Final fine-tuned model saved to: ./{OUTPUT_DIR}")

✅ Dataset shuffled.
Loading model and tokenizer for 'facebook/bart-large-cnn'...
Tokenizing the dataset using multiple cores...


Running tokenizer on dataset (num_proc=2):   0%|          | 0/47643 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=2):   0%|          | 0/5955 [00:00<?, ? examples/s]

✅ Dataset tokenized.

Starting to fine-tune 'facebook/bart-large-cnn' on your English dataset...


  trainer = Seq2SeqTrainer(


Step,Training Loss
200,0.8104
400,0.2406
600,0.2334
800,0.2287
1000,0.2198
1200,0.2214
1400,0.2203
1600,0.2195
1800,0.215
2000,0.2188





✅ Fine-tuning complete!

✅ Final fine-tuned model saved to: ./bart-english-news-summarizer


In [None]:
# Cell: Save Final Model to Google Drive

import os
from google.colab import drive
import time

print("--- Starting the process to save the model to Google Drive ---")

# --- 1. Mount Google Drive ---
# This step ensures your Colab environment has access to your Drive.
# It will ask for permission if not already mounted.
try:
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    print(f"❌ Error mounting Google Drive: {e}")
    # Stop here: without a mounted Drive the steps below would create and
    # fill an ordinary local folder under /content/drive, and the model
    # would be lost when the Colab session ends.
    raise
print("✅ Google Drive mounted successfully.")


# --- 2. Define File Paths ---
import shutil  # used for the recursive copy in step 4

# The name of the folder created by the training script in your Colab session.
# IMPORTANT: This must match the 'OUTPUT_DIR' from your training script.
LOCAL_MODEL_DIR = "bart-english-news-summarizer"

# The destination folder in your Google Drive where models will be saved.
# You can change 'models' to any folder name you prefer.
DRIVE_DESTINATION_FOLDER = "/content/drive/My Drive/models"

# The full path where the final model will reside in Google Drive.
FULL_DRIVE_PATH = os.path.join(DRIVE_DESTINATION_FOLDER, LOCAL_MODEL_DIR)

print(f"\nSource (Local Colab): ./{LOCAL_MODEL_DIR}")
print(f"Destination (Google Drive): {FULL_DRIVE_PATH}")

# Refuse to continue if there is nothing to copy, rather than reporting a
# successful copy of a folder that does not exist.
if not os.path.isdir(LOCAL_MODEL_DIR):
    raise FileNotFoundError(
        f"Local model folder './{LOCAL_MODEL_DIR}' not found. "
        "Run the training cell first."
    )


# --- 3. Create Destination Folder in Drive (if it doesn't exist) ---
# This prevents errors if the 'models' folder is not already there.
print(f"\nEnsuring destination folder '{DRIVE_DESTINATION_FOLDER}' exists...")
os.makedirs(DRIVE_DESTINATION_FOLDER, exist_ok=True)
print("✅ Destination folder is ready.")


# --- 4. Copy the Model Folder to Google Drive ---
# shutil.copytree copies the directory recursively and raises on failure, so
# the success message below is only printed when the copy really happened
# (a shell 'cp' run through os.system fails silently). dirs_exist_ok=True
# lets a re-run overwrite a previous copy, and no shell quoting is needed for
# the space in "My Drive".
print(f"\nStarting copy of '{LOCAL_MODEL_DIR}' to Google Drive. This may take a few minutes...")
start_time = time.time()

shutil.copytree(LOCAL_MODEL_DIR, FULL_DRIVE_PATH, dirs_exist_ok=True)

end_time = time.time()
print(f"✅ Copy operation finished in {end_time - start_time:.2f} seconds.")


# --- 5. Verify the Copy ---
# List the contents of the new directory in Google Drive to confirm success.
# The listing is done in Python rather than with a shell command so that a
# missing or empty destination raises an error instead of being followed by
# an unconditional success message.
print(f"\nVerifying contents at '{FULL_DRIVE_PATH}'...")
try:
    with os.scandir(FULL_DRIVE_PATH) as scan:
        copied_entries = sorted(scan, key=lambda entry: entry.name)
except OSError as e:
    print(f"❌ Verification failed. Could not list contents of the destination folder: {e}")
    raise

if not copied_entries:
    raise RuntimeError(
        f"Verification failed: '{FULL_DRIVE_PATH}' exists but contains no files."
    )

# One line per entry: size in bytes, then name.
for entry in copied_entries:
    print(f"{entry.stat().st_size:>14,d}  {entry.name}")
print("\n--- Process complete! Your model is now saved in Google Drive. ---")

--- Starting the process to save the model to Google Drive ---
Mounted at /content/drive
✅ Google Drive mounted successfully.

Source (Local Colab): ./bart-english-news-summarizer
Destination (Google Drive): /content/drive/My Drive/models/bart-english-news-summarizer

Ensuring destination folder '/content/drive/My Drive/models' exists...
✅ Destination folder is ready.

Starting copy of 'bart-english-news-summarizer' to Google Drive. This may take a few minutes...
✅ Copy operation finished in 40.00 seconds.

Verifying contents at '/content/drive/My Drive/models/bart-english-news-summarizer'...
total 1592039
-rw------- 1 root root       1591 Aug  5 08:38 config.json
-rw------- 1 root root        358 Aug  5 08:38 generation_config.json
-rw------- 1 root root     456318 Aug  5 08:38 merges.txt
-rw------- 1 root root 1625422896 Aug  5 08:38 model.safetensors
-rw------- 1 root root        279 Aug  5 08:38 special_tokens_map.json
-rw------- 1 root root       1270 Aug  5 08:38 tokenizer_config

In [None]:
import shutil

# Folder to archive and the path of the zip file to produce
folder_path = '/content/bart-english-news-summarizer'
zip_path = '/content/bart-english-news-summarizer.zip'

# make_archive is given the target path without its ".zip" extension. As the
# last expression of the cell, the returned archive path is displayed.
archive_base = zip_path.replace('.zip', '')
shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_path)


'/content/bart-english-news-summarizer.zip'

In [None]:
from google.colab import files

# Download the zipped model archive. `zip_path` is defined in the preceding
# zip cell, so that cell must have been run first.
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>