In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# --- Setup & downloads ---
!pip -q install gdown

import os, zipfile, glob
import gdown
import pandas as pd

# 1) Download the zip from Google Drive
zip_path = "/content/data.zip"
gdown.download(id="1VFbCisUiNGnjOdOyNR_s1nEw0guu5ZOt", output=zip_path, quiet=False)

# 2) Extract it
extract_dir = "/content/data_zip"
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, "r") as zf:
    zf.extractall(extract_dir)

# 3) Find split JSONL files
train_path = glob.glob(os.path.join(extract_dir, "**", "train.jsonl"), recursive=True)
val_path   = glob.glob(os.path.join(extract_dir, "**", "val.jsonl"), recursive=True)
test_path  = glob.glob(os.path.join(extract_dir, "**", "test.jsonl"), recursive=True)

if not (train_path and val_path and test_path):
    raise FileNotFoundError("train.jsonl, val.jsonl, or test.jsonl not found inside the zip.")

# 4) Load data
train_df = pd.read_json(train_path[0], lines=True)
val_df   = pd.read_json(val_path[0], lines=True)
test_df  = pd.read_json(test_path[0], lines=True)

print(f"Train set: {len(train_df)} rows")
print(f"Val set:   {len(val_df)} rows")
print(f"Test set:  {len(test_df)} rows")


Downloading...
From (original): https://drive.google.com/uc?id=1VFbCisUiNGnjOdOyNR_s1nEw0guu5ZOt
From (redirected): https://drive.google.com/uc?id=1VFbCisUiNGnjOdOyNR_s1nEw0guu5ZOt&confirm=t&uuid=f67e3943-c314-4e93-a7ae-f8eef30882e9
To: /content/data.zip
100%|██████████| 79.8M/79.8M [00:01<00:00, 45.2MB/s]


Train set: 47643 rows
Val set:   5955 rows
Test set:  5937 rows


In [4]:

import pandas as pd
import os
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# --- Ensure a clean start on the GPU ---
torch.cuda.empty_cache()

# --- Configuration ---

TEXT_COLUMN = "full_text_minimal"
SUMMARY_COLUMN = "Summary_minimal"
MODEL_NAME = "facebook/bart-large-cnn"
OUTPUT_DIR = "bart-english-news-summarizer"
# --------------------

# 1. Load your prepared English data
# For training
df_train = pd.read_json("/content/data_zip/train.jsonl", lines=True)
df_val = pd.read_json("/content/data_zip/val.jsonl", lines=True)
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)

# 2. Load the pre-trained BART model & tokenizer
print(f"Loading model and tokenizer for '{MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).cuda()


# 3. Preprocessing function
def preprocess(batch):
    inputs = tokenizer(
        batch[TEXT_COLUMN], max_length=1024, truncation=True, padding="max_length"
    )
    targets = tokenizer(
        batch[SUMMARY_COLUMN], max_length=128, truncation=True, padding="max_length"
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

# 4. Tokenize the entire dataset with multiprocessing
print("Tokenizing the dataset using multiple cores...")
tokenized_train = train_dataset.map(
    preprocess,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=train_dataset.column_names,
    desc="Running tokenizer on dataset"
)
tokenized_val = val_dataset.map(
    preprocess,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=val_dataset.column_names,
    desc="Running tokenizer on dataset"
)
print("✅ Dataset tokenized.")


# 5. Define Training Arguments for Maximum Stable Performance
training_args = Seq2SeqTrainingArguments(
    output_dir=f"./{OUTPUT_DIR}-checkpoints",


    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,
    learning_rate=3e-5,
    num_train_epochs=2,

    dataloader_num_workers=os.cpu_count(),

    dataloader_pin_memory=True,
    logging_steps=200,
    save_steps=2000,
    save_total_limit=1,
    report_to="none",
)

# 6. Initialize the Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# 7. Start Fine-Tuning!
print(f"\nStarting to fine-tune '{MODEL_NAME}' on your English dataset...")
trainer.train()
print("\n✅ Fine-tuning complete!")

# 8. Save the final model for easy use later
trainer.save_model(OUTPUT_DIR)
print(f"\n✅ Final fine-tuned model saved to: ./{OUTPUT_DIR}")

Loading model and tokenizer for 'facebook/bart-large-cnn'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Tokenizing the dataset using multiple cores...


Running tokenizer on dataset (num_proc=2):   0%|          | 0/47643 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=2):   0%|          | 0/5955 [00:00<?, ? examples/s]

✅ Dataset tokenized.

Starting to fine-tune 'facebook/bart-large-cnn' on your English dataset...


  trainer = Seq2SeqTrainer(


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Cell: Save Final Model to Google Drive

import os
from google.colab import drive
import time

print("--- Starting the process to save the model to Google Drive ---")

# --- 1. Mount Google Drive ---
try:
    drive.mount('/content/drive', force_remount=True)
    print("✅ Google Drive mounted successfully.")
except Exception as e:
    print(f"❌ Error mounting Google Drive: {e}")


# --- 2. Define File Paths ---
LOCAL_MODEL_DIR = "bart-english-news-summarizer"

DRIVE_DESTINATION_FOLDER = "/content/drive/My Drive/models"

FULL_DRIVE_PATH = os.path.join(DRIVE_DESTINATION_FOLDER, LOCAL_MODEL_DIR)

print(f"\nSource (Local Colab): ./{LOCAL_MODEL_DIR}")
print(f"Destination (Google Drive): {FULL_DRIVE_PATH}")


print(f"\nEnsuring destination folder '{DRIVE_DESTINATION_FOLDER}' exists...")
os.makedirs(DRIVE_DESTINATION_FOLDER, exist_ok=True)
print("✅ Destination folder is ready.")


print(f"\nStarting copy of '{LOCAL_MODEL_DIR}' to Google Drive. This may take a few minutes...")
start_time = time.time()

os.system(f"cp -rf {LOCAL_MODEL_DIR} \"{DRIVE_DESTINATION_FOLDER}\"")

end_time = time.time()
print(f"✅ Copy operation finished in {end_time - start_time:.2f} seconds.")


# --- 5. Verify the Copy ---
# List the contents of the new directory in Google Drive to confirm success.
print(f"\nVerifying contents at '{FULL_DRIVE_PATH}'...")
try:
    !ls -l "{FULL_DRIVE_PATH}"
    print("\n--- Process complete! Your model is now saved in Google Drive. ---")
except Exception as e:
    print(f"❌ Verification failed. Could not list contents of the destination folder: {e}")

--- Starting the process to save the model to Google Drive ---
Mounted at /content/drive
✅ Google Drive mounted successfully.

Source (Local Colab): ./bart-english-news-summarizer
Destination (Google Drive): /content/drive/My Drive/models/bart-english-news-summarizer

Ensuring destination folder '/content/drive/My Drive/models' exists...
✅ Destination folder is ready.

Starting copy of 'bart-english-news-summarizer' to Google Drive. This may take a few minutes...
✅ Copy operation finished in 40.00 seconds.

Verifying contents at '/content/drive/My Drive/models/bart-english-news-summarizer'...
total 1592039
-rw------- 1 root root       1591 Aug  5 08:38 config.json
-rw------- 1 root root        358 Aug  5 08:38 generation_config.json
-rw------- 1 root root     456318 Aug  5 08:38 merges.txt
-rw------- 1 root root 1625422896 Aug  5 08:38 model.safetensors
-rw------- 1 root root        279 Aug  5 08:38 special_tokens_map.json
-rw------- 1 root root       1270 Aug  5 08:38 tokenizer_config

In [None]:
import shutil

# Path to your folder
folder_path = '/content/bart-english-news-summarizer'
zip_path = '/content/bart-english-news-summarizer.zip'

# Create a zip file
shutil.make_archive(zip_path.replace('.zip', ''), 'zip', folder_path)


'/content/bart-english-news-summarizer.zip'

In [None]:
from google.colab import files

# Provide the path to the zipped file
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>