In [3]:
# All necessary imports
!pip install sacrebleu
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)
import sacrebleu
import os

# Preprocessing function
def preprocess_data(dataset, tokenizer, max_length=128):
    def tokenize_function(examples):
        inputs = examples['ur']
        targets = examples['en']
        model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding='max_length')
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_length, truncation=True, padding='max_length')
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    return dataset.map(tokenize_function, batched=True, remove_columns=['ur', 'en'])

# Evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels]).score
    chrf = sacrebleu.corpus_chrf(decoded_preds, [decoded_labels]).score
    return {"bleu": bleu, "chrf": chrf}

# Training function
def train_on_chunk(file_path, start_idx, chunk_size, num_epochs, resume_from=None):
    global tokenizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n=== Training chunk {start_idx}-{start_idx + chunk_size} ===")

    # Model setup
    model_name = "facebook/m2m100_418M"
    output_dir = f"/kaggle/working/urdu_en_{start_idx}_to_{start_idx + chunk_size}"

    if resume_from and os.path.exists(resume_from):
        print(f"Resuming from {resume_from}")
        tokenizer = M2M100Tokenizer.from_pretrained(resume_from)
        model = M2M100ForConditionalGeneration.from_pretrained(resume_from).to(device)
    else:
        print("Initializing new model")
        tokenizer = M2M100Tokenizer.from_pretrained(model_name, src_lang="ur", tgt_lang="en")
        model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

    # Load full data
    df = pd.read_csv(file_path)

    # Slice chunk and filter invalid rows carefully
    chunk_df = df.iloc[start_idx:start_idx + chunk_size]

    # Drop rows with NaNs in key columns
    chunk_df = chunk_df.dropna(subset=['Urdu Transalation', 'Sentence'])

    # Filter out empty or non-string rows
    chunk_df = chunk_df[
        chunk_df['Urdu Transalation'].apply(lambda x: isinstance(x, str) and x.strip() != "") &
        chunk_df['Sentence'].apply(lambda x: isinstance(x, str) and x.strip() != "")
    ]

    print(f"Rows in chunk after filtering: {len(chunk_df)}")
    if len(chunk_df) == 0:
        print(f"No valid data in chunk {start_idx}-{start_idx + chunk_size}, skipping training.")
        return resume_from  # Just return last checkpoint path

    # Prepare dataset with renamed columns expected by preprocess_data
    chunk_df = chunk_df.rename(columns={'Urdu Transalation': 'ur', 'Sentence': 'en'})
    dataset = Dataset.from_pandas(chunk_df[['ur', 'en']].reset_index(drop=True))

    # Train/val split
    train_val = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = preprocess_data(train_val["train"], tokenizer)
    eval_dataset = preprocess_data(train_val["test"], tokenizer)

    # Training arguments with progress tracking
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model="bleu",
        greater_is_better=True,
        logging_strategy="steps",
        logging_steps=50,
        report_to="none",
        disable_tqdm=False,
        log_level="info",
        predict_with_generate=True,
        remove_unused_columns=False  # Important to avoid earlier errors
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Starting training...")
    trainer.train(resume_from_checkpoint=resume_from)

    # Save model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"\n✅ Saved checkpoint: {output_dir}")
    return output_dir



# ==== Run training on first chunk ====
file_path = "/kaggle/input/d/bilalnadeem614/nlp-project-dataset/DATA_SET_FOR_NLP_PROJECT.csv"
chunk_size = 20000
resume = None

resume = train_on_chunk(file_path, start_idx=0, chunk_size=chunk_size, num_epochs=3, resume_from=resume)

# Uncomment to run second chunk
# resume = train_on_chunk(file_path, start_idx=20000, chunk_size=chunk_size, num_epochs=3, resume_from=resume)

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1

=== Training chunk 0-20000 ===
Initializing new model


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

  df = pd.read_csv(file_path)


Rows in chunk after filtering: 20000


Map:   0%|          | 0/18000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
Using auto half precision backend
***** Running training *****
  Num examples = 18,000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 3,375
  Number of trainable parameters = 483,905,536


Starting training...


Epoch,Training Loss,Validation Loss,Bleu,Chrf
1,0.0967,0.105312,43.759828,59.255897
2,0.079,0.09742,45.925595,60.87715
3,0.0475,0.094969,47.177853,62.121702



***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1
}

Saving model checkpoint to /kaggle/working/urdu_en_0_to_20000/checkpoint-1125
Configuration saved in /kaggle/working/urdu_en_0_to_20000/checkpoint-1125/config.json
Configuration saved in /kaggle/working/urdu_en_0_to_20000/checkpoint-1125/generation_config.json
Model weights saved in /kaggle/working/urdu_en_0_to_20000/checkpoint-1125/model.safetensors
tokenizer config file saved in /kaggle/working/urdu_en_0_to_20000/checkpoint-1125/tokenizer_config.json
Special tokens file saved in /kaggle/working/urdu_en_0_to_20000/checkpoint-1125/special_tokens_map.json
added tokens file saved in /kaggle/working/urdu_en_0_to_20000/checkpoint-1125/added_tokens.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
Savin


✅ Saved checkpoint: /kaggle/working/urdu_en_0_to_20000


In [4]:
import shutil
import os

model_dir = "/kaggle/working/urdu_en_0_to_20000/"

# Delete all checkpoint folders
for sub in os.listdir(model_dir):
    if sub.startswith("checkpoint-"):
        shutil.rmtree(os.path.join(model_dir, sub))

# Delete TrainerState and training arguments (optional)
for fname in ["trainer_state.json", "training_args.bin", "all_results.json"]:
    fpath = os.path.join(model_dir, fname)
    if os.path.exists(fpath):
        os.remove(fpath)

print("✅ Cleaned up unnecessary files.")


✅ Cleaned up unnecessary files.


In [8]:
!zip -r /kaggle/working/urdu_en_0_to_20000.zip /kaggle/working/urdu_en_0_to_20000


  adding: kaggle/working/urdu_en_0_to_20000/ (stored 0%)
  adding: kaggle/working/urdu_en_0_to_20000/sentencepiece.bpe.model (deflated 50%)
  adding: kaggle/working/urdu_en_0_to_20000/generation_config.json (deflated 35%)
  adding: kaggle/working/urdu_en_0_to_20000/config.json (deflated 57%)
  adding: kaggle/working/urdu_en_0_to_20000/special_tokens_map.json (deflated 79%)
  adding: kaggle/working/urdu_en_0_to_20000/model.safetensors (deflated 7%)
  adding: kaggle/working/urdu_en_0_to_20000/tokenizer_config.json (deflated 94%)
  adding: kaggle/working/urdu_en_0_to_20000/added_tokens.json (deflated 76%)
  adding: kaggle/working/urdu_en_0_to_20000/vocab.json (deflated 71%)


In [None]:
import os
from IPython.display import FileLink

model_dir = "/kaggle/working/urdu_en_0_to_20000/"
for fname in os.listdir(model_dir):
    full_path = os.path.join(model_dir, fname)
    if os.path.isfile(full_path):
        display(FileLink(full_path))


In [14]:
# dataset_metadata = """
# {
#   "title": "urdu_en_0_to_20000_model",
#   "id": "your-kaggle-username/urdu-en-0-to-20000-model",
#   "licenses": [
#     {
#       "name": "CC0-1.0"
#     }
#   ]
# }
# """

# with open('/kaggle/working/dataset-metadata.json', 'w') as f:
#     f.write(dataset_metadata)

!kaggle datasets create \
  -p /kaggle/working \
  -u \
  -r zip \
  -m /kaggle/working/dataset-metadata.json


Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.11/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 433, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [None]:
!rm -rf /kaggle/working/urdu_en_0_to_20000.zip

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load model and tokenizer
model_dir = "/kaggle/working/urdu_en_0_to_20000"  # Adjust if saved elsewhere
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Urdu test sentences
urdu_sentences = [
    "آپ کیسے ہیں؟",
    "مجھے آج بہت کام کرنا ہے۔",
    "کیا تم میری مدد کر سکتے ہو؟",
    "آج کا موسم بہت خوبصورت ہے۔",
    "میں کتاب پڑھ رہا ہوں۔"
]

# Translation loop
for sentence in urdu_sentences:
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    translated_tokens = model.generate(
        **inputs,
        max_length=200,
        num_beams=5,
        early_stopping=True
    )
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    print(f"🔸 Urdu: {sentence}\n🔹 English: {translated_text}\n")
