In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets transformers adapters sklearn torch

In [None]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
import gc
from multiprocessing import Pool, cpu_count
from datasets import load_dataset
import fasttext

# Initialize Spark: get or create the session used by build_corpus, with the
# "spark.executor.memory" setting at 8g.
spark = SparkSession.builder.appName("IndicCorpus").config("spark.executor.memory", "8g").getOrCreate()

# Target languages; rows whose language is not in this list are dropped when
# the corpus is balanced.
# NOTE(review): "Sindhi" is listed here, but the OSCAR and Samanantar language
# maps in this cell have no Sindhi entry — confirm where Sindhi data comes from.
LANGUAGES = ["Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada", "Telugu", "Malayalam", "Punjabi", "Odia", "Assamese"]
TARGET_PER_LANG = 500000  # ~6M total, adjustable
MIN_WORDS = 50  # minimum whitespace-separated word count for a text to be kept

# FastText for filtering: language-identification model used by detect_language.
# The path is relative, so the model file must be in the working directory.
ft_model = fasttext.load_model('lid.176.bin')

def filter_text_length(text):
    """Return True when *text* is a string with at least MIN_WORDS words.

    Words are counted by splitting on whitespace. Any non-string value
    (None, numbers, lists, ...) is rejected.
    """
    if not isinstance(text, str):
        return False
    word_count = len(text.split())
    return word_count >= MIN_WORDS

def preprocess_image(image):
    """Binarize a page image so it is easier to OCR.

    Args:
        image: A PIL image (or anything ``np.array`` can turn into an
            H x W, H x W x 3 or H x W x 4 pixel array).

    Returns:
        A PIL image holding the Otsu-thresholded, single-channel page.
    """
    pixels = np.array(image)
    if pixels.ndim == 2:
        # Already single-channel; cvtColor would reject it.
        gray = pixels
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        # PIL stores channels as RGB, not OpenCV's BGR. Using the BGR
        # conversion would swap the red and blue luminance weights.
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is computed from the histogram; the
    # 150 passed here is ignored by OpenCV.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # A 1x1 kernel leaves the image unchanged; kept as a tuning hook.
    kernel = np.ones((1, 1), np.uint8)
    return Image.fromarray(cv2.dilate(binary, kernel, iterations=1))

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, falling back to OCR when it has no text layer.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A ``(text, hint)`` tuple. ``text`` is the stripped document text and
        ``hint`` is the part of the file name before the first underscore
        (used by the caller as a language hint). On any failure the error is
        printed and ``("", "")`` is returned so one bad file does not abort
        the whole batch.
    """
    # Tesseract language codes for the target languages plus English and Urdu.
    # "chi" is not a Tesseract code (Chinese is chi_sim / chi_tra) and made
    # every OCR call fail; Gujarati and Sindhi are targets and were missing.
    # Each code needs its traineddata installed.
    ocr_langs = "hin+mar+snd+guj+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd"
    try:
        print(f"Processing PDF: {pdf_path}")
        text = extract_text(pdf_path)
        if not text.strip():
            # No embedded text: rasterize the first 50 pages and OCR them.
            images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=50)
            pages = []
            for img in images:
                page_text = pytesseract.image_to_string(
                    preprocess_image(img), lang=ocr_langs, config="--psm 6 --oem 1"
                )
                # Drop NUL characters and cap each page at 100000 characters.
                pages.append(page_text.replace('\0', '')[:100000] + "\n")
            text = "".join(pages)
            gc.collect()
        return text.strip(), os.path.basename(pdf_path).split('_')[0]
    except Exception as e:
        print(f"Error: {pdf_path}: {e}")
        return "", ""

def detect_language(text):
    """Map fastText's top language prediction for *text* to a corpus language.

    Args:
        text: The text to classify; it may span several lines.

    Returns:
        The language name (e.g. ``"Hindi"``), or ``None`` when the text is
        blank or the predicted language is not one of the targets.
    """
    # fastText's predict() raises on text containing newlines, and paragraphs
    # split on blank lines can still hold single newlines. Collapse all
    # whitespace into single spaces first.
    single_line = " ".join(text.split())
    if not single_line:
        return None
    labels, _ = ft_model.predict(single_line)
    if not labels:
        return None
    code = labels[0].replace('__label__', '')
    code_to_name = {
        'hi': 'Hindi', 'mr': 'Marathi', 'sd': 'Sindhi', 'gu': 'Gujarati',
        'bn': 'Bengali', 'ta': 'Tamil', 'kn': 'Kannada', 'te': 'Telugu',
        'ml': 'Malayalam', 'pa': 'Punjabi', 'or': 'Odia', 'as': 'Assamese',
    }
    return code_to_name.get(code)

def process_pdf(pdf_path):
    """Turn one PDF into a list of ``{"text", "language"}`` paragraph records.

    Paragraphs are separated by blank lines and kept only if they pass
    filter_text_length. The file-name hint is used as the language when it is
    one of LANGUAGES; otherwise each paragraph is classified individually.
    Returns an empty list when no text could be extracted.
    """
    text, hinted_lang = extract_text_from_pdf(pdf_path)
    if not text:
        return []

    trust_hint = hinted_lang in LANGUAGES
    records = []
    for raw in text.split("\n\n"):
        para = raw.strip()
        if not para or not filter_text_length(para):
            continue
        language = hinted_lang if trust_hint else detect_language(para)
        records.append({"text": para, "language": language})
    return records

def process_pdfs_parallel(pdf_dir="/kaggle/input/meta-folder"):
    """Process every ``*.pdf`` one level below *pdf_dir* in a worker pool.

    Each immediate sub-directory of *pdf_dir* is scanned for files ending in
    ".pdf". The paragraph records from all files are written to
    /kaggle/working/temp_pdf.parquet and the number of records is returned.
    """
    pdf_files = []
    for lang_folder in os.listdir(pdf_dir):
        lang_path = os.path.join(pdf_dir, lang_folder)
        if not os.path.isdir(lang_path):
            continue
        for entry in os.listdir(lang_path):
            if entry.endswith(".pdf"):
                pdf_files.append(os.path.join(lang_path, entry))

    # One worker per CPU; each worker returns a list of records per PDF.
    with Pool(cpu_count()) as pool:
        results = pool.map(process_pdf, pdf_files)

    rows = []
    for per_file in results:
        rows.extend(per_file)

    df = pd.DataFrame(rows)
    df.to_parquet("/kaggle/working/temp_pdf.parquet", compression='gzip')
    return len(df)

def load_oscar():
    """Collect long-enough OSCAR documents for each target language.

    For every language code the "unshuffled_deduplicated_<code>" train split
    is loaded; texts passing filter_text_length are stripped of NUL
    characters and capped at 100000 characters. A load failure for one
    language is printed and the remaining languages are still processed.
    At most 3000000 randomly sampled rows are written to
    /kaggle/working/temp_oscar.parquet; the row count is returned.
    """
    oscar_langs = {"hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam", "pa": "Punjabi", "or": "Odia", "as": "Assamese"}
    collected = []
    for code, lang in oscar_langs.items():
        try:
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            for item in dataset:
                raw = item['text']
                if filter_text_length(raw):
                    collected.append({"text": raw.replace('\0', '')[:100000], "language": lang})
        except Exception as e:
            print(f"Error OSCAR {lang}: {e}")

    sample_size = min(3000000, len(collected))
    df = pd.DataFrame(collected).sample(sample_size)
    df.to_parquet("/kaggle/working/temp_oscar.parquet", compression='gzip')
    return len(df)

def load_samanantar():
    """Collect long-enough Samanantar texts for each target language.

    Every row of the train split is checked for each language code; a value
    is kept when it is non-empty and passes filter_text_length. Kept texts
    are stripped of NUL characters and capped at 100000 characters. At most
    2000000 randomly sampled rows are written to
    /kaggle/working/temp_samanantar.parquet; the row count is returned.
    """
    # NOTE(review): assumes each row is a dict keyed by these language codes — confirm against the dataset schema.
    samanantar_langs = {"hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam", "pa": "Punjabi", "or": "Odia", "as": "Assamese"}
    dataset = load_dataset("togethercomputer/samanantar", split="train", trust_remote_code=True)

    samanantar_data = []
    for row in dataset:
        for lang_code, lang_name in samanantar_langs.items():
            if not row.get(lang_code):
                continue
            if not filter_text_length(row[lang_code]):
                continue
            cleaned = row[lang_code].replace('\0', '')[:100000]
            samanantar_data.append({"text": cleaned, "language": lang_name})

    sample_size = min(2000000, len(samanantar_data))
    df = pd.DataFrame(samanantar_data).sample(sample_size)
    df.to_parquet("/kaggle/working/temp_samanantar.parquet", compression='gzip')
    return len(df)

def build_corpus(pdf_dir="/kaggle/input/meta-folder", output_file="/kaggle/working/indic_corpus.parquet"):
    """Build (or extend) the deduplicated, per-language-capped corpus.

    Steps:
      1. Regenerate the three temporary parquet files (PDF, OSCAR, Samanantar).
      2. Merge them with the existing corpus at *output_file*, if any, and
         drop rows with duplicate text.
      3. Keep only rows whose language is in LANGUAGES and at most
         TARGET_PER_LANG randomly chosen rows per language.
      4. Write the result, print the saved counts, and delete the temp files.

    Args:
        pdf_dir: Directory whose sub-directories contain the source PDFs.
        output_file: Local path of the Spark parquet output.
    """
    import shutil

    from pyspark.sql import Window

    temp_files = ["/kaggle/working/temp_pdf.parquet", "/kaggle/working/temp_oscar.parquet", "/kaggle/working/temp_samanantar.parquet"]
    for f in temp_files:
        if os.path.exists(f):
            os.remove(f)

    process_pdfs_parallel(pdf_dir)
    load_oscar()
    load_samanantar()

    columns = ["text", "language"]
    # Select the two data columns explicitly: pandas can write its index as
    # an extra parquet column, which would break a positional union with the
    # two-column existing corpus.
    combined_df = spark.read.parquet(*temp_files).select(*columns)

    # Merge existing corpus if it exists
    if os.path.exists(output_file):
        existing_df = spark.read.parquet(output_file).select(*columns)
        combined_df = existing_df.unionByName(combined_df)
    combined_df = combined_df.dropDuplicates(["text"])

    # Balance: rank rows randomly within each language and keep the first
    # TARGET_PER_LANG of each. A plain orderBy(rand()).limit(...) would cap
    # the whole corpus instead of each language.
    per_language = Window.partitionBy("language").orderBy(F.rand())
    balanced_df = (
        combined_df.filter(F.col("language").isin(LANGUAGES))
        .withColumn("_rank", F.row_number().over(per_language))
        .filter(F.col("_rank") <= TARGET_PER_LANG)
        .select(*columns)
    )

    # Spark reads lazily, so the old corpus is still an input of this job.
    # Write to a staging path first and swap it in once the job is finished,
    # rather than overwriting a path that is being read.
    staging_path = f"{output_file}.staging"
    balanced_df.write.parquet(staging_path, mode="overwrite", compression="gzip")
    if os.path.isdir(output_file):
        shutil.rmtree(output_file)
    elif os.path.exists(output_file):
        os.remove(output_file)
    shutil.move(staging_path, output_file)

    # Report from the saved data: re-evaluating balanced_df would re-run the
    # random sampling and could report numbers that differ from what is on disk.
    dist = spark.read.parquet(output_file).groupBy("language").count().collect()
    total_samples = sum(row["count"] for row in dist)
    print(f"Saved {total_samples} samples to {output_file}")
    for row in dist:
        print(f"{row['language']}: {row['count']}")

    # Clean up
    for f in temp_files:
        if os.path.exists(f):
            os.remove(f)

# Entry point: build the corpus with the default input and output paths.
if __name__ == "__main__":
    build_corpus()

In [None]:
# corpus_chunking.py
from datasets import load_dataset
import pandas as pd

# Target language names; not referenced by the functions in this cell.
LANGUAGES = ["Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada", "Telugu", "Malayalam", "Punjabi", "Odia", "Assamese"]
MIN_WORDS = 50  # minimum whitespace-separated word count for a text to be kept
CHUNK_SIZE = 100000  # 100K samples per chunk

def filter_text_length(text):
    """Tell whether *text* is a string of at least MIN_WORDS words.

    The word count is the number of whitespace-separated tokens. Values that
    are not strings are always rejected.
    """
    if isinstance(text, str):
        return len(text.split()) >= MIN_WORDS
    return False

def load_oscar_chunk(start_idx, chunk_size=CHUNK_SIZE):
    """Load one window of OSCAR documents across the target languages.

    For each language, rows ``start_idx`` to ``start_idx + chunk_size`` of the
    train split are read; texts passing filter_text_length are stripped of
    NUL characters and capped at 100000 characters. Languages are visited in
    dictionary order and the loop stops once ``chunk_size`` records are
    collected, so later languages may be left out of a chunk. A failure for
    one language is printed and the next language is tried.

    Args:
        start_idx: Index of the first row to read in each language's split.
        chunk_size: Maximum number of rows read per language and returned.

    Returns:
        A shuffled DataFrame with "text" and "language" columns (empty, but
        with those columns, when nothing was collected).
    """
    oscar_langs = {"hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam", "pa": "Punjabi", "or": "Odia", "as": "Assamese"}
    oscar_data = []
    for code, lang in oscar_langs.items():
        try:
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            end_idx = min(start_idx + chunk_size, len(dataset))
            # Slicing a datasets.Dataset returns a dict of columns, not a
            # list of rows, so read the "text" column of the slice directly.
            texts = dataset[start_idx:end_idx]["text"] if start_idx < end_idx else []
            kept = [
                {"text": text.replace('\0', '')[:100000], "language": lang}
                for text in texts
                if filter_text_length(text)
            ]
            oscar_data.extend(kept)
            print(f"OSCAR {lang}: {len(kept)} samples")
            if len(oscar_data) >= chunk_size:
                break
        except Exception as e:
            print(f"Error OSCAR {lang}: {e}")
    if not oscar_data:
        # Keep the schema so callers can still refer to the "text" column.
        return pd.DataFrame(columns=["text", "language"])
    return pd.DataFrame(oscar_data).sample(min(chunk_size, len(oscar_data)))

def load_samanantar_chunk(start_idx, chunk_size=CHUNK_SIZE):
    """Load one window of Samanantar rows and extract target-language texts.

    Rows ``start_idx`` to ``start_idx + chunk_size`` of the train split are
    read. For each row and language code, a non-empty value passing
    filter_text_length is kept, stripped of NUL characters and capped at
    100000 characters. Any error is printed and an empty frame is returned.

    Args:
        start_idx: Index of the first row to read.
        chunk_size: Maximum number of rows read and of records returned.

    Returns:
        A shuffled DataFrame with "text" and "language" columns (empty, but
        with those columns, when nothing was collected).
    """
    # NOTE(review): assumes each row is a dict keyed by these language codes — confirm against the dataset schema.
    samanantar_langs = {"hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam", "pa": "Punjabi", "or": "Odia", "as": "Assamese"}
    empty = pd.DataFrame(columns=["text", "language"])
    try:
        dataset = load_dataset("togethercomputer/samanantar", split="train", trust_remote_code=True)
        end_idx = min(start_idx + chunk_size, len(dataset))
        # dataset[a:b] would return a dict of columns; select() keeps a
        # Dataset whose iteration yields one dict per row.
        rows = dataset.select(range(start_idx, end_idx)) if start_idx < end_idx else []
        samanantar_data = [
            {"text": row[lang_code].replace('\0', '')[:100000], "language": lang_name}
            for row in rows
            for lang_code, lang_name in samanantar_langs.items()
            if row.get(lang_code) and filter_text_length(row[lang_code])
        ]
        print(f"Samanantar chunk: {len(samanantar_data)} samples")
    except Exception as e:
        print(f"Error Samanantar: {e}")
        return empty
    if not samanantar_data:
        return empty
    return pd.DataFrame(samanantar_data).sample(min(chunk_size, len(samanantar_data)))

def get_chunk(chunk_idx, output_file="/kaggle/working/chunk.parquet"):
    """Build chunk number *chunk_idx* from OSCAR and Samanantar and save it.

    Both sources are read starting at ``chunk_idx * CHUNK_SIZE``; the results
    are concatenated, deduplicated on "text", shuffled and written to
    *output_file* as gzip-compressed parquet.

    Args:
        chunk_idx: Zero-based chunk number.
        output_file: Destination parquet path (overwritten on every call).

    Returns:
        The combined DataFrame with "text" and "language" columns.
    """
    start_idx = chunk_idx * CHUNK_SIZE
    oscar_df = load_oscar_chunk(start_idx)
    samanantar_df = load_samanantar_chunk(start_idx)

    # A loader can hand back an empty, column-less frame. Concatenating only
    # such frames leaves no "text" column and drop_duplicates would raise
    # KeyError, so skip empty frames and fall back to an empty schema.
    frames = [frame for frame in (oscar_df, samanantar_df) if not frame.empty]
    if frames:
        combined_df = (
            pd.concat(frames, ignore_index=True)
            .drop_duplicates(subset=["text"])
            .sample(frac=1)
            .reset_index(drop=True)
        )
    else:
        combined_df = pd.DataFrame(columns=["text", "language"])

    combined_df.to_parquet(output_file, compression="gzip")
    print(f"Chunk {chunk_idx} saved with {len(combined_df)} samples")
    return combined_df

# Entry point: build chunks 0-2 as a smoke test. Every call writes to the same
# default output path, so only the last chunk remains on disk.
if __name__ == "__main__":
    for i in range(3):  # Test 3 chunks
        get_chunk(i)

In [None]:
# pretraining.py
from transformers import XLMRobertaForMaskedLM, XLMRobertaTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd
import os

def pretrain_xlmr(chunk_file="/kaggle/working/chunk.parquet", model_dir="/kaggle/working/xlmr_pretrained"):
    """Continue masked-language-model pre-training of XLM-R on one chunk.

    Training resumes from *model_dir* when a saved model is found there and
    otherwise starts from "xlm-roberta-base". The model and tokenizer are
    saved back to *model_dir* afterwards, so repeated calls on successive
    chunks accumulate training.

    Args:
        chunk_file: Parquet file with a "text" column.
        model_dir: Directory holding (and receiving) the model and tokenizer.
    """
    # The Trainer also writes checkpoints under model_dir, so the directory
    # can exist without a saved model at its root. Resume only when the
    # config file written by save_pretrained is present.
    has_saved_model = os.path.isfile(os.path.join(model_dir, "config.json"))
    source = model_dir if has_saved_model else "xlm-roberta-base"
    tokenizer = XLMRobertaTokenizer.from_pretrained(source)
    model = XLMRobertaForMaskedLM.from_pretrained(source)

    # Load chunk
    df = pd.read_parquet(chunk_file)
    raw_dataset = Dataset.from_pandas(df[["text"]], preserve_index=False)

    def tokenize(batch):
        # No padding here: the MLM collator pads each batch to its own
        # longest sequence, rather than padding the whole chunk to one
        # length and holding it in memory as tensors.
        return tokenizer(batch["text"], truncation=True, max_length=512)

    dataset = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    # Training args
    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=not os.path.exists(model_dir),
        per_device_train_batch_size=16,
        num_train_epochs=1,  # 1 epoch per chunk
        save_steps=10000,
        logging_steps=100,
        learning_rate=1e-5,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    trainer.train()
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    print(f"Pre-trained on chunk, saved to {model_dir}")

# Entry point: run one pre-training pass on the default chunk file.
if __name__ == "__main__":
    pretrain_xlmr()

In [None]:
# finetuning.py
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, Trainer, TrainingArguments
from transformers.adapters import AdapterConfig
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import os

# Language names; each one is used as an adapter name during fine-tuning.
LANGUAGES = ["Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada", "Telugu", "Malayalam", "Punjabi", "Odia", "Assamese"]

def _build_adapter_model(pretrained_dir, num_labels):
    """Load a fresh classifier and attach the language and LoRA adapters."""
    model = XLMRobertaForSequenceClassification.from_pretrained(pretrained_dir, num_labels=num_labels)
    adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=16)
    for lang in LANGUAGES:
        if not model.has_adapter(lang):
            model.add_adapter(lang, config=adapter_config)
    if not model.has_adapter("lora"):
        model.add_adapter("lora", config=AdapterConfig.load("lora", r=8, alpha=16))
    # train_adapter() freezes the base weights and unfreezes the listed
    # adapters. Calling freeze_model() afterwards would freeze the adapter
    # weights again, so it is not called here.
    # NOTE(review): all language adapters are activated together for every
    # input — confirm this is the intended setup.
    model.train_adapter(["lora"] + LANGUAGES)
    return model


def _classification_metrics(pred):
    """Compute accuracy and weighted F1 from a Trainer prediction object."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(labels, preds), "f1": f1_score(labels, preds, average="weighted")}


def finetune_xlmr(pretrained_dir="/kaggle/working/xlmr_pretrained", output_base_dir="/kaggle/working/finetuned"):
    """Fine-tune adapters on each configured classification task.

    For every task a fresh model is loaded from *pretrained_dir*, adapters
    are added, the model is trained for one epoch and evaluated, the
    accuracy and weighted F1 are printed, and the model and tokenizer are
    saved to ``f"{output_base_dir}_{task}"``.

    Args:
        pretrained_dir: Directory with the pre-trained model and tokenizer.
        output_base_dir: Prefix of the per-task output directories.
    """
    from transformers import DataCollatorWithPadding

    tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_dir)
    # Pad each training batch to its own longest sequence. Padding inside a
    # batched map() pads per map batch, leaving rows of different lengths
    # that the default collator cannot stack.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # IndicGLUE tasks
    # NOTE(review): confirm the dataset id and config names below exist on
    # the Hub and expose "text"/"label" columns and a "validation" split.
    indicglue_tasks = {
        "innews": {"num_labels": 3, "desc": "News Genre Classification"},
        "sentiment": {"num_labels": 2, "desc": "Sentiment Analysis"}
    }

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, max_length=512)

    for task, config in indicglue_tasks.items():
        # Always start from the pre-trained weights so no task inherits
        # adapter or head weights trained on a previous task.
        model = _build_adapter_model(pretrained_dir, config["num_labels"])

        dataset = load_dataset("ai4bharat/indicglue", task)
        train_data = dataset["train"].map(tokenize, batched=True)
        eval_data = dataset["validation"].map(tokenize, batched=True)

        train_data.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        eval_data.set_format("torch", columns=["input_ids", "attention_mask", "label"])

        task_dir = f"{output_base_dir}_{task}"
        training_args = TrainingArguments(
            output_dir=task_dir,
            overwrite_output_dir=not os.path.exists(task_dir),
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=1,  # 1 epoch per chunk
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            learning_rate=2e-5,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_data,
            data_collator=data_collator,
            compute_metrics=_classification_metrics,
        )

        trainer.train()
        eval_results = trainer.evaluate()
        print(f"Task: {config['desc']} ({task})")
        print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
        print(f"F1 Score: {eval_results['eval_f1']:.4f}")

        model.save_pretrained(task_dir)
        tokenizer.save_pretrained(task_dir)

# Entry point: fine-tune on all configured tasks with the default paths.
if __name__ == "__main__":
    finetune_xlmr()