In [4]:
#Step 1 and 2
import fasttext

# Load the fastText pre-trained language detection model.
# The loaded object is kept in the module-level name `model`, which detect_language() reads.
# NOTE(review): absolute, machine-specific path - the file must exist at this location for the cell to run.
model = fasttext.load_model(r"T:\nlp\lid.176.bin")

def detect_language(text):
    """
    Detects the language of the given text using the module-level fastText ``model``.

    fastText's ``predict`` processes one line at a time and raises ``ValueError``
    when the input contains a newline, so line breaks are collapsed to single
    spaces before predicting. Text without line breaks is passed through unchanged.

    :param text: Input sentence (may span several lines)
    :return: ``(language, confidence)`` - the top predicted label with its
             ``__label__`` prefix removed, and the score reported for it
    """
    # "a\nb", "a\r\nb" and a trailing "\n" all become newline-free text here.
    single_line = " ".join(text.splitlines())
    prediction = model.predict(single_line)
    language = prediction[0][0].replace("__label__", "")  # Extract language code
    confidence = prediction[1][0]  # Extract confidence score
    return language, confidence

# Example usage
# Reads one sentence interactively, then keeps the detected code in `lang`
# (the next cell uses it as the low-resource input) and its score in `confidence`.
sentence = input("Enter the sentence")
lang, confidence = detect_language(sentence)
print(f"Detected Language: {lang}, Confidence: {confidence:.2f}")


Detected Language: or, Confidence: 0.99


In [5]:
import networkx as nx

# Load the graph from GML file.
# find_high_resource_match() reads the 'weight' attribute of edges in this graph.
# NOTE(review): absolute, machine-specific path.
G = nx.read_gml(r"T:\nlp\language_similarity_graph.gml")  # Replace with your actual filename

# Map language codes to graph node names.
# Keys are the short codes passed to find_high_resource_match(); values are the
# node names that function looks up in G.
language_map = {
    'bn': 'bengali',  # Bengali
    'or': 'odia',  # Odia
    'af': 'afrikaans',  # Afrikaans (replaced Konkani)
    'ms': 'malay',  # Malay (replaced Georgian)
    'ur': 'urdu',  # Urdu
    'en': 'english',  # English
    'hi': 'hindi',  # Hindi
    'fr': 'french',  # French
    'es': 'spanish',  # Spanish
    'de': 'german',  # German
    'ar': 'arabic'  # Arabic
}

# Candidate codes that find_high_resource_match() may return.
# Each entry is looked up in language_map, so it must also be a key there.
high_resource = ['en', 'hi', 'fr', 'es', 'de', 'ar']

def find_high_resource_match(language_code):
    """
    Return the high-resource language code whose edge to ``language_code`` in ``G``
    carries the largest 'weight'.

    :param language_code: A key of ``language_map``
    :return: One code from ``high_resource``, or ``None`` when the code is unknown
             (an error line is printed) or no weighted edge to any candidate exists.
             On equal weights the candidate listed first in ``high_resource`` wins.
    """
    if language_code not in language_map:
        print(f"Error: {language_code} not found in language map!")
        return None

    source_node = language_map[language_code]
    weighted_candidates = []

    for candidate_code in high_resource:
        candidate_node = language_map[candidate_code]

        # Both endpoints must be present in the graph before an edge lookup makes sense.
        if not (G.has_node(source_node) and G.has_node(candidate_node)):
            continue

        try:
            weighted_candidates.append((candidate_code, G[source_node][candidate_node]['weight']))
        except KeyError:
            # No edge between the two nodes, or the edge has no 'weight': skip this candidate.
            continue

    if not weighted_candidates:
        return None

    # Stable descending sort: the heaviest edge comes first, ties keep list order.
    weighted_candidates.sort(key=lambda pair: pair[1], reverse=True)
    best_code, _ = weighted_candidates[0]
    return best_code

# Example usage
# `lang` comes from the language-detection cell; swap in any low-resource code to try another one.
low_resource_input = lang
high_resource_output = find_high_resource_match(low_resource_input)

# Show only the matched language code, or a notice when nothing matched.
print(high_resource_output.lower() if high_resource_output else "No match found!")


hi


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from deep_translator import GoogleTranslator
from comet import download_model, load_from_checkpoint

# Set device: CUDA when torch reports it as available, otherwise CPU.
# The three Hugging Face models below are moved onto this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load NLLB (tokenizer + seq2seq model) - used by translate_nllb().
nllb_name = "facebook/nllb-200-distilled-600M"
# NOTE(review): use_fast=False requests the non-fast tokenizer; the reason is not visible here - confirm it is still needed.
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_name, use_fast=False)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_name).to(device)
# Alias: translate_nllb() calls this with an NLLB language token to get the id it
# passes to generate() as forced_bos_token_id.
lang_code_to_id = nllb_tokenizer.convert_tokens_to_ids

# Load M2M100 (tokenizer + seq2seq model) - used by translate_m2m100().
m2m_model_name = "facebook/m2m100_418M"
m2m_tokenizer = AutoTokenizer.from_pretrained(m2m_model_name)
m2m_model = AutoModelForSeq2SeqLM.from_pretrained(m2m_model_name).to(device)

# Load MBart50 (tokenizer + seq2seq model) - used by translate_mbart50().
mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = AutoTokenizer.from_pretrained(mbart_model_name)
mbart_model = AutoModelForSeq2SeqLM.from_pretrained(mbart_model_name).to(device)

# Load COMET - download the checkpoint, then load it for evaluate_comet(),
# which scores a translation from the source and MT text only (no reference).
# Unlike the models above, it is not moved to `device` here; evaluate_comet()
# passes a `gpus` count to predict() instead.
comet_model_path = download_model("Unbabel/wmt22-cometkiwi-da")
comet_model = load_from_checkpoint(comet_model_path)

# Language Code Mappings
# Both tables are keyed by the short two-letter codes used throughout this notebook.

# Short code -> NLLB language token. translate_nllb() assigns the source token to
# the tokenizer's src_lang and converts the target token to a forced BOS id.
google_to_nllb = {
    'bn': 'ben_Beng',
    'or': 'ory_Orya',
    'af': 'afr_Latn',
    'ms': 'zsm_Latn',
    'ur': 'urd_Arab',
    'en': 'eng_Latn',
    'hi': 'hin_Deva',
    'fr': 'fra_Latn',
    'es': 'spa_Latn',
    'de': 'deu_Latn',
    'ar': 'arb_Arab'
}

# Short code -> mBART-50 language code, used by translate_mbart50().
# NOTE(review): 'or_IN' and 'ms_MY' do not appear in the published mBART-50
# language list - confirm. If the tokenizer rejects them, translate_mbart50()
# catches the error and returns "" for Odia/Malay input.
google_to_mbart50 = {
    'bn': 'bn_IN',
    'or': 'or_IN',
    'af': 'af_ZA',
    'ms': 'ms_MY',
    'ur': 'ur_PK',
    'en': 'en_XX',
    'hi': 'hi_IN',
    'fr': 'fr_XX',
    'es': 'es_XX',
    'de': 'de_DE',
    'ar': 'ar_AR'
}

# --- Translators ---
def translate_nllb(text, src_lang_code, tgt_lang_code):
    """
    Translate ``text`` with the NLLB model loaded in this notebook.

    :param text: Text to translate
    :param src_lang_code: Source language, a key of ``google_to_nllb``
    :param tgt_lang_code: Target language, a key of ``google_to_nllb``
    :return: The decoded translation, or "" if anything raised (the error is printed)
    """
    try:
        # Resolve both codes first so an unknown language fails before the tokenizer is touched.
        nllb_src, nllb_tgt = google_to_nllb[src_lang_code], google_to_nllb[tgt_lang_code]
        nllb_tokenizer.src_lang = nllb_src
        target_token_id = lang_code_to_id(nllb_tgt)
        model_inputs = nllb_tokenizer(text, return_tensors="pt").to(device)
        generated = nllb_model.generate(
            **model_inputs,
            forced_bos_token_id=target_token_id,
            max_length=128,
        )
        decoded = nllb_tokenizer.batch_decode(generated, skip_special_tokens=True)
        return decoded[0]
    except Exception as e:
        print(f"NLLB failed: {e}")
        return ""

def translate_google(text, src_lang_code, tgt_lang_code):
    """
    Translate ``text`` through ``GoogleTranslator``.

    :param text: Text to translate
    :param src_lang_code: Source language code handed to the translator as ``source``
    :param tgt_lang_code: Target language code handed to the translator as ``target``
    :return: Whatever the translator returns, or "" if anything raised (the error is printed)
    """
    try:
        translator = GoogleTranslator(source=src_lang_code, target=tgt_lang_code)
        translated = translator.translate(text)
    except Exception as e:
        print(f"Google failed: {e}")
        return ""
    return translated

def translate_m2m100(text, src_lang_code, tgt_lang_code):
    """
    Translate ``text`` with the M2M100 model loaded in this notebook.

    The language codes are used as given: the source code is assigned to the
    tokenizer's ``src_lang`` and the target code is looked up in its
    ``lang_code_to_id`` table.

    :return: The decoded translation, or "" if anything raised (the error is printed)
    """
    try:
        m2m_tokenizer.src_lang = src_lang_code
        model_inputs = m2m_tokenizer(text, return_tensors="pt").to(device)
        # Forcing the first generated token to the target-language id selects the output language.
        target_token_id = m2m_tokenizer.lang_code_to_id[tgt_lang_code]
        generated = m2m_model.generate(**model_inputs, forced_bos_token_id=target_token_id)
        decoded = m2m_tokenizer.batch_decode(generated, skip_special_tokens=True)
        return decoded[0]
    except Exception as e:
        print(f"M2M100 failed: {e}")
        return ""

def translate_mbart50(text, src_lang_code, tgt_lang_code):
    """
    Translate ``text`` with the mBART-50 model loaded in this notebook.

    :param text: Text to translate
    :param src_lang_code: Source language, a key of ``google_to_mbart50``
    :param tgt_lang_code: Target language, a key of ``google_to_mbart50``
    :return: The decoded translation, or "" if anything raised (the error is printed)
    """
    try:
        # Resolve both codes first so an unknown language fails before the tokenizer is touched.
        mbart_src, mbart_tgt = google_to_mbart50[src_lang_code], google_to_mbart50[tgt_lang_code]
        mbart_tokenizer.src_lang = mbart_src
        model_inputs = mbart_tokenizer(text, return_tensors="pt").to(device)
        target_token_id = mbart_tokenizer.lang_code_to_id[mbart_tgt]
        generated = mbart_model.generate(**model_inputs, forced_bos_token_id=target_token_id)
        decoded = mbart_tokenizer.batch_decode(generated, skip_special_tokens=True)
        return decoded[0]
    except Exception as e:
        print(f"MBart50 failed: {e}")
        return ""

# --- COMET Evaluation ---
def evaluate_comet(src, mt):
    """
    Score one translation with the loaded COMET model, without a reference.

    The first item of the ``predict`` result is the list of per-segment scores,
    not a number. Because exactly one segment is submitted, its single score is
    unwrapped and returned as a plain float, so callers can compare it with
    other floats such as ``float('-inf')``.

    :param src: Source sentence
    :param mt: Machine translation of ``src``
    :return: The COMET score of the (src, mt) pair as a ``float``
    """
    data = [{"src": src, "mt": mt}]
    prediction = comet_model.predict(data, batch_size=1, gpus=1 if torch.cuda.is_available() else 0)
    segment_scores = prediction[0]
    # One input segment -> one score; also tolerate a result whose first item is already a scalar.
    if isinstance(segment_scores, (list, tuple)):
        return float(segment_scores[0])
    return float(segment_scores)

# --- Ensemble Translation ---
def ensemble_translate(text, src_google_code, tgt_google_code):
    """
    Translate ``text`` with every backend and return the candidate COMET scores highest.

    Each backend signals failure with an empty result; such candidates get a
    score of ``-inf`` and are never sent to COMET. If every backend fails, the
    (empty) NLLB output is returned.

    :param text: Source text
    :param src_google_code: Source language code (short two-letter form)
    :param tgt_google_code: Target language code (short two-letter form)
    :return: The best-scoring translation string ("" when all backends failed)
    """
    candidates = {
        "nllb": translate_nllb(text, src_google_code, tgt_google_code),
        "google": translate_google(text, src_google_code, tgt_google_code),
        "m2m100": translate_m2m100(text, src_google_code, tgt_google_code),
        "mbart50": translate_mbart50(text, src_google_code, tgt_google_code)
    }
    # A backend may hand back None instead of a string; treat that like the ""
    # failure marker so the .strip() check below cannot crash.
    outputs = {name: ("" if out is None else out) for name, out in candidates.items()}

    print("\n🔹 Translations:")
    for name, out in outputs.items():
        print(f"   {name.title():10}: {out}")

    scores = {}
    for name, mt in outputs.items():
        if mt.strip():
            score = evaluate_comet(text, mt)
            # The scorer may return a one-element list of segment scores. Unwrap it,
            # because comparing a list with float('-inf') in max() raises TypeError.
            if isinstance(score, (list, tuple)):
                score = score[0]
            scores[name] = float(score)
        else:
            scores[name] = float('-inf')

    best_model = max(scores, key=scores.get)
    best_translation = outputs[best_model]

    print(f"\n✅ Selected: {best_model.upper()} | COMET-QE scores: {scores}")
    return best_translation

# --- Main Function for Translation ---
def translate_best(sentence, low_resource_input, high_resource_output):
    """
    Translate ``sentence`` from the low-resource language into the matched
    high-resource language and return the best ensemble candidate.

    :param sentence: Text to translate
    :param low_resource_input: Source language code (case-insensitive)
    :param high_resource_output: Target language code (case-insensitive); may be
        ``None`` when no high-resource match was found upstream
    :return: The selected translation, or "" when either language code is missing
    """
    # The matching step can return None; calling .lower() on it would raise
    # AttributeError, so report the problem and use the "" failure marker instead.
    if not low_resource_input or not high_resource_output:
        print("Cannot translate: source or target language code is missing.")
        return ""

    src_lang = low_resource_input.lower()
    tgt_lang = high_resource_output.lower()
    print(f"\n📝 Input Text: {sentence}")
    best_translation = ensemble_translate(sentence, src_lang, tgt_lang)
    print(f"🎯 Best Translation: {best_translation}")
    return best_translation

# Example usage (can be replaced with user input or script call)
# translate_best("আমি বাংলায় গান গাই", "bn", "en")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\shrin\.cache\huggingface\hub\models--Unbabel--wmt22-cometkiwi-da\snapshots\1ad785194e391eebc6c53e2d0776cada8f83179a\checkpoints\model.ckpt`
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000017552861030>>
Traceback (most recent call last):
  File "t:\nlp\.env\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 