In [None]:
import pandas as pd
import csv
import re
from google.colab import drive
import logging
import os

In [None]:
!pip install peft
!pip install -U transformers accelerate bitsandbytes

In [None]:
# Mount Google Drive into the Colab runtime, then make the project folder the working
# directory: later cells read and write CSVs and checkpoints through relative paths.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Projects/backend/models



In [None]:

# Load the two ghazal CSV exports (paths are relative to the current working directory).
df = pd.read_csv('sufinama_ghazals.csv')
df1 = pd.read_csv('rekhta.csv')

In [None]:
# Stack the rekhta rows (`df1`) on top of the sufinama rows (`df`) with a fresh 0..n-1 index.
# NOTE(review): this rebinds `df`, so re-running this cell without re-running the load
# cell appends `df1` a second time.
df = pd.concat([df1, df], ignore_index=True, sort=False)

In [None]:
# Keep only rows where both the `ghazal_en` and `ghazal_ur` columns are present.
# NOTE(review): the flagging cell further down works on `df`, not on `df_clean`.
df_clean = df.dropna(subset=["ghazal_en", "ghazal_ur"])

In [None]:
# Display the NaN-filtered frame for a visual check.
df_clean

In [None]:
# --- Step 1: Detect Inconsistencies ---
def has_hindi(text):
    """Return True if ``text`` contains at least one Devanagari character.

    The value is converted with ``str()`` first, so non-string cells (NaN, None,
    numbers) are accepted and yield False unless their string form contains a
    code point in U+0900..U+097F.
    """
    for ch in str(text):
        if '\u0900' <= ch <= '\u097F':
            return True
    return False

def invalid_roman(text):
    """Return True when a Roman Urdu value is missing or contains Arabic-script characters.

    Args:
        text: Cell value from the Roman Urdu column; may be a string, NaN or None.

    Returns:
        True if ``text`` is NA (per ``pd.isna``) or if any character of ``str(text)``
        falls in one of the Arabic-script Unicode blocks listed below; False otherwise.
    """
    if pd.isna(text):
        return True
    # Urdu text is not limited to the base Arabic block: some sources emit letters
    # from the supplement/extended blocks or as presentation-form code points.
    arabic_ranges = (
        ('\u0600', '\u06FF'),  # Arabic
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u08A0', '\u08FF'),  # Arabic Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        ('\uFE70', '\uFEFC'),  # Arabic Presentation Forms-B (stops before U+FEFF, the BOM)
    )
    return any(lo <= ch <= hi for ch in str(text) for lo, hi in arabic_ranges)

# Add one boolean flag column per check: wrong script in either column, or a missing value.
df["has_hindi"] = df["ghazal_ur"].apply(has_hindi)
df["invalid_roman"] = df["ghazal_en"].apply(invalid_roman)
df["missing_ghazal_ur"] = df["ghazal_ur"].isna()  # original NaNs in the Urdu column
df["missing_ghazal_en"] = df["ghazal_en"].isna()  # original NaNs in the Roman column

# A row is inconsistent as soon as any single flag is set.
flag_columns = ["has_hindi", "invalid_roman", "missing_ghazal_ur", "missing_ghazal_en"]
is_inconsistent = df[flag_columns].any(axis=1)

# Rows with at least one flag raised
inconsistent_data = df[is_inconsistent]

# Rows with no flag raised
clean_data = df[~is_inconsistent]


In [None]:
# Display the rows that passed every consistency check.
clean_data

In [None]:

# Save results: flagged rows and clean rows go to separate CSV files, without the row index.
inconsistent_data.to_csv("inconsistent_data.csv", index=False)
clean_data.to_csv("clean_ghazals.csv", index=False)

In [None]:
# df.loc[2294,"ghazal_url"]

In [None]:
# Inspect every row whose `name` column equals the slug 'baba-shah-hussaini'.
df.loc[df["name"] == 'baba-shah-hussaini']

In [None]:
# Reload the cleaned rows from disk; note that this rebinds `df`.
df = pd.read_csv("clean_ghazals.csv")
# Accumulator for the instruction/input/response dicts appended by the loop cell below.
training_data = []

In [None]:
# Replace any NaN left in the two text columns with the literal "[MISSING]" placeholder
# so both columns contain only non-null values.
df["ghazal_ur"] = df["ghazal_ur"].fillna("[MISSING]")
df["ghazal_en"] = df["ghazal_en"].fillna("[MISSING]")

In [None]:
# Show the rows whose Urdu text is the "[MISSING]" placeholder.
df.loc[df['ghazal_ur'] == "[MISSING]"]

In [None]:
def build_training_examples(frame, placeholder="[MISSING]"):
    """Turn ghazal rows into instruction-tuning examples.

    Args:
        frame: DataFrame with the columns ``name``, ``ghazal_ur`` and ``ghazal_en``.
        placeholder: Marker string used for missing text; rows whose Urdu or Roman
            text equals it are skipped because they carry nothing to learn from.

    Returns:
        A new list of dicts with the keys ``instruction``, ``input`` and ``response``;
        three examples per usable row, in row order.
    """
    examples = []
    for name, urdu, roman in zip(frame["name"], frame["ghazal_ur"], frame["ghazal_en"]):
        if placeholder in (urdu, roman):
            continue

        # Poet names are stored as slugs, e.g. "baba-shah-hussaini" -> "Baba Shah Hussaini".
        poet = name.replace("-", " ").title()

        # Task 1: Urdu → Roman Conversion
        examples.append({
            "instruction": f"Convert this Urdu ghazal by {poet} to Roman Urdu",
            "input": urdu,
            "response": roman
        })

        # Task 2: Roman → Urdu Conversion
        examples.append({
            "instruction": f"Convert this Roman Urdu ghazal by {poet} to traditional Urdu script",
            "input": roman,
            "response": urdu
        })

        # Task 3: Poet Attribution
        examples.append({
            "instruction": "Who wrote this ghazal?",
            "input": urdu,
            "response": poet
        })
    return examples


# Rebuild the list from scratch so re-running this cell never duplicates examples.
training_data = build_training_examples(df)

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad a list of tokenized examples to a common length and stack them.

    Args:
        batch: List of dicts, each holding 1-D tensors under the keys
            ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        Dict with the same three keys, each a ``(batch, longest_sequence)`` tensor.
    """
    input_ids = pad_sequence([item["input_ids"] for item in batch], batch_first=True)
    attention_mask = pad_sequence([item["attention_mask"] for item in batch], batch_first=True)
    # Pad labels with -100, the index the loss ignores; the default padding value 0
    # is a real token id and would be trained on.
    labels = pad_sequence(
        [item["labels"] for item in batch], batch_first=True, padding_value=-100
    )
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [None]:
# !pip uninstall -y tensorflow && pip install tensorflow-cpu

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator
import torch
from torch.utils.data import Dataset

In [None]:


# Quantization config: ask bitsandbytes to load the weights in 8-bit.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

# Load the base model and its tokenizer from the Hugging Face Hub; device placement
# is left to `device_map="auto"`.
model_name = "qwen/Qwen2-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

# Apply LoRA: rank-8 adapters (alpha 16, dropout 0.05) on the modules named
# `q_proj` and `v_proj`. `model` is rebound to the PEFT-wrapped model.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
)
model = get_peft_model(model, lora_config)
# Gradient checkpointing is currently disabled:
# model.gradient_checkpointing_enable()

class LazyDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes instruction examples on access instead of up front.

    Args:
        data: Sequence of dicts with the string keys ``instruction``, ``input``
            and ``response``.
        tokenizer: Callable tokenizer; it is called with ``text``/``text_pair`` and
            must return ``input_ids`` and ``attention_mask`` tensors of shape (1, L).
        max_length: Length every example is truncated/padded to (default 256).
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return 1-D ``input_ids``, ``attention_mask``, ``labels``."""
        example = self.data[idx]
        prompt = example["instruction"] + " " + example["input"]
        # Use the tokenizer this dataset was built with, not whatever the notebook-level
        # name `tokenizer` points at when the item is fetched.
        tokenized = self.tokenizer(
            text=prompt,
            text_pair=example["response"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)

        # Ignore padded positions in the loss. Masking by attention mask rather than by
        # pad id keeps real tokens that share the pad token's id (e.g. pad == EOS).
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


# Prepare data: wrap the example list in the lazily-tokenizing dataset and batch it
# two examples at a time, padding through `collate_fn`.
dataset = LazyDataset(training_data, tokenizer)
train_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    collate_fn=collate_fn  # pads and stacks the per-example tensors
)
# Accelerate setup: optimize only the parameters that require gradients, then let
# Accelerate wrap model, optimizer and dataloader.
# NOTE(review): the Trainer cell below is given `dataset` and builds its own loop, so
# `optimizer` and `train_dataloader` appear unused there — confirm this is intended.
accelerator = Accelerator()
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=5e-5
)
model, optimizer, train_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader
)

In [None]:
# Sanity check: fetch only the first batch and print the shape of its labels tensor.
for batch in train_dataloader:
    print("Labels shape:", batch["labels"].shape)  # Should match input_ids shape
    break

In [None]:
# `Trainer` and `TrainingArguments` are not imported anywhere else in the notebook.
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

training_args = TrainingArguments(
    output_dir="./qwen-urdu-test",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Resume from the newest checkpoint in `output_dir` when one exists; otherwise start
# fresh. `resume_from_checkpoint=True` raises on a first run with no checkpoint.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load the tokenizer with an explicit pad token and left padding for generation.
tokenizer = AutoTokenizer.from_pretrained(
    "qwen/Qwen2-0.5B",
    pad_token="<|endoftext|>",  # Qwen's EOS token
    padding_side="left"  # Crucial for generation
)

# Load the base model (this time without the 8-bit quantization config).
base_model = AutoModelForCausalLM.from_pretrained(
    "qwen/Qwen2-0.5B",
    device_map="auto",
    pad_token_id=tokenizer.pad_token_id  # Match tokenizer
)

# Attach the LoRA adapter saved at a specific training checkpoint; this rebinds `model`.
# The path is relative to the current working directory.
model = PeftModel.from_pretrained(
    base_model,
    "./qwen-urdu-test/checkpoint-76533"  # Your specific checkpoint
)

# Keep the model config and the generation config in agreement with the tokenizer's pad id.
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

def generate_safe(prompt, max_length=200):
    """Generate a continuation for ``prompt`` with the notebook-level model and tokenizer.

    Args:
        prompt: Input text; it is truncated to 512 tokens.
        max_length: Maximum number of NEW tokens to generate (forwarded as
            ``max_new_tokens``); the name is kept for existing callers.

    Returns:
        The decoded continuation only (prompt tokens stripped, special tokens removed).
    """
    # Tokenize with an attention mask and move the tensors to the model's device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(model.device)

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_length,
        do_sample=True,  # required for `temperature` to take effect
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the tokens generated after the prompt
    prompt_length = inputs.input_ids.shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
def test_urdu_features():
    """Print the model's output for three smoke-test prompts, one labelled line each."""
    cases = (
        # Test 1: Right-to-Left Consistency
        ("RTL Test:", "اردو میں لکھیں: میرا نام علی ہے"),
        # Test 2: Poet Attribution
        ("\nPoet Test:", "Who wrote this ghazal? دل سے جو بات نکلتی ہے اثر رکھتی ہے"),
        # Test 3: Mixed Script Handling
        ("\nMixed Script Test:", "Convert: میں use کرتا ہوں Roman Urdu کو"),
    )
    for label, prompt in cases:
        print(label, generate_safe(prompt))

test_urdu_features()

In [None]:
# Install required libraries
# !pip install transformers torch

# Import libraries
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the DeepSeek LLM 7B chat model and its tokenizer.
# NOTE: this rebinds the notebook-level `model_name`, `tokenizer` and `model`.
model_name = "deepseek-ai/deepseek-llm-7b-chat"  # Replace with the correct model path if not on Hugging Face
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define an Urdu input prompt
urdu_prompt = "محبت کے بارے میں ایک شعر کہو"  # "Write a poem about love"

# Tokenize the input
inputs = tokenizer(urdu_prompt, return_tensors="pt")

# Generate text
outputs = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,  # pass the mask explicitly
    max_length=100,  # Total length cap (prompt + generated tokens)
    num_return_sequences=1,  # Number of responses to generate
    no_repeat_ngram_size=2,  # Avoid repeating phrases
    do_sample=True,  # Without this, top_k / top_p / temperature are ignored
    top_k=50,  # Sampling parameter
    top_p=0.95,  # Nucleus sampling parameter
    temperature=0.7,  # Controls randomness
)

# Decode and print the generated Urdu text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Urdu Text:")
print(generated_text)

In [None]:
from llama_cpp import Llama

# Load the Q8_0 GGUF file of Babel-9B-Chat from the given Hugging Face repo via llama.cpp.
llm = Llama.from_pretrained(
	repo_id="bartowski/Tower-Babel_Babel-9B-Chat-GGUF",
	filename="Tower-Babel_Babel-9B-Chat-Q8_0.gguf",
)


In [None]:
# Define a single-turn chat prompt in Urdu (uses `llm` from the previous cell)
messages = [
    {
        "role": "user",
        "content": "محبت کے بارے میں ایک شعر کہو"  # "Write a poem about love"
    }
]

# Generate a response
response = llm.create_chat_completion(
    messages=messages,
    max_tokens=100,  # Maximum number of tokens to generate
    temperature=0.7,  # Controls randomness
    top_p=0.95,  # Nucleus sampling parameter
)

# Print the text of the first returned choice
print("Generated Urdu Text:")
print(response["choices"][0]["message"]["content"])

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"  # the device to load the model onto

# Load Babel-83B-Chat in bfloat16 onto `device`; this rebinds `model` and `tokenizer`.
# `torch` must already be imported by an earlier cell.
model = AutoModelForCausalLM.from_pretrained(
    "Tower-Babel/Babel-83B-Chat",
    torch_dtype=torch.bfloat16,
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained("Tower-Babel/Babel-83B-Chat")

# Prepare the chat messages for the model
prompt = "Hiii How are you?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

# Render the messages with the model's chat template, then tokenize the rendered text
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
print(f"Formatted text:\n {text}")
print(f"Model input:\n {model_inputs}")

# Pass the attention mask along with the ids, matching how `generate_safe` calls generate
generated_ids = model.generate(
    model_inputs.input_ids,
    attention_mask=model_inputs.attention_mask,
    max_new_tokens=512,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
)
# Strip the prompt tokens from each returned sequence so only the reply is decoded
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(f"Response:\n {response[0]}")


In [None]:
## Memory Error (presumably raised by the Babel-83B-Chat load in the previous cell — TODO confirm)