 1. Load & Read All Text Files (English + Urdu)

In [None]:
# Prompt for a file upload through google.colab; `uploaded` keeps whatever
# files.upload() returns so later cells can refer to it.
from google.colab import files
uploaded = files.upload()

Saving NLP Project.zip to NLP Project (1).zip


In [None]:
import zipfile
import os

import io

# Colab renames a re-uploaded file (the upload log shows it was saved as
# "NLP Project (1).zip"), so opening the hard-coded 'NLP Project.zip' would
# silently extract a stale, earlier upload. Extract from the bytes returned by
# files.upload() instead, which always belong to this session's upload.
zip_payloads = [
    payload for name, payload in uploaded.items()
    if name.lower().endswith('.zip')
]

if zip_payloads:
    for payload in zip_payloads:
        with zipfile.ZipFile(io.BytesIO(payload)) as zip_ref:
            zip_ref.extractall()
else:
    # Nothing was uploaded in this session: fall back to an archive on disk.
    with zipfile.ZipFile('NLP Project.zip', 'r') as zip_ref:
        zip_ref.extractall()

In [None]:
# Check files: list the extracted per-language data folders to confirm the
# archive unpacked where the loaders below expect it.
print("Files in English folder:", os.listdir("NLP Project/data/English"))
print("Files in Urdu folder:", os.listdir("NLP Project/data/Urdu"))

Files in English folder: ['ai_intro.txt', 'neural network.txt', 'machine learning.txt']
Files in Urdu folder: ['ai_intro_urdu.txt', 'machine_learning_urdu.txt', 'neural_network_urdu.txt']


2. Load and Clean Text

In [None]:
def load_and_clean_texts(directory, lang='english'):
    """Read every ``.txt`` file in ``directory`` and normalise its whitespace.

    Args:
        directory: Folder whose ``.txt`` files are loaded (not recursive).
        lang: ``'english'`` lower-cases the text; any other value leaves the
            letter case untouched.

    Returns:
        A list of ``(filename, cleaned_text)`` tuples, ordered by filename.
    """
    all_texts = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and everything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(directory, filename)
        # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM
        # that would otherwise stay attached to the first word.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            text = f.read()

        # split() + join collapses every whitespace run to a single space and
        # trims both ends in one step.
        text = ' '.join(text.split())
        if lang == 'english':
            text = text.lower()
        all_texts.append((filename, text))

    return all_texts

In [None]:
# Load both corpora; only the 'english' call asks the loader to lower-case.
english_dir = 'NLP Project/data/English'
urdu_dir = 'NLP Project/data/Urdu'
english_data = load_and_clean_texts(english_dir, lang='english')
urdu_data = load_and_clean_texts(urdu_dir, lang='urdu')

# Preview the first (filename, text) pair of each corpus.
print("Sample English:\n", english_data[0])
print("\nSample Urdu:\n", urdu_data[0])

Sample English:
 ('ai_intro.txt', 'artificial intelligence (ai) refers to the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. it is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.[1] such machines may be called ais. high-profile applications of ai include advanced web search engines (e.g., google search); recommendation systems (used by youtube, amazon, and netflix); virtual assistants (e.g., google assistant, siri, and alexa); autonomous vehicles (e.g., waymo); generative and creative tools (e.g., chatgpt and ai art); and superhuman play and analysis in strategy games (e.g., chess and go). however, many ai applications are not perceived as ai: "a lot of cutting edg

3. Chunking (Overlap + Fixed Length)

In [None]:
def chunk_text(text, chunk_size=100, overlap=20):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the window never advance (infinite loop).
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if end >= len(words):
            break

    return chunks


In [None]:
def process_all_chunks(data_list, lang='english'):
    """Chunk every document and label each chunk with its origin.

    Args:
        data_list: Iterable of ``(filename, text)`` pairs.
        lang: Prefix used when building each chunk's source label.

    Returns:
        A flat list of ``(source, chunk)`` tuples, where ``source`` has the
        form ``"<lang>_<filename>_chunk<i>"`` and ``i`` restarts at 0 for
        every file.
    """
    return [
        (f"{lang}_{fname}_chunk{position}", piece)
        for fname, text in data_list
        for position, piece in enumerate(chunk_text(text))
    ]

# Chunk both corpora; each entry is a (source, chunk) pair.
english_chunks = process_all_chunks(english_data, lang='english')
urdu_chunks = process_all_chunks(urdu_data, lang='urdu')

english_count = len(english_chunks)
urdu_count = len(urdu_chunks)
print(f"English Chunks: {english_count}")
print(f"Urdu Chunks: {urdu_count}")

# Preview the first chunk of each language.
print("\nSample English Chunk:\n", english_chunks[0])
print("\nSample Urdu Chunk:\n", urdu_chunks[0])


English Chunks: 13
Urdu Chunks: 15

Sample English Chunk:
 ('english_ai_intro.txt_chunk0', 'artificial intelligence (ai) refers to the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. it is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.[1] such machines may be called ais. high-profile applications of ai include advanced web search engines (e.g., google search); recommendation systems (used by youtube, amazon, and netflix); virtual assistants (e.g., google assistant, siri, and alexa); autonomous')

Sample Urdu Chunk:
 ('urdu_ai_intro_urdu.txt_chunk0', 'مصنوعی ذہانت سے مراد کمپیوٹری نظاموں کی وہ صلاحیت ہے جو عام طور پر انسانی ذہانت سے وابستہ کاموں کو انجام دینے کی اہل

In [None]:
import json

# Persist both chunk lists as UTF-8 JSON; ensure_ascii=False keeps the Urdu
# text human-readable instead of \u-escaped.
for out_name, chunk_list in (
    ('english_chunks.json', english_chunks),
    ('urdu_chunks.json', urdu_chunks),
):
    with open(out_name, 'w', encoding='utf-8') as out_file:
        json.dump(chunk_list, out_file, ensure_ascii=False, indent=2)


4. Embedding Chunks with Selected Models

In [None]:
# Install sentence-transformers (quiet mode); it provides SentenceTransformer used below.
!pip install -q sentence-transformers

In [None]:
# Install huggingface_hub together with its optional `hf_xet` extra.
!pip install huggingface_hub[hf_xet]



In [None]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np

# Selected models
model_names = [
    "sentence-transformers/distiluse-base-multilingual-cased-v1",
    "intfloat/multilingual-e5-small",
    "BAAI/bge-small-en-v1.5"  # Change to multilingual version if needed
]

# One loaded encoder per model name.
models = {name: SentenceTransformer(name) for name in model_names}

with open('english_chunks.json', 'r', encoding='utf-8') as f:
    english_chunks = json.load(f)

with open('urdu_chunks.json', 'r', encoding='utf-8') as f:
    urdu_chunks = json.load(f)

all_chunks = {
    "English": english_chunks,
    "Urdu": urdu_chunks
}

for lang, chunks in all_chunks.items():
    # Each stored chunk is a [source_id, text] pair. Passing the pairs straight
    # to encode() hands the model the id string alongside the text, so the
    # file name / chunk number leaks into the embedding. Encode the text only;
    # row i of the result still corresponds to chunks[i].
    texts = [text for _source_id, text in chunks]

    for model_name, model in models.items():
        print(f"Embedding {lang} chunks using {model_name}...")
        embeddings = model.encode(texts, show_progress_bar=True)
        # '/' in the model name would be read as a directory separator.
        save_path = f"{lang}_{model_name.replace('/', '_')}_embeddings.npy"
        np.save(save_path, embeddings)
        print(f"Saved to {save_path}")


Embedding English chunks using sentence-transformers/distiluse-base-multilingual-cased-v1...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to English_sentence-transformers_distiluse-base-multilingual-cased-v1_embeddings.npy
Embedding English chunks using intfloat/multilingual-e5-small...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to English_intfloat_multilingual-e5-small_embeddings.npy
Embedding English chunks using BAAI/bge-small-en-v1.5...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to English_BAAI_bge-small-en-v1.5_embeddings.npy
Embedding Urdu chunks using sentence-transformers/distiluse-base-multilingual-cased-v1...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to Urdu_sentence-transformers_distiluse-base-multilingual-cased-v1_embeddings.npy
Embedding Urdu chunks using intfloat/multilingual-e5-small...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to Urdu_intfloat_multilingual-e5-small_embeddings.npy
Embedding Urdu chunks using BAAI/bge-small-en-v1.5...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to Urdu_BAAI_bge-small-en-v1.5_embeddings.npy


5. Pinecone Vector Store (Index Creation, Upload, and Querying)

In [None]:
!pip install -q pinecone-client

In [None]:
# Install the Pinecone SDK that provides `Pinecone` and `ServerlessSpec`.
!pip install pinecone



In [None]:
import getpass
import os

from pinecone import Pinecone, ServerlessSpec

# SECURITY: an API key used to be hard-coded in this cell. Treat that key as
# leaked and rotate it in the Pinecone console. The key is now read from the
# PINECONE_API_KEY environment variable, with an interactive prompt as a
# fallback, so it never lands in the notebook file or its outputs.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

# Index name -> embedding dimension of the model whose vectors it stores.
index_dimensions = {
    "multilingual-nlp-distiluse": 512,
    "multilingual-nlp-e5": 384,
    "multilingual-nlp-bge": 384,
}

# create_index() fails when the index already exists, which made this cell
# break on every re-run; only create the indexes that are missing.
existing_indexes = set(pc.list_indexes().names())
for index_name, dimension in index_dimensions.items():
    if index_name in existing_indexes:
        continue
    pc.create_index(
        index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


{
    "name": "multilingual-nlp-bge",
    "metric": "cosine",
    "host": "multilingual-nlp-bge-fyi1ymv.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
# Reload the chunk lists saved earlier (each entry is a [source, text] pair).
with open("english_chunks.json", encoding='utf-8') as english_file:
    english_chunks = json.load(english_file)

with open("urdu_chunks.json", encoding='utf-8') as urdu_file:
    urdu_chunks = json.load(urdu_file)

def _build_vectors(chunks, embeddings, language):
    """Pair each (id, text) chunk with its embedding row as an upsert record."""
    # zip() would silently drop the surplus side on a mismatch (e.g. a stale
    # .npy file), leaving chunks un-indexed or paired with the wrong vector.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"{language}: {len(chunks)} chunks but {len(embeddings)} embeddings"
        )
    return [
        {
            "id": chunk[0],
            "values": emb.tolist(),
            "metadata": {"language": language, "text": chunk[1]},
        }
        for chunk, emb in zip(chunks, embeddings)
    ]


def upload_embeddings(index_name, eng_file, urdu_file, batch_size=32):
    """Upsert the English and Urdu chunk embeddings into one Pinecone index.

    Reads the module-level ``english_chunks`` / ``urdu_chunks`` lists and the
    ``pc`` client; embedding row ``i`` is assumed to belong to chunk ``i``.

    Args:
        index_name: Name of the Pinecone index to write to.
        eng_file: Path of the ``.npy`` file holding the English embeddings.
        urdu_file: Path of the ``.npy`` file holding the Urdu embeddings.
        batch_size: Number of vectors sent per upsert call.

    Raises:
        ValueError: If an embeddings file does not contain exactly one row
            per chunk of its language.
    """
    index = pc.Index(index_name)

    english_embeddings = np.load(eng_file)
    urdu_embeddings = np.load(urdu_file)

    vectors = _build_vectors(english_chunks, english_embeddings, "english")
    vectors += _build_vectors(urdu_chunks, urdu_embeddings, "urdu")

    # Upload in batches
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

In [None]:
# (index name, English embeddings file, Urdu embeddings file) for each model.
upload_jobs = (
    ("multilingual-nlp-distiluse",
     "English_sentence-transformers_distiluse-base-multilingual-cased-v1_embeddings.npy",
     "Urdu_sentence-transformers_distiluse-base-multilingual-cased-v1_embeddings.npy"),
    ("multilingual-nlp-e5",
     "English_intfloat_multilingual-e5-small_embeddings.npy",
     "Urdu_intfloat_multilingual-e5-small_embeddings.npy"),
    ("multilingual-nlp-bge",
     "English_BAAI_bge-small-en-v1.5_embeddings.npy",
     "Urdu_BAAI_bge-small-en-v1.5_embeddings.npy"),
)

for target_index, english_npy, urdu_npy in upload_jobs:
    upload_embeddings(target_index, english_npy, urdu_npy)


In [None]:
# Pinecone index -> name of the embedding model whose vectors it stores
# (the same pairing the upload cell uses for its .npy files).
INDEX_TO_MODEL = {
    "multilingual-nlp-distiluse": "sentence-transformers/distiluse-base-multilingual-cased-v1",
    "multilingual-nlp-e5": "intfloat/multilingual-e5-small",
    "multilingual-nlp-bge": "BAAI/bge-small-en-v1.5",
}


def query_all_models(query_text):
    """Run ``query_text`` against every index and print the top 3 matches.

    The query is embedded with the model that produced each index's vectors,
    then the language, score and text of every match are printed.

    Args:
        query_text: The question to search for (any language).
    """
    print(f"🔍 Query: {query_text}\n")
    for index_name, model_name in INDEX_TO_MODEL.items():
        # `models` is keyed by model name where it is built, and those names
        # are not valid Pinecone index names, so iterating it directly (as
        # before) cannot work on a fresh kernel. A dict keyed by index name is
        # still accepted for sessions that rebuilt `models` that way.
        encoder = models[index_name] if index_name in models else models[model_name]

        print(f"--- Results from {index_name} ---")
        embedding = encoder.encode(query_text).tolist()
        results = pc.Index(index_name).query(vector=embedding, top_k=3, include_metadata=True)
        for match in results['matches']:
            print(f"[{match['metadata']['language']}] ({match['score']:.4f}) → {match['metadata']['text']}\n")

Query (Urdu)

In [None]:
# `queries` is never defined in this notebook (it only existed as leftover
# kernel state), so the question is spelled out here; the text is the one
# shown in this cell's recorded output.
urdu_query = "مصنوعی ذہانت اور مشین لرننگ کے درمیان کیا فرق ہے؟"
query_all_models(urdu_query)

🔍 Query: مصنوعی ذہانت اور مشین لرننگ کے درمیان کیا فرق ہے؟

--- Results from multilingual-nlp-distiluse ---
[urdu] (0.2720) → جیسی سمجھی جاتی ہیں، مگر ان میں فرق ہے: مصنوعی ذہانت ایک وسیع تصور ہے جس میں نظام کو انسان کی طرح سوچنے، سیکھنے اور عمل کرنے کے قابل بنایا جاتا ہے۔ مشین لرننگ مصنوعی ذہانت کا ایک حصہ ہے جو مشینوں کو معلومات سے سیکھنے اور سمجھ بوجھ حاصل کرنے کی صلاحیت دیتی ہے۔ یعنی، مصنوعی ذہانت ایک مکمل شعبہ ہے، اور مشین لرننگ اس شعبے کی ایک خاص شاخ ہے جو مشینوں کو خودکار سیکھنے کے قابل بناتی ہے۔

[urdu] (0.2458) → جائے تو اسے مصنوعی ذہانت کہنا چھوڑ دیا جاتا ہے۔ مصنوعی ذہانت کی تحقیق مختلف مقاصد اور مخصوص اوزاروں کے استعمال کے گرد گھومتی ہے۔ اس کے روایتی مقاصد میں سیکھنا، سوچنا، علم کی نمائندگی، منصوبہ بندی، قدرتی زبان کی سمجھ، مشاہدہ، اور خودکار مشینوں کی مدد شامل ہیں۔ مکمل ذہانت — یعنی ہر وہ کام جو ایک انسان کر سکتا ہے، اسے کم از کم برابر سطح پر انجام دینا — اس شعبے کا طویل المدتی ہدف ہے۔ ان مقاصد کے حصول کے لیے محققین نے تلاش، ریاضیاتی بہتری، منطقی اصول، مصنوعی اعصابی جال، او

Query (English)

In [None]:
# `queries` is never defined in this notebook (it only existed as leftover
# kernel state), so the question is spelled out here; the text is the one
# shown in this cell's recorded output.
english_query = "What is the difference between AI and machine learning?"
query_all_models(english_query)

🔍 Query: What is the difference between AI and machine learning?

--- Results from multilingual-nlp-distiluse ---
[english] (0.5635) → what is machine learning? machine learning (ml) is a subset of artificial intelligence that enables machines to learn from data without being explicitly programmed. it uses algorithms to analyze large amounts of data, learn from the insights, and gain patterns and make informed decisions. in simple words, the machine "learns" from the data and uses this knowledge to make predictions and decisions. machine learning algorithms enhance their performance over time as they undergo continuous training and exposed to additional data. machine learning models are the output or what the program learns by executing an algorithm on training data. the greater

[english] (0.5237) → they are not the same thing but are closely connected. relationship between ai and ml understanding the relationship between ai and ml is important for developing intelligent systems. the 

**Assignment # 3**

In [None]:
# Install transformers (AutoTokenizer / AutoModelForSeq2SeqLM below) and accelerate.
!pip install transformers accelerate



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the FLAN-T5 tokenizer and seq2seq model from the same checkpoint.
flan_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(flan_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_checkpoint)

In [None]:
def ask_flan_t5(context, query):
    """Answer ``query`` with the loaded FLAN-T5 model, given ``context``.

    Builds a "Context / Question / Answer" prompt, tokenizes it with
    truncation enabled, generates at most 150 new tokens and returns the
    first generated sequence decoded without special tokens.

    Args:
        context: Background text placed before the question.
        query: The question to answer.

    Returns:
        The decoded answer string.
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
    generated = model.generate(max_new_tokens=150, **encoded)
    answer_ids = generated[0]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)

In [None]:
def build_context(matches):
    """Join the ``metadata["text"]`` of every match, one per line.

    Args:
        matches: Iterable of mappings that each carry ``["metadata"]["text"]``.

    Returns:
        The texts joined by newlines, in input order (``""`` when there are
        no matches).
    """
    texts = []
    for match in matches:
        texts.append(match["metadata"]["text"])
    return "\n".join(texts)

In [None]:
def get_top_chunks(query, top_k=3):
    """Return the ``top_k`` best-matching chunks for ``query`` from the e5 index.

    Args:
        query: The question to search for.
        top_k: Number of matches to request from Pinecone.

    Returns:
        The ``'matches'`` list of the Pinecone query response (metadata
        included).
    """
    index_name = "multilingual-nlp-e5"
    # `embedding_model` is never defined in this notebook. The query has to be
    # embedded with the model that produced this index's vectors, so take the
    # e5 encoder from `models` (keyed by model name where it is built; a dict
    # keyed by index name is accepted too).
    if index_name in models:
        encoder = models[index_name]
    else:
        encoder = models["intfloat/multilingual-e5-small"]

    query_emb = encoder.encode(query)
    index = pc.Index(index_name)
    results = index.query(vector=query_emb.tolist(), top_k=top_k, include_metadata=True)
    return results['matches']

In [None]:
# End-to-end retrieval-augmented answer for a single question.
query = "What is machine learning?"

# Retrieve the best-matching chunks, merge their text into one context
# string, then let FLAN-T5 answer from that context.
top_chunks = get_top_chunks(query)
context = build_context(top_chunks)
answer = ask_flan_t5(context, query)

print("✅ Answer from FLAN-T5:\n", answer)

✅ Answer from FLAN-T5:
 a subset of artificial intelligence that enables machines to learn from data without being explicitly programmed
