**Q.1) Demonstrate the effectiveness of RAG to cite sources and to prevent hallucination in an LLM.**

In [1]:
# Install FAISS for the vector store, Sentence-Transformers for embeddings, and Transformers for the LLM
%pip install faiss-cpu sentence-transformers transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [2]:
from google.colab import files

# This will pop up a file picker. Select your local random_data.txt
uploaded = files.upload()

# Verify it’s here
!ls -lh random_data.txt


Saving random_data.txt to random_data.txt
-rw-r--r-- 1 root root 1.3K May 21 18:28 random_data.txt


In [3]:
from pathlib import Path
from typing import List, Tuple

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ─── CONFIG ────────────────────────────────────────────────────────
DATA_PATH   = "random_data.txt"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL   = "google/flan-t5-small"
TOP_K       = 3

# ─── 1) LOAD & CHUNK DATA ────────────────────────────────────────────
def load_chunks(path: str) -> List[Tuple[str,str]]:
    raw = Path(path).read_text(encoding="utf8").strip().split("\n\n")
    return [(f"para{i}", p.replace("\n", " ").strip())
            for i, p in enumerate(raw) if p.strip()]

chunks               = load_chunks(DATA_PATH)
chunk_ids, chunk_texts = zip(*chunks)

# ─── 2) EMBED & INDEX WITH FAISS ─────────────────────────────────────
embedder   = SentenceTransformer(EMBED_MODEL)
chunk_embs = embedder.encode(chunk_texts, convert_to_numpy=True)
faiss.normalize_L2(chunk_embs)

dim   = chunk_embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(chunk_embs)
print(f"Indexed {index.ntotal} paragraphs in FAISS")

# ─── 3) RETRIEVAL FUNCTION ──────────────────────────────────────────
def retrieve(query: str, k: int = TOP_K) -> List[Tuple[str,str]]:
    q_emb = embedder.encode(query, convert_to_numpy=True)
    faiss.normalize_L2(q_emb.reshape(1, -1))
    scores, inds = index.search(q_emb.reshape(1, -1), k)
    return [(chunk_ids[i], chunk_texts[i]) for i in inds[0]]

# ─── 4) LOAD LOCAL GENERATOR & DEFINE GENERATION FUNCTIONS ─────────
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
model     = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)

def generate_vanilla(query: str) -> str:
    prompt = (
        "You are a helpful assistant. Answer the following question in one complete sentence.\n\n"
        f"Question: {query}\nAnswer:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=50,
        min_length=10,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

def generate_rag(query: str, retrieved: List[Tuple[str,str]]) -> str:
    ctx = "\n\n".join(f"[{i+1}] ({cid}): {txt}"
                      for i, (cid, txt) in enumerate(retrieved))
    prompt = (
        "You are an expert assistant. Use ONLY the following sources to answer in one complete sentence. "
        "Be sure to name the river and state its length, and cite each fact with its source number in brackets.\n\n"
        f"{ctx}\n\n"
        f"Question: {query}\nAnswer:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=100,
        min_length=20,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

# # ─── 5) DEMO: NON-RAG VS. RAG ────────────────────────────────────────
# query = "What is the longest river mentioned, and how long is it?"
# docs  = retrieve(query)

# print("=== Non-RAG (Vanilla) Response ===")
# print(generate_vanilla(query))

# print("\n=== Retrieved Passages ===")
# for i, (cid, txt) in enumerate(docs, 1):
#     print(f"{i}. [{cid}]: {txt}\n")

# print("=== RAG + Citation Response ===")
# print(generate_rag(query, docs))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexed 5 paragraphs in FAISS


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [4]:
# ─── 5) DEMO: NON-RAG VS. RAG FOR MULTIPLE QUESTIONS ─────────────────────
queries = [
    "What is the longest river mentioned, and how long is it?",
    "Which civilization thrived along the Nile’s banks?",
    "Explain quantum entanglement in one sentence.",
    "What paradigms does Python support?",
    "How many countries does the Nile River flow through?"
]

for query in queries:
    print(f"\n=== Query: {query} ===\n")

    # Non-RAG (Vanilla) response
    vanilla = generate_vanilla(query)
    print(">>> Non-RAG Response:")
    print(vanilla, "\n")

    # Retrieval
    docs = retrieve(query)
    print(">>> Retrieved Passages:")
    for i, (cid, txt) in enumerate(docs, 1):
        print(f"{i}. [{cid}]: {txt}")
    print()

    # RAG response with citations
    rag = generate_rag(query, docs)
    print(">>> RAG + Citation Response:")
    print(rag)
    print("-" * 60)


=== Query: What is the longest river mentioned, and how long is it? ===

>>> Non-RAG Response:
st. john's river 

>>> Retrieved Passages:
1. [para1]: The Nile River rises in the highlands of East Africa and flows northward for about 6,650 kilometers through nine countries. It is widely regarded as the longest river in the world. Ancient Egyptian civilization thrived along its banks for millennia.
2. [para3]: Quantum entanglement is a phenomenon where two particles remain connected such that the state of one immediately influences the state of the other, even when separated by large distances. This “spooky action at a distance” was famously critiqued by Einstein but has been repeatedly confirmed in experiments.
3. [para4]: Python is a high-level, interpreted programming language known for its readability and compact syntax. It supports multiple paradigms—procedural, object-oriented, and functional—and boasts a huge standard library plus a vibrant ecosystem of third-party packages.

>>>

**Q.2) Implement a neural network classifier for the loan data with Decision as the output attribute. Prepare the data as needed. Come up with your best performing model by changing the size and number of hidden layers and activation functions.**



In [5]:
from google.colab import files

# Trigger a file‐picker and upload your loan dataset (e.g. loan.csv)
uploaded = files.upload()

# Grab the filename of the first (and only) file you uploaded
filename = list(uploaded.keys())[0]
print(f"Loaded file: {filename}")


Saving loan.xlsx to loan.xlsx
Loaded file: loan.xlsx


In [6]:
# For reading .xlsx files
!pip install openpyxl




In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks


In [40]:
import pandas as pd

# Read the uploaded Excel file directly
df = pd.read_excel(filename, engine='openpyxl')

# Quick peek
print(df.shape)
print(df.dtypes)
display(df.head())

# Encode target as 0/1
df['Decision'] = df['Decision'].map({'accept':1, 'reject':0})
X = df.drop('Decision', axis=1)
y = df['Decision']


(429, 14)
Sex                 object
Age                float64
Time_at_address    float64
Res_status          object
Telephone           object
Occupation          object
Job_status          object
Time_employed        int64
Time_bank            int64
Liab_ref            object
Acc_ref             object
Home_Expn            int64
Balance              int64
Decision            object
dtype: object


Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Occupation,Job_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision
0,M,50.75,0.585,owner,given,unemploye,unemploye,0,0,f,given,145,0,reject
1,M,19.67,10.0,rent,not_given,labourer,governmen,0,0,t,given,140,0,reject
2,F,52.830002,15.0,owner,given,creative_,private_s,5,14,f,given,0,2200,accept
3,M,22.67,2.54,rent,not_given,creative_,governmen,2,0,f,given,0,0,accept
4,M,29.25,13.0,owner,given,driver,governmen,0,0,f,given,228,0,reject


In [9]:
df.head()

Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Occupation,Job_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision
0,M,50.75,0.585,owner,given,unemploye,unemploye,0,0,f,given,145,0,0
1,M,19.67,10.0,rent,not_given,labourer,governmen,0,0,t,given,140,0,0
2,F,52.830002,15.0,owner,given,creative_,private_s,5,14,f,given,0,2200,1
3,M,22.67,2.54,rent,not_given,creative_,governmen,2,0,f,given,0,0,1
4,M,29.25,13.0,owner,given,driver,governmen,0,0,f,given,228,0,0


In [10]:
print("Missing values before imputation:")
print(df.isnull().sum())

Missing values before imputation:
Sex                0
Age                0
Time_at_address    0
Res_status         0
Telephone          0
Occupation         0
Job_status         0
Time_employed      0
Time_bank          0
Liab_ref           0
Acc_ref            0
Home_Expn          0
Balance            0
Decision           0
dtype: int64


In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# numeric vs categorical columns from before
num_cols = ['Age','Time_at_address','Time_employed','Time_bank','Home_Expn','Balance']
cat_cols = [c for c in X.columns if c not in num_cols]

preprocessor = ColumnTransformer([
    # Converts each categorical column into a set of 0/1 “dummy” columns
    ('ohe',   OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
    # Shifts each numeric column so its mean is 0 and scales it so its standard deviation is 1.
    ('scale', StandardScaler(), num_cols),
])

# fit + transform
X_proc = preprocessor.fit_transform(X)
input_dim = X_proc.shape[1]
print(f"Processed feature matrix shape: {X_proc.shape}")


Processed feature matrix shape: (429, 35)


In [43]:
X_train, X_val, y_train, y_val = train_test_split(
    X_proc, y,
    test_size=0.2,
    stratify=y,  #Ensures that the class balance (proportion of accepts vs. rejects) is the same in train and val. Without this, you might accidentally end up with very different class ratios
    random_state=42  #Seeds the internal random number generator so that every time you run this cell you get the exact same split—important for reproducibility.
)

print("Train:", X_train.shape, "Val:", X_val.shape)



Train: (343, 35) Val: (86, 35)


In [44]:
def build_model(layer_sizes, activation):
    """Creates, compiles, and returns a Sequential binary‐classifier."""
    model = models.Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    for size in layer_sizes:
        model.add(layers.Dense(size, activation=activation))
    model.add(layers.Dense(1, activation='sigmoid'))  # output
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Optimizer: adam—a popular algorithm that adjusts weights to minimize the loss.

# Loss: binary_crossentropy—the appropriate loss function when your target is 0 or 1.

# Metrics: track accuracy during training/validation so you can see how often you get the right class.

In [45]:
configs = [
    {'name':'3×ReLU', 'layers':[64,64,64],             'activation':'relu'},
    {'name':'5×Tanh', 'layers':[64,64,64,64,64],       'activation':'tanh'},
    {'name':'3×Tanh', 'layers':[64,64,64],             'activation':'tanh'},
    {'name':'5×ReLU', 'layers':[64,64,64,64,64],       'activation':'relu'},
    # {'name':'3×Tanh', 'layers':[32,32,32],             'activation':'tanh'},
    # {'name':'5×ReLU', 'layers':[128,128,128,128,128],       'activation':'elu'},
    # {'name':'5×ReLU', 'layers':[64,64,64,64,64],       'activation':'elu'}

]

results = []

for cfg in configs:
    print(f"\n▶ Training {cfg['name']}")
    model = build_model(cfg['layers'], cfg['activation'])
    hist = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=30,
        batch_size=32,
        callbacks=[callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=0
    )
    # final metrics
    t_loss, t_acc = model.evaluate(X_train, y_train, verbose=0)
    v_loss, v_acc = model.evaluate(X_val,   y_val,   verbose=0)

    results.append({
        'Model':     cfg['name'],
        'Train Loss': round(t_loss,4),
        'Train Acc':  round(t_acc,4),
        'Val Loss':   round(v_loss,4),
        'Val Acc':    round(v_acc,4),
    })



▶ Training 3×ReLU

▶ Training 5×Tanh

▶ Training 3×Tanh

▶ Training 5×ReLU


In [46]:
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Train Loss,Train Acc,Val Loss,Val Acc
0,3×ReLU,0.4543,0.793,0.525,0.7442
1,5×Tanh,0.4294,0.8163,0.5436,0.6977
2,3×Tanh,0.4812,0.7901,0.5289,0.7209
3,5×ReLU,0.415,0.8455,0.5445,0.7093
