<a href="https://colab.research.google.com/github/maham-gif/AdvisorAgent-using-LangChain-Flan-T5/blob/main/AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q pdfplumber python-pptx python-docx sentence-transformers faiss-cpu transformers pandas openpyxl

import os, io, json, time
import pandas as pd
from pathlib import Path
import pdfplumber, docx
from pptx import Presentation
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from google.colab import files
import requests

# ======= Hugging Face Model Settings =======
# The token is read from the HF_TOKEN environment variable when it is set and
# non-empty, so the secret does not have to be saved inside the notebook.
# The placeholder is kept as the fallback: replace it only for a throw-away
# local run and never commit a real token.
HUGGINGFACE_API_TOKEN = os.environ.get("HF_TOKEN") or "<PASTE_YOUR_HF_TOKEN>"
# Model id interpolated into the inference endpoint URL by hf_generate().
MODEL_ID = "google/flan-t5-large"
# Model name handed to SentenceTransformer for chunk/query embeddings.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# ---------- File text extractors ----------
def read_pdf(p):
    """Return the text of every page of the PDF at `p`, joined by newlines.

    A page whose `extract_text()` result is falsy contributes an empty
    string, so the number of joined segments equals the number of pages.
    """
    page_texts = []
    with pdfplumber.open(p) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
def read_docx(p):
    """Return the non-empty paragraph texts of the .docx at `p`, newline-joined."""
    document = docx.Document(p)
    # Paragraphs with empty text are skipped entirely (no blank line is kept).
    non_empty = (para.text for para in document.paragraphs if para.text)
    return "\n".join(non_empty)
def read_pptx(p):
    """Return the text of every text-bearing shape in the presentation at `p`.

    Shapes are visited slide by slide in document order; only shapes that
    expose a `text` attribute contribute. Segments are joined by newlines.
    """
    deck = Presentation(p)
    texts = []
    for slide in deck.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                texts.append(shape.text)
    return "\n".join(texts)
def read_txt(p):
    """Return the full contents of the text file at `p`.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (`errors="ignore"`), so this also serves as the fallback reader for
    unknown file types. The handle is closed deterministically via `with`
    instead of being left for the garbage collector.
    """
    with open(p, "r", encoding="utf-8", errors="ignore") as fh:
        return fh.read()
def read_csv(p):
    """Load the CSV at `p` with pandas and return it re-serialised as CSV text.

    The round trip goes through a DataFrame, and the output is written
    without the index column.
    """
    frame = pd.read_csv(p)
    return frame.to_csv(index=False)

def extract(path):
    """Return the text content of `path`, choosing a reader by file extension.

    The extension is compared case-insensitively. `.pdf`, `.docx`,
    `.pptx`/`.ppt` and `.csv` go to their dedicated readers; every other
    extension (including none) is read as plain text.
    """
    suffix = Path(path).suffix.lower()
    readers = {
        ".pdf": read_pdf,
        ".docx": read_docx,
        ".pptx": read_pptx,
        ".ppt": read_pptx,
        ".csv": read_csv,
    }
    # Unknown extensions fall back to the plain-text reader.
    reader = readers.get(suffix, read_txt)
    return reader(path)

# ---------- Chunk + RAG ----------
def chunk(text, size=400, overlap=50):
    """Split `text` into overlapping windows of whitespace-separated words.

    Args:
        text: Input string; it is tokenised with `str.split()`.
        size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks. The
            window start advances by `size - overlap` words each step.

    Returns:
        A list of chunk strings (words re-joined with single spaces), in
        document order. Empty or whitespace-only text yields `[]`.

    Raises:
        ValueError: If `size - overlap` is not positive. The window would
            never advance, which previously caused an endless loop that
            kept appending chunks.
    """
    step = size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )
    words = text.split()
    return [" ".join(words[start:start + size])
            for start in range(0, len(words), step)]

class RAG:
    """Small in-memory retrieval store: text chunks plus a FAISS L2 index.

    Invariant: the vector stored at position `i` of `self.index` is the
    embedding of `self.chunks[i]`, and `self.meta[i]` describes that chunk.
    `search` relies on this to map FAISS result ids back to chunk text.
    """

    def __init__(self):
        """Load the embedding model and create an empty index of matching width."""
        self.model = SentenceTransformer(EMB_MODEL)
        self.index = faiss.IndexFlatL2(self.model.get_sentence_embedding_dimension())
        self.chunks, self.meta = [], []

    def add(self, text, src):
        """Chunk `text`, embed the new chunks and append them to the index.

        Args:
            text: Document text to index.
            src: Source identifier stored as `{"src": src}` for every chunk.

        Only the chunks produced by THIS call are encoded and added. Encoding
        `self.chunks` as a whole would re-insert every earlier chunk on each
        call, duplicating vectors and pushing FAISS ids past the end of
        `self.chunks`.
        """
        new_chunks = chunk(text)
        if not new_chunks:
            # Nothing to embed (empty / whitespace-only text).
            return
        # Encode first so a failure here cannot leave chunks/meta out of
        # step with the index.
        vectors = self.model.encode(new_chunks, convert_to_numpy=True)
        self.index.add(vectors)
        self.chunks.extend(new_chunks)
        self.meta.extend({"src": src} for _ in new_chunks)

    def search(self, q, k=5):
        """Return the text of up to `k` chunks nearest to query `q`.

        Args:
            q: Query string; embedded with the same model as the chunks.
            k: Number of neighbours requested from the index.

        Returns:
            The matching chunk texts joined by newlines, in the order the
            index returned them; an empty string when there are no hits.
        """
        query_vec = self.model.encode([q], convert_to_numpy=True)
        _distances, ids = self.index.search(query_vec, k)
        # Negative ids are skipped: FAISS uses -1 to pad the result row when
        # fewer than k vectors are available.
        return "\n".join([self.chunks[i] for i in ids[0] if i >= 0])

def hf_generate(prompt, timeout=(10, 300)):
    """POST `prompt` to the hosted inference endpoint for MODEL_ID.

    Args:
        prompt: Text sent as the "inputs" field of the JSON payload.
        timeout: Forwarded to `requests.post`. Either a single number of
            seconds or a `(connect, read)` tuple. The default bounds the
            call so a stalled connection cannot hang the notebook forever;
            pass `None` to wait indefinitely.

    Returns:
        `data[0]["generated_text"]` when the response JSON is a list,
        otherwise the whole JSON payload converted with `str()`.

    Raises:
        requests.HTTPError: On a non-success HTTP status
            (via `raise_for_status`).
        requests.Timeout: When the server does not answer within `timeout`.
    """
    headers = {"Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 600, "temperature": 0.2},
        "options": {"wait_for_model": True},
    }
    r = requests.post(
        f"https://api-inference.huggingface.co/models/{MODEL_ID}",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    r.raise_for_status()
    data = r.json()
    if isinstance(data, list):
        return data[0]["generated_text"]
    # Non-list payloads are returned as their string form, as before.
    return str(data)

# ---------- Interactive Flow ----------
# Step 1: ask the user to upload files through the Colab widget and write each
# one under /content so the path-based extractors can open it.
print("⬆️  Upload any text-containing files (pdf/docx/pptx/csv/txt)")
uploaded = files.upload()
paths=[]
# `uploaded` is iterated as (name, content) pairs; content is written in
# binary mode.
for n,b in uploaded.items():
    p=f"/content/{n}"
    # Both statements after the colon form the `with` body, so the path is
    # recorded only after the write call has returned.
    with open(p,"wb") as f:f.write(b); paths.append(p)

# Step 2: collect the desired table layout interactively.
# int() raises ValueError if the answer is not an integer; no retry is done.
n_cols = int(input("\nNumber of columns? "))
cols = [input(f"Name of column {i+1}: ") for i in range(n_cols)]
# n_rows is only interpolated into the prompt text below; nothing in this
# script checks that the model actually returns that many rows.
n_rows = int(input("Number of rows to generate? "))

# Step 3: extract text from every uploaded file and index it.
print("\n🔎 Building vector index...")
idx = RAG()
for p in paths:
    txt=extract(p)
    # Files that yield only whitespace are not indexed.
    if txt.strip(): idx.add(txt,p)

# Step 4: retrieve context per column, using the column name itself as the
# search query. Keys are column names, so a repeated name keeps one entry.
contexts = {c: idx.search(c) for c in cols}
prompt = (
    f"You are a data-extraction agent. Using the contexts below, create EXACTLY {n_rows} rows of CSV "
    f"with header: {','.join(cols)}.\nIf information is missing, leave cell blank.\n\n"
)
# One FIELD/CONTEXT section per column, separated by '---' lines.
for c,k in contexts.items(): prompt += f"FIELD: {c}\nCONTEXT:\n{k}\n---\n"
prompt += "\nReturn only valid CSV."

# Step 5: ask the hosted model to produce the table.
print("\n🤖 Generating table with HuggingFace model...")
csv_text = hf_generate(prompt)

# Step 6: parse the model output as CSV and save it as both CSV and XLSX.
# NOTE(review): csv_text is parsed without any validation; pd.read_csv will
# raise if the model returned empty or non-CSV text. Confirm whether a
# friendlier failure path is wanted.
df = pd.read_csv(io.StringIO(csv_text))
out_csv = "/content/output_table.csv"
out_xlsx = "/content/output_table.xlsx"
df.to_csv(out_csv,index=False); df.to_excel(out_xlsx,index=False)

# Step 7: report the output locations and trigger browser downloads.
print("\n✅ Done!")
print("CSV saved to",out_csv)
print("XLSX saved to",out_xlsx)
files.download(out_csv)
files.download(out_xlsx)


⬆️  Upload any text-containing files (pdf/docx/pptx/csv/txt)
