<a href="https://colab.research.google.com/github/maham-gif/AdvisorAgent-using-LangChain-Flan-T5/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ STEP 1: Install Dependencies
!pip install openpyxl pandas transformers accelerate sentence-transformers matplotlib

# ✅ STEP 2: Import Libraries
from google.colab import files
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import io

# ✅ STEP 3: Upload File
# files.upload() opens the Colab file picker; the result is indexed below by
# filename to obtain the uploaded file's raw bytes.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as an opaque IndexError.
    raise ValueError("❌ No file was uploaded. Re-run this cell and choose an Excel file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))
df = pd.read_excel(io.BytesIO(uploaded[filename]))

print("✅ File uploaded and read successfully!")
print(df.head(3))

# ✅ STEP 4: User Options
# (task name, description) pairs in menu order. The menu text and the
# number -> name lookup are both derived from this single list.
task_menu = [
    ("clean", "Clean dataset (remove duplicates, trim whitespace, drop empty rows)"),
    ("fill_missing", "Fill missing values intelligently"),
    ("analyze", "Describe data, generate insights"),
    ("generate_rows", "Generate new rows based on existing pattern"),
    ("create_chart", "Create a chart"),
    ("make_summary", "Generate natural language summary of the dataset"),
    ("field_meaning", "Understand column names via embeddings"),
]

print("\n🎯 What would you like to do?")
print("\nAvailable Tasks:")
for number, (name, description) in enumerate(task_menu, start=1):
    print(f"{number}. {name} - {description}")
print()

task = input("Enter task: ").strip().lower()

# The menu is numbered, so accept "3" or "3." as well as the task name.
# Anything that is not a known number is passed through unchanged.
number_to_task = {
    str(number): name for number, (name, _) in enumerate(task_menu, start=1)
}
task = number_to_task.get(task.rstrip("."), task)

# ✅ STEP 5: Load Hugging Face LLM (Mixtral-8x7B)
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# torch_dtype="auto" keeps the weights in the dtype stored in the checkpoint
# instead of the float32 default, which matters for a model of this size.
# device_map="auto" lets accelerate place the layers on the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
)
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

# ✅ STEP 6: Optional Embeddings Model (for RAG/field meaning)
# Only used by the "field_meaning" task to embed column names.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# ✅ STEP 7: RAG-Like Prompting
# The prompt embeds the first five rows of the dataset plus the chosen task
# so the model can tailor its suggested code to the actual columns.
prompt = f"""
You are a professional data analyst working in Excel.
Here are the first few rows of the dataset:

{df.head(5).to_string(index=False)}

Task: {task}.
Generate detailed Python code using pandas and matplotlib to perform the task efficiently.
"""

# ✅ STEP 8: LLM Generates Python Code
# do_sample=True is required for `temperature` to take effect; without it
# generation is greedy and the temperature setting is ignored.
# return_full_text=False returns only the newly generated text, so the prompt
# is not echoed back and no string splitting is needed to remove it.
response = llm(
    prompt,
    max_new_tokens=500,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,
)[0]["generated_text"]

# ✅ Display Suggested Code
# The suggestion is only displayed; it is never executed.
print("\n🧠 Suggested Code by LLM:\n")
generated_code = response.strip()
print(generated_code)

# ✅ STEP 9: Perform Task Automatically (safe, predefined only)
# Each branch runs a fixed, hand-written operation on `df`; nothing here
# executes LLM-generated code.
if task == "clean":
    df = df.drop_duplicates()
    df = df.dropna(how='all')
    # Strip only string headers so numeric or other non-string column labels
    # are kept as they are instead of breaking the .str accessor.
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]
    # Per-column Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; non-string cells (including NaN) pass through unchanged.
    df = df.apply(
        lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
    )
    print("✅ Dataset cleaned!")

elif task == "fill_missing":
    # ffill()/bfill() replace fillna(method=...), deprecated since pandas 2.1.
    # Forward fill first, then backward fill to cover leading gaps.
    df = df.ffill().bfill()
    print("✅ Missing values filled using forward/backward fill!")

elif task == "analyze":
    print("\n📊 Summary Statistics:\n")
    print(df.describe(include='all'))

elif task == "generate_rows":
    raw_count = input("How many rows to generate? ").strip()
    try:
        num = int(raw_count)
    except ValueError:
        num = -1  # routed to the validation message below
    if num < 0:
        print("❌ Please enter a non-negative whole number of rows.")
    elif len(df) == 0:
        # df.sample(n=1) raises on an empty frame; report it instead.
        print("❌ Cannot generate rows: the dataset has no rows to copy.")
    else:
        # One randomly chosen existing row is repeated `num` times. Staying in
        # DataFrame form (no dict round-trip) preserves the column dtypes.
        sample = df.sample(n=1)
        new_df = sample.iloc[[0] * num]
        df = pd.concat([df, new_df], ignore_index=True)
        print(f"✅ {num} synthetic rows generated!")

elif task == "create_chart":
    print("Available columns:\n", df.columns.tolist())
    x = input("Enter X-axis column: ")
    y = input("Enter Y-axis column: ")
    # Prefer the exact text typed; fall back to the trimmed text so a stray
    # space around the name does not cause a lookup failure.
    x = x if x in df.columns else x.strip()
    y = y if y in df.columns else y.strip()
    missing = [name for name in (x, y) if name not in df.columns]
    if missing:
        print(f"❌ Column(s) not found: {missing}")
    else:
        df.plot(x=x, y=y, kind='bar', figsize=(12,6))
        plt.title(f'{y} vs {x}')
        plt.show()

elif task == "make_summary":
    sum_prompt = f"""
You are a data analyst. Summarize this dataset:

{df.head(5).to_string(index=False)}
"""
    # return_full_text=False keeps the prompt (and the raw rows in it) out of
    # the printed summary.
    summary = llm(sum_prompt, max_new_tokens=300, return_full_text=False)[0]["generated_text"]
    print("\n📝 Dataset Summary:\n", summary)

elif task == "field_meaning":
    print("\n📚 Field Semantic Understanding (via Embeddings):")
    # Pass the encoder a plain list of strings, even for non-string labels.
    column_names = [str(col) for col in df.columns]
    embeddings = embedder.encode(column_names)
    for col, vector in zip(df.columns, embeddings):
        print(f"{col}: Vector => {vector[:5]}...")  # Just a preview
else:
    print("❌ Invalid task or not implemented yet.")

# ✅ STEP 10: Save Updated File
# Write the current DataFrame to an .xlsx file in the working directory
# (without the index column), then hand that file to the Colab download helper.
output_name = "updated_dataset.xlsx"
df.to_excel(excel_writer=output_name, index=False)
files.download(output_name)
print("\n📥 File saved and downloaded: " + output_name)


