In [None]:
import pandas as pd
import dotenv
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

In [None]:
# --- CONFIGURATION ---
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
# API key passed to the Gemini embedding client; None when the variable is unset.
key = os.getenv("GEMINI_API_KEY")

# File locations.
# Built with os.path.join so the notebook also runs on macOS/Linux, where a
# backslash is an ordinary filename character rather than a path separator.
INPUT_FILE = os.path.join("data", "train_clean.csv")
MODEL_OUTPUT_PATH = "faiss_gemini_model_multi"

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
df.head(n=5)

In [None]:
# Keep only the first row for each distinct (Headline, Category) pair.
dedupe_keys = ['Headline', 'Category']
df = df.drop_duplicates(subset=dedupe_keys, keep='first')

In [None]:

# Column positions are 0-based: index 1 (the SECOND column) holds the text to
# embed and every column from index 2 onwards becomes metadata. Column 0 is
# used for neither.
# NOTE(review): presumably column 0 is an id/index column — confirm against the CSV.
text_col_name = df.columns[1]
metadata_col_names = df.columns[2:] # Takes all columns from index 2 onwards

In [None]:

# Echo the positional column split so it can be sanity-checked before any
# embedding work is done.
print(f"Embedding Column: '{text_col_name}'")
print(f"Metadata Columns: {list(metadata_col_names)}")

In [None]:
def _row_to_document(record):
    """Convert one DataFrame row into a Document.

    The value in the text column becomes ``page_content``; each metadata
    column is copied into the ``metadata`` dict. Every value is passed through
    ``str`` (the original author's stated reason: FAISS serialization
    compatibility).
    """
    return Document(
        page_content=str(record[text_col_name]),
        metadata={field: str(record[field]) for field in metadata_col_names},
    )


# One Document per row of df, in row order.
documents = [_row_to_document(record) for _, record in df.iterrows()]

print(f"Prepared {len(documents)} documents.")

In [None]:
# --- TEST: Minimal Gemini Embedding Call ---
# Smoke test: embed a single short string and show the size and first values
# of the returned vector. Any failure is printed and the notebook carries on.
try:
    test_embed = GoogleGenerativeAIEmbeddings(
        google_api_key=key,
        model="models/text-embedding-004",
    )
    result = test_embed.embed_query("hello world")
    vector_length, vector_head = len(result), result[:5]
    print("Test embedding result (length):", vector_length)
    print("Test embedding (first 5 values):", vector_head)
except Exception as e:
    print("Embedding API test failed:", e)

In [None]:
print("--- Step 2: Initializing Gemini Embeddings ---")
# Client settings gathered in one mapping, then expanded into the constructor.
embedding_settings = {
    "model": "models/text-embedding-004",
    "google_api_key": key,
}
embeddings = GoogleGenerativeAIEmbeddings(**embedding_settings)

print("--- Step 3: Creating Vector Embeddings ---")
# Embed every prepared document and index the vectors in a FAISS store.
vector_store = FAISS.from_documents(
    documents,
    embeddings,
)

In [None]:
# Save the model as a local FAISS Vector Store
# The store is written under MODEL_OUTPUT_PATH, the same location that
# FAISS.load_local reads from later in this notebook.
print("--- Step 4: Saving Model ---")
vector_store.save_local(MODEL_OUTPUT_PATH)
print(f"Saved to '{MODEL_OUTPUT_PATH}'")

In [None]:
# --- Step 5: Load FAISS Vector Store and Query ---

from langchain_community.vectorstores import FAISS

In [None]:

# Load the saved FAISS vector store
print("--- Step 5: Loading Model ---")
# The same `embeddings` client that built the index is passed in again.
# SECURITY NOTE(review): allow_dangerous_deserialization=True opts in to a
# loading path that is unsafe for untrusted files (presumably pickle-based —
# confirm in the LangChain docs). Only load from MODEL_OUTPUT_PATH when its
# contents were written by this notebook or another trusted source.
vector_store = FAISS.load_local(MODEL_OUTPUT_PATH, embeddings, allow_dangerous_deserialization=True)

In [None]:

# Load the queries file from test_clean.csv
# The path is assembled with os.path.join so it resolves on any OS, not only
# on Windows (a literal backslash is not a separator on macOS/Linux).
QUERIES_FILE = os.path.join("data", "test_clean.csv")
queries_df = pd.read_csv(QUERIES_FILE)  # Using the test input file for demonstration

# Build the list of query strings from the 'Headline' column:
#  - dropna(): a blank headline is read as NaN (a float), which is not text
#    that can be sent to the similarity search.
#  - drop_duplicates(): a repeated headline would only repeat an identical
#    search; one search per distinct headline is enough because results are
#    keyed by the headline text.
# queries_df itself is left untouched so every input row is still available.
queries = queries_df['Headline'].dropna().drop_duplicates().tolist()

In [None]:
# Search for the results for the listed queries in the defined vector store
print("--- Step 6: Querying Vector Store ---")
results = []

for query in queries:
    # Five most similar stored documents for this query.
    docs = vector_store.similarity_search(query, k=5)
    for doc in docs:
        record = {'query': query, 'matched_text': doc.page_content}
        # Copy the metadata fields next to the two reserved fields. setdefault
        # keeps the reserved values if a metadata column happens to be named
        # 'query' or 'matched_text'; previously such a column silently
        # overwrote them (the colliding metadata value is the one dropped).
        for field, value in doc.metadata.items():
            record.setdefault(field, value)
        results.append(record)

# Convert results to DataFrame
results_df = pd.DataFrame(results)
if results_df.empty:
    # No queries or no hits: pd.DataFrame([]) has no columns at all, so create
    # the two reserved columns explicitly to keep the frame's shape predictable.
    results_df = pd.DataFrame(columns=['query', 'matched_text'])
print(results_df.head())

In [None]:
def _join_unique(values):
    """Return the distinct entries of ``values``, in first-seen order, joined by ', '."""
    return ', '.join(values.unique())


# Collapse to one row per query; the 'text' column holds that query's distinct
# matched_text values as a single comma-separated string.
results_df = results_df.groupby('query', as_index=False).agg(
    text=pd.NamedAgg(column='matched_text', aggfunc=_join_unique)
)

In [None]:
# Left join: every row of queries_df is kept, and the columns of results_df are
# attached wherever queries_df's 'Headline' equals results_df's 'query'.
merged_results = queries_df.merge(
    results_df,
    how='left',
    left_on='Headline',
    right_on='query',
)

In [None]:
# Remove the join key copied in from the results frame ('Headline' already
# carries the same text). Rebinding instead of inplace=True, together with
# errors='ignore', makes this cell safe to re-run: a second run no longer
# raises KeyError because 'query' is already gone.
merged_results = merged_results.drop(columns=['query'], errors='ignore')

In [None]:

# Write the merged results to disk without the DataFrame index.
# os.path.join keeps the output inside the data directory on every OS; a
# hard-coded backslash would create a file literally named
# "data\vector_store_search_results.csv" on macOS/Linux.
merged_results.to_csv(os.path.join("data", "vector_store_search_results.csv"), index=False)