In [1]:
import pandas as pd
import dotenv
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

In [5]:
# --- CONFIGURATION ---
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment before the API key is looked up.
load_dotenv()
# os.getenv returns None when GEMINI_API_KEY is not set.
key = os.getenv("GEMINI_API_KEY")

# File locations.
# Paths are assembled with os.path.join so the separator matches the host OS;
# a hard-coded backslash only resolves to the data folder on Windows.
INPUT_FILE = os.path.join("data", "train_clean.csv")
MODEL_OUTPUT_PATH = "faiss_gemini_model_multi"

In [6]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
df.head(n=5)

Unnamed: 0,Id,Category,Headline
0,1,Science,breakthrough discovery cyber research breakthr...
1,2,Society,government focus youth development program gov...
2,3,Politics,health party win majority assembly election he...
3,4,Business,startup digital sector attract global investor...
4,5,Business,sensex surge technology amid market optimism s...


In [7]:
# Keep only the first occurrence of each ('Headline', 'Category') pair;
# later rows repeating the same pair are discarded.
is_repeat = df.duplicated(subset=['Headline', 'Category'], keep='first')
df = df[~is_repeat]

In [8]:

# Column roles are assigned by position: the column at index 1 (i.e. the
# second column) supplies the text to embed, and every column from index 2
# onwards is carried along as metadata.
# NOTE(review): for the frame previewed above this resolves to 'Category' as
# the embedded text and 'Headline' as metadata — confirm that is intended,
# since the queries used later in the notebook are headlines.
column_labels = df.columns
text_col_name = column_labels[1]
metadata_col_names = column_labels[2:]

In [9]:

# Report which column is embedded and which columns travel as metadata.
for summary_line in (
    f"Embedding Column: '{text_col_name}'",
    f"Metadata Columns: {list(metadata_col_names)}",
):
    print(summary_line)

Embedding Column: 'Category'
Metadata Columns: ['Headline']


In [10]:
# Turn every DataFrame row into one Document:
#   * page_content is the text column, coerced to str;
#   * metadata maps each metadata column name to that row's value, also
#     coerced to str (per the original author's note, this is meant to keep
#     the metadata compatible with FAISS serialization).
documents = [
    Document(
        page_content=str(record[text_col_name]),
        metadata={name: str(record[name]) for name in metadata_col_names},
    )
    for _, record in df.iterrows()
]

print(f"Prepared {len(documents)} documents.")

Prepared 651 documents.


In [11]:
# --- TEST: Minimal Gemini Embedding Call ---
# Smoke test: embed one short string and show the vector length and its first
# five values. Any failure is reported via print instead of being raised, so
# the notebook keeps running even when the API call does not succeed.
try:
    probe_vector = GoogleGenerativeAIEmbeddings(
        google_api_key=key,
        model="models/text-embedding-004",
    ).embed_query("hello world")

    print("Test embedding result (length):", len(probe_vector))
    print("Test embedding (first 5 values):", probe_vector[:5])
except Exception as err:
    print("Embedding API test failed:", err)

Test embedding result (length): 768
Test embedding (first 5 values): [0.023418045, -0.015917433, -0.03490989, -0.0002421863, -0.00988026]


In [12]:
# Step 2: create the embeddings client used for both indexing and querying.
print("--- Step 2: Initializing Gemini Embeddings ---")
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=key,
    model="models/text-embedding-004",
)

# Step 3: build a FAISS vector store from the prepared documents using that
# embeddings client.
print("--- Step 3: Creating Vector Embeddings ---")
vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)

--- Step 2: Initializing Gemini Embeddings ---
--- Step 3: Creating Vector Embeddings ---


In [13]:
# Persist the in-memory FAISS vector store to a local folder so it can be
# reloaded later without re-embedding the documents.
print("--- Step 4: Saving Model ---")
vector_store.save_local(folder_path=MODEL_OUTPUT_PATH)
print(f"Saved to '{MODEL_OUTPUT_PATH}'")

--- Step 4: Saving Model ---
Saved to 'faiss_gemini_model_multi'


In [14]:
# --- Step 5: Load FAISS Vector Store and Query ---

from langchain_community.vectorstores import FAISS

In [15]:

# Reload the FAISS vector store from the folder it was saved to.
# allow_dangerous_deserialization=True opts in to loading pickled data; that
# is acceptable here only because the folder was written by this notebook's
# own save step. Do not point MODEL_OUTPUT_PATH at an untrusted folder.
print("--- Step 5: Loading Model ---")
vector_store = FAISS.load_local(
    folder_path=MODEL_OUTPUT_PATH,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

--- Step 5: Loading Model ---


In [16]:

# Load the queries file from test_clean.csv.
# The path is assembled with os.path.join so the separator matches the host
# OS; a hard-coded backslash only resolves to the data folder on Windows.
QUERIES_FILE = os.path.join("data", "test_clean.csv")
queries_df = pd.read_csv(QUERIES_FILE)  # Using the test input file for demonstration
# One query per row of the 'Headline' column, in file order.
queries = queries_df['Headline'].tolist()

In [17]:
# Run every query against the vector store and keep its 5 nearest documents.
# Each hit becomes one record holding the query, the matched page content and
# all of the hit's metadata fields (metadata is unpacked last, so a metadata
# key named 'query' or 'matched_text' would override those entries).
print("--- Step 6: Querying Vector Store ---")
results = [
    {
        'query': query,
        'matched_text': hit.page_content,
        **hit.metadata,
    }
    for query in queries
    for hit in vector_store.similarity_search(query, k=5)
]

# Collect the records into a DataFrame and show its first rows.
results_df = pd.DataFrame(results)
print(results_df.head())

--- Step 6: Querying Vector Store ---
                                               query matched_text  \
0  rbi announce woman policy framework rbi announ...     Business   
1  rbi announce woman policy framework rbi announ...     Business   
2  rbi announce woman policy framework rbi announ...     Business   
3  rbi announce woman policy framework rbi announ...     Business   
4  rbi announce woman policy framework rbi announ...     Business   

                                            Headline  
0  startup digital sector attract global investor...  
1  sensex surge technology amid market optimism s...  
2  rbi announce climate policy framework rbi anno...  
3  india sign trade deal health nation india sign...  
4  india sign trade deal digital nation india sig...  


In [18]:
def join_unique(values):
    """Return the distinct entries of ``values``, in first-seen order, joined by ', '."""
    return ', '.join(values.unique())


# Collapse the hits to one row per query: the new 'text' column holds the
# distinct 'matched_text' values found for that query.
# Note: this rebinds results_df to the aggregated frame, so the cell expects
# the un-aggregated frame (with a 'matched_text' column) as its input.
results_df = results_df.groupby('query', as_index=False).agg(
    text=('matched_text', join_unique)
)

In [25]:
# Left-join the aggregated search results onto the original queries frame:
# queries_df['Headline'] is matched against results_df['query'], and every
# row of queries_df is kept even when it has no match.
merged_results = queries_df.merge(
    results_df,
    how='left',
    left_on='Headline',
    right_on='query',
)

In [26]:
# Remove the join key brought in from results_df; it duplicates 'Headline'.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because 'query' has already been dropped.
merged_results = merged_results.drop(columns=['query'], errors='ignore')

In [27]:

# Write the merged results to CSV without the DataFrame index.
# The path is assembled with os.path.join so the file lands inside the data
# folder on any OS; with a hard-coded backslash, non-Windows systems would
# treat the backslash as part of the file name.
merged_results.to_csv(
    os.path.join("data", "vector_store_search_results.csv"),
    index=False,
)