In [1]:
import os

import dotenv
import pandas as pd
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [2]:
# --- CONFIGURATION ---
# Pull variables from a .env file into the process environment
# (load_dotenv is imported in the notebook's import cell).
load_dotenv()
# Gemini API key; os.getenv returns None when GEMINI_API_KEY is not set.
key = os.getenv("GEMINI_API_KEY")
# File locations
# Built with os.path.join so the separator matches the host OS. A hard-coded
# "data\\training_prep.csv" only resolves on Windows; on other systems the
# backslash is treated as part of the file name and the file is never found.
INPUT_FILE = os.path.join("data", "training_prep.csv")
MODEL_OUTPUT_PATH = "faiss_gemini_model_multi"

In [4]:
# Read the prepared training data into a DataFrame. A relative INPUT_FILE is
# resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
# Preview the first five rows as the cell's rich output.
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\training_prep.csv'

In [None]:
# Remove rows that repeat an earlier (Headline, Category) pair; the first
# occurrence of each pair is the one that is kept.
df = df.drop_duplicates(
    subset=["Headline", "Category"],
    keep="first",
)

In [None]:

# Column roles are positional (0-based): index 1 -- i.e. the SECOND column --
# supplies the text to embed, and every column from index 2 onward is kept as
# metadata. Column 0 is not used by either selection.
# NOTE(review): confirm that index 1 (rather than 0) is the intended text column.
text_col_name, metadata_col_names = df.columns[1], df.columns[2:]

In [None]:

# Show which column will be embedded and which columns travel along as metadata.
print(
    f"Embedding Column: '{text_col_name}'",
    f"Metadata Columns: {list(metadata_col_names)}",
    sep="\n",
)

In [None]:
# Build one Document per DataFrame row:
#   * page_content is the text column's value converted to str
#   * metadata maps each metadata column name to that row's value, converted
#     to str (the original rationale: FAISS serialization compatibility)
# NOTE(review): str() renders a missing value (NaN) as the literal text "nan",
# in both page_content and metadata -- confirm that is acceptable.
documents = [
    Document(
        page_content=str(record[text_col_name]),
        metadata={field: str(record[field]) for field in metadata_col_names},
    )
    for _, record in df.iterrows()
]

print(f"Prepared {len(documents)} documents.")

In [None]:
# --- TEST: Minimal Gemini Embedding Call ---
# Smoke test: embed one short string, then report the vector's length and its
# first five values. Any exception is caught and printed, so this cell never
# raises.
try:
    probe_client = GoogleGenerativeAIEmbeddings(
        google_api_key=key,
        model="models/text-embedding-004",
    )
    probe_vector = probe_client.embed_query("hello world")

    print("Test embedding result (length):", len(probe_vector))
    print("Test embedding (first 5 values):", probe_vector[:5])
except Exception as err:
    print("Embedding API test failed:", err)

In [None]:
# Step 2: create the embedding client used to vectorise every document.
print("--- Step 2: Initializing Gemini Embeddings ---")
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=key,
    model="models/text-embedding-004",
)

# Step 3: embed all prepared documents and build the FAISS vector store from them.
print("--- Step 3: Creating Vector Embeddings ---")
vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)

In [None]:
# Step 4: persist the FAISS vector store to the MODEL_OUTPUT_PATH folder.
print("--- Step 4: Saving Model ---")
vector_store.save_local(folder_path=MODEL_OUTPUT_PATH)
print(f"Saved to '{MODEL_OUTPUT_PATH}'")