In [2]:
import pandas as pd
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

In [5]:
# --- CONFIGURATION ---
# The Gemini API key is a secret and must not be stored in the notebook.
# Export GOOGLE_API_KEY in the environment before starting the kernel.
# Any key that was previously committed in this cell should be revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "").strip()
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment before running this notebook."
    )

# Project locations. The default root reproduces the original absolute paths;
# set NEWSCLASSIFICATION_ROOT to run the notebook from a different checkout.
PROJECT_ROOT = os.environ.get(
    "NEWSCLASSIFICATION_ROOT", "/workspaces/codespacerepo/newsclassification"
)
INPUT_FILE = f"{PROJECT_ROOT}/data/Train.csv"
MODEL_OUTPUT_PATH = f"{PROJECT_ROOT}/faiss_gemini_model_multi"

# Write the whitespace-stripped key back so any code that reads the
# environment variable directly sees the same value as GOOGLE_API_KEY.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [6]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
df.head(n=5)

Unnamed: 0,Id,Headline,Category
0,1,Breakthrough discovery in cyber research,Science
1,2,Government focuses on youth development programs,Society
2,3,health party wins majority in assembly elections,Politics
3,4,Startups in digital sector attract global inve...,Business
4,5,Sensex surges by technology points amid market...,Business


In [7]:

# Columns are picked by position: index 0 is skipped, index 1 supplies the
# text to embed, and every column from index 2 onwards becomes metadata.
text_col_name, metadata_col_names = df.columns[1], df.columns[2:]

In [8]:

# Show which column will be embedded and which columns travel as metadata.
print(
    f"Embedding Column: '{text_col_name}'",
    f"Metadata Columns: {list(metadata_col_names)}",
    sep="\n",
)

Embedding Column: 'Headline'
Metadata Columns: ['Category']


In [9]:
# Build one Document per row that has usable text.
#
# str() on a missing value yields the literal text "nan", which would be
# embedded and indexed as if it were a real headline. Rows whose text is
# missing or blank are therefore filtered out before conversion.
text_series = df[text_col_name]
has_text = text_series.notna() & text_series.astype(str).str.strip().ne("")
skipped_rows = int((~has_text).sum())

documents = [
    Document(
        # 1. The text that will be embedded.
        page_content=str(row[text_col_name]),
        # 2. Every metadata column, stringified as in the original cell
        #    (the original comment cites FAISS serialization compatibility).
        metadata={col: str(row[col]) for col in metadata_col_names},
    )
    for _, row in df[has_text].iterrows()
]

print(f"Prepared {len(documents)} documents.")
if skipped_rows:
    # Only printed when something was dropped, so clean data keeps the original output.
    print(f"Skipped {skipped_rows} rows with missing or empty '{text_col_name}'.")

Prepared 5000 documents.


In [10]:
# Step 2: configure the Gemini embedding client with the key from the config cell.
print("--- Step 2: Initializing Gemini Embeddings ---")
embedding_options = {
    "model": "models/text-embedding-004",
    "google_api_key": GOOGLE_API_KEY,
}
embeddings = GoogleGenerativeAIEmbeddings(**embedding_options)

# Step 3: embed every prepared document and build the FAISS store from them.
print("--- Step 3: Creating Vector Embeddings ---")
vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)

--- Step 2: Initializing Gemini Embeddings ---
--- Step 3: Creating Vector Embeddings ---


In [12]:
# Step 4: write the FAISS vector store to disk under MODEL_OUTPUT_PATH.
print("--- Step 4: Saving Model ---")
vector_store.save_local(folder_path=MODEL_OUTPUT_PATH)
print(f"Saved to '{MODEL_OUTPUT_PATH}'")

--- Step 4: Saving Model ---
Saved to '/workspaces/codespacerepo/newsclassification/faiss_gemini_model_multi'
