In [2]:
import os
import sys

# Make the parent of the current working directory importable (a later cell
# imports from the `src` package). The membership check keeps the cell
# idempotent: re-running it does not append the same path again.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Credentials come from the environment; os.getenv yields None when a
# variable is not set, so downstream clients will fail if these are missing.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [3]:
import pandas as pd
from src.config import KNOWLEDGE_DIR
# Read the snippet file one line at a time, trim surrounding whitespace and
# keep only the lines that are non-empty after trimming.
# NOTE: the file name is reproduced exactly as spelled ("comnined").
with open(KNOWLEDGE_DIR / "player_weekly_comnined_snippets.txt", "r") as f:
    snippets = [text for text in map(str.strip, f) if text]

# One row per snippet in a single "snippet" column.
df = pd.DataFrame(data={"snippet": snippets})

In [4]:
# Display the snippet DataFrame; the notebook renders it in truncated form (first and last rows).
df

Unnamed: 0,snippet
0,"Tom Brady (QB, NE), Week 1, 2018 vs. HOU: | Pa..."
1,"Tom Brady (QB, NE), Week 2, 2018 vs. JAX: | Pa..."
2,"Tom Brady (QB, NE), Week 3, 2018 vs. DET: | Pa..."
3,"Tom Brady (QB, NE), Week 4, 2018 vs. MIA: | Pa..."
4,"Tom Brady (QB, NE), Week 5, 2018 vs. IND: | Pa..."
...,...
38563,"Trey Benson (RB, ARI), Week 10, 2024 vs. NYJ: ..."
38564,"Trey Benson (RB, ARI), Week 12, 2024 vs. SEA: ..."
38565,"Trey Benson (RB, ARI), Week 13, 2024 vs. MIN: ..."
38566,"Trey Benson (RB, ARI), Week 14, 2024 vs. SEA: ..."


In [5]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Build (from scratch) a Pinecone index holding one vector per snippet.
pc = Pinecone(api_key=PINECONE_API_KEY)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

index_name = "weekly-stats"
snippets = df["snippet"].tolist()

# Start from a clean index on every run. The deletion is guarded so the cell
# also works on a first run, when there is no index to delete yet.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embedder`
        # (expected to be 384 for the model above — TODO confirm if the model changes).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

# Embed every snippet in one call; row k corresponds to snippets[k].
weekly_embeddings = embedder.encode(snippets, show_progress_bar=True)

# Upsert in fixed-size batches. Each vector's id is the snippet's position in
# `snippets` (as a string) and the original text is kept as metadata.
batch_size = 100
for start in range(0, len(snippets), batch_size):
    batch_snippets = snippets[start:start + batch_size]
    batch_embeddings = weekly_embeddings[start:start + batch_size]

    vectors = [
        {
            "id": str(start + offset),
            # Send plain Python floats rather than a NumPy row.
            "values": embedding.tolist(),
            "metadata": {"text": text},
        }
        for offset, (embedding, text) in enumerate(zip(batch_embeddings, batch_snippets))
    ]

    index.upsert(vectors=vectors)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1206/1206 [00:21<00:00, 55.35it/s]


In [11]:
# Persist the snippets together with their embeddings. Fail loudly if `df` has
# no "embedding" column yet (e.g. on a fresh kernel where the cell that adds it
# has not been run), instead of silently writing a file without embeddings.
if "embedding" not in df.columns:
    raise KeyError(
        'df has no "embedding" column; compute the embeddings before writing '
        "snippet_embeddings.parquet"
    )
df.to_parquet(KNOWLEDGE_DIR / "snippet_embeddings.parquet")

In [12]:
# Display df again; the rendered output now shows an "embedding" column next to "snippet".
df

Unnamed: 0,snippet,embedding
0,"Tom Brady (QB, NE), Week 1, 2018 vs. HOU: | Pa...","[-0.025262221693992615, 0.012066834606230259, ..."
1,"Tom Brady (QB, NE), Week 2, 2018 vs. JAX: | Pa...","[-0.008193491958081722, 0.03533443436026573, 0..."
2,"Tom Brady (QB, NE), Week 3, 2018 vs. DET: | Pa...","[-0.008198426105082035, 0.03153906390070915, 0..."
3,"Tom Brady (QB, NE), Week 4, 2018 vs. MIA: | Pa...","[-0.010116074234247208, 0.029954370111227036, ..."
4,"Tom Brady (QB, NE), Week 5, 2018 vs. IND: | Pa...","[-0.016648583114147186, 0.05683114379644394, 0..."
...,...,...
38563,"Trey Benson (RB, ARI), Week 10, 2024 vs. NYJ: ...","[-0.02143833227455616, 0.01273783016949892, 0...."
38564,"Trey Benson (RB, ARI), Week 12, 2024 vs. SEA: ...","[-0.01764897257089615, 0.004418414086103439, 0..."
38565,"Trey Benson (RB, ARI), Week 13, 2024 vs. MIN: ...","[-0.027152298018336296, 0.013948437757790089, ..."
38566,"Trey Benson (RB, ARI), Week 14, 2024 vs. SEA: ...","[-0.01902417652308941, 0.0008071899646893144, ..."


In [13]:
import faiss
import numpy as np

# Stack the per-row embeddings stored in df into a single float32 matrix
# (one row per snippet), converting in a single step.
embedding_matrix = np.asarray(df["embedding"].to_list(), dtype="float32")

# Flat L2 FAISS index over all snippet embeddings.
# NOTE(review): `index` is also the name bound to the Pinecone index in an
# earlier cell; this assignment rebinds it to the FAISS index.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

query = "Josh Allen rushing yards week 4"
# NOTE(review): `client` is not defined in any cell visible here — presumably
# an OpenAI client created elsewhere; confirm it exists on a fresh kernel.
q_embed = client.embeddings.create(
    model="text-embedding-3-small",
    input=query
).data[0].embedding

# The query must be embedded with the same model as df["embedding"]; check the
# dimensionality up front so a mismatch produces a readable error.
if len(q_embed) != embedding_matrix.shape[1]:
    raise ValueError(
        f"Query embedding has {len(q_embed)} dimensions but the stored embeddings "
        f"have {embedding_matrix.shape[1]}; embed the query with the same model "
        "that produced df['embedding']."
    )

# D holds the distances and I the row positions of the 5 nearest snippets.
D, I = index.search(np.array([q_embed], dtype="float32"), k=5)

# Show the matching rows, nearest first.
df.iloc[I[0]]

Unnamed: 0,snippet,embedding
24341,"Josh Allen (QB, BUF), Week 4, 2022 vs. BAL: | ...","[-0.03242037445306778, 0.033873703330755234, 0..."
14409,"Josh Allen (QB, BUF), Week 4, 2020 vs. LV: | P...","[-0.040227457880973816, 0.010719715617597103, ..."
19361,"Josh Allen (QB, BUF), Week 4, 2021 vs. HOU: | ...","[-0.04075894504785538, 0.00806802324950695, 0...."
29325,"Josh Allen (QB, BUF), Week 4, 2023 vs. MIA: | ...","[-0.025417305529117584, 0.013657893054187298, ..."
9729,"Josh Allen (QB, BUF), Week 4, 2019 vs. NE: | P...","[-0.04557603970170021, 0.017903324216604233, 0..."
