In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.neighbors import KNeighborsRegressor
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [2]:
# Set OpenAI API Key
os.environ["OPENAI_API_KEY"] = "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"  # <-- Replace with your key or use environment variable

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import json
import re

# Load data
m3000 = pd.read_csv("/Users/leviwarren/Downloads/cbr3000/3kmn.csv")
trackcor_df = pd.read_csv("/Users/leviwarren/Downloads/cbr3000/trackcor.csv")
trackcor_3000 = trackcor_df[trackcor_df['distance'] == 3000].drop(columns='distance').to_dict('records')[0]

# Prepare case base
pbs = m3000[m3000['pb'] == True]
non_pbs = m3000[m3000['pb'] == False]
cb = pd.merge(pbs, non_pbs, on='PersonID', suffixes=('_p', '_n'))

def predict_pb(q, cb, ft_range=500, age_range=5, adjust=False, npb=True, pb500=False, pb1000=False, pb1500=False, sameTrack=False, noCases=10):
    q_gender = q['Gender_n']
    q_track = q['Track_n']
    
    # Filter case base
    cases = cb[cb['Gender_n'] == q_gender]
    cases = cases[cases['PersonID'] != q['PersonID']]
    
    if age_range > 0:
        cases = cases[(cases['age_perf_p'] < q['age_perf_n'] + age_range) & (cases['age_perf_p'] > q['age_perf_n'] - age_range)]
    if ft_range > 0:
        cases = cases[(cases['endtime_n'] < q['endtime_n'] + ft_range) & (cases['endtime_n'] > q['endtime_n'] - ft_range)]
    if sameTrack:
        cases = cases[cases['Track_n'] == q_track]

    # Build regex for selecting variables
    reg_parts = []
    if pb500: reg_parts.append(r'^r_500_._n')
    if pb1000: reg_parts.append(r'^r_1000_._n')
    if pb1500: reg_parts.append(r'^r_1500_._n')
    if npb: reg_parts.append(r'^r_._n')

    reg = '|'.join(reg_parts)
    if not reg:
        raise ValueError("No features selected for prediction.")
    
    # Select features
    selected_cols = [col for col in cases.columns if re.search(reg, col)]
    selected_cols += ['PersonID', 'endtime_n', 'Track_n', 'age_perf_p']
    cases = cases[selected_cols].drop_duplicates()
    
    nk = min(noCases, len(cases))
    if nk == 0:
        return None  # No neighbors found

    # Prepare training data
    y = cases['endtime_n']
    X = cases[[col for col in selected_cols if col not in ['PersonID', 'endtime_n', 'Track_n', 'age_perf_p']]]

    # Query
    X_t = q[[col for col in X.columns]]

    # Fit KNN
    knn = KNeighborsRegressor(n_neighbors=nk)
    knn.fit(X, y)
    pred = knn.predict(X_t.values.reshape(1, -1))[0]

    if adjust:
        pred *= trackcor_3000[q['Track_p']]

    # Find neighbors for pace prediction
    distances, indices = knn.kneighbors(X_t.values.reshape(1, -1), n_neighbors=nk)
    nn = cases.iloc[indices[0]]

    # Relative paces
    rel_pace_cols = [col for col in nn.columns if re.match(r'^dr._p', col)]
    if rel_pace_cols:
        rel_paces = nn[rel_pace_cols].mean()
        pred_pb_paces = rel_paces * (pred / 7.5)
    else:
        pred_pb_paces = np.full(8, np.nan)

    return {
        "predicted_time": float(pred),
        "predicted_paces": pred_pb_paces.tolist(),
        "similar_cases": nn.to_dict(orient='records')
    }

# Run predictions for all skaters
predictions = {}

for person_id, q in cb.groupby('PersonID'):
    q_sample = q.iloc[0]
    pred = predict_pb(q_sample, cb, adjust=True, age_range=5, sameTrack=False, npb=True, pb1000=True, pb500=True, noCases=10)
    if pred:
        name = q_sample['Name_p']
        predictions[name] = pred

# Save to JSON
with open('/Users/leviwarren/Downloads/cbr3000/predictions_3000m.json', 'w') as f:
    json.dump(predictions, f, indent=2)

print("Predictions saved to 'predictions_3000m.json'")

In [3]:
info = pd.read_csv("/Users/leviwarren/Downloads/cbr3000/3kmn.csv")

In [4]:
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from pprint import pprint

file_path='/Users/leviwarren/Downloads/cbr3000/predictions_3000m.json'
data = json.loads(Path(file_path).read_text())

In [4]:
from langchain_text_splitters import RecursiveJsonSplitter

splitter = RecursiveJsonSplitter(max_chunk_size=500)
json_chunks = splitter.split_json(json_data=data)

In [5]:
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load CSV
loader_csv = CSVLoader(file_path="/Users/leviwarren/Downloads/cbr3000/3kmn.csv")
person_info = loader_csv.load()

# Each document corresponds to a row. Now split them if needed.
splitter_personal_info = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
personal_information = splitter_personal_info.split_documents(person_info)

In [15]:
personal_information

[Document(metadata={'source': '/Users/leviwarren/Downloads/cbr3000/3kmn.csv', 'row': 0}, page_content='PersonID: 0024fb74-902b-4c26-a2d2-78752c24cdd4\nName: Pelle Hoek\nGender: H\nTrack: EN\nDate: 2018-02-18\nDateBirth: 1993-02-16\nage: 23.9068493150685\ncat: sen\nage_perf: 25.0219178082192\nendtime: 26805\nPB: 0\nr_0: 2036\nr_1: 3376\nr_2: 3436\nr_3: 3448\nr_4: 3538\nr_5: 3579\nr_6: 3638\nr_7: 3754\nallrank: 2\nrankperc: 0.357904761904755\nrank: 0\ndr0: 0.569669837716844\ndr1: 0.944599888080582\ndr2: 0.961387800783436\ndr3: 0.964745383324007\ndr4: 0.989927252378288\ndr5: 1.00139899272524\ndr6: 1.01790710688304'),
 Document(metadata={'source': '/Users/leviwarren/Downloads/cbr3000/3kmn.csv', 'row': 0}, page_content='dr7: 1.05036373810856\npb: TRUE\ncluster3: 2\ncluster4: 1\ncluster5: 2\nar_0: 2073.96339358828\nar_1: 3438.94912414246\nar_2: 3500.06788819712\nar_3: 3512.29164100806\nar_4: 3603.96978709005\nar_5: 3645.73427586074\nar_6: 3705.83439384783\nar_7: 3823.99733768685\naendtime: 2

In [6]:
from langchain.schema import Document

documents_predictions = []

for chunk in json_chunks:
    # chunk is like { 'Lianne van Loon': {...} }
    for athlete_name, details in chunk.items():
        # Create one document for predicted time
        doc_predicted_time = Document(
            page_content=f"Athlete: {athlete_name}\nPredicted Time: {details['predicted_time']}",
            metadata={"type": "prediction"}
        )
        documents_predictions.append(doc_predicted_time)
        
        # Create one document per similar case
        for case in details.get('similar_cases', []):
            text = f"Athlete: {athlete_name}\nSimilar Case:\n{json.dumps(case, indent=2)}"
            documents_predictions.append(
                Document(page_content=text, metadata={"type": "similar_case"})
            )



In [16]:
documents_predictions

[Document(metadata={'type': 'prediction'}, page_content='Athlete: Anne den Uil\nPredicted Time: 28442.046252135562'),
 Document(metadata={'type': 'similar_case'}, page_content='Athlete: Anne den Uil\nSimilar Case:\n{\n  "r_0_n": 2291,\n  "r_1_n": 3621,\n  "r_2_n": 3712,\n  "r_3_n": 3823,\n  "r_4_n": 3928,\n  "r_5_n": 3964,\n  "r_6_n": 4037,\n  "r_7_n": 4093,\n  "r_500_0_n": 1253,\n  "r_500_1_n": 3262,\n  "r_1000_0_n": 2164,\n  "r_1000_1_n": 3309,\n  "r_1000_2_n": 3460,\n  "PersonID": "ce460584-c871-478f-821a-49054c058f68",\n  "endtime_n": 29469,\n  "Track_n": "LE",\n  "age_perf_p": 25.4109589041096\n}'),
 Document(metadata={'type': 'similar_case'}, page_content='Athlete: Anne den Uil\nSimilar Case:\n{\n  "r_0_n": 2319,\n  "r_1_n": 3612,\n  "r_2_n": 3747,\n  "r_3_n": 3871,\n  "r_4_n": 3901,\n  "r_5_n": 3989,\n  "r_6_n": 4035,\n  "r_7_n": 4033,\n  "r_500_0_n": 1230,\n  "r_500_1_n": 3215,\n  "r_1000_0_n": 2102,\n  "r_1000_1_n": 3295,\n  "r_1000_2_n": 3625,\n  "PersonID": "50824234-51a9-4d

In [12]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
# Define the path where you want to save the DB
persist_dir = "/Users/leviwarren/Downloads/cbr3000/chroma_personal_info.db"

# Initialize the embedding model
embedding_model_personal_info = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the vector store with a persist directory
vector_store_personal_info = Chroma.from_documents(
    documents=personal_information,
    embedding=embedding_model_personal_info,
    persist_directory=persist_dir
)


In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
# Define the path where you want to save the DB
persist_dir_predictions = "/Users/leviwarren/Downloads/cbr3000/chroma_predictions.db"

# Initialize the embedding model
embedding_model_predictions = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the vector store with a persist directory
vector_store_predictions = Chroma.from_documents(
    documents=documents_predictions,
    embedding=embedding_model_predictions,
    persist_directory=persist_dir_predictions
)

In [5]:
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Je bent een schaatscoach die uitlegt hoe het voorspellingmodel werkt. Geef altijd heldere, begrijpelijke uitleg in het Nederlands. Gebruik voorbeelden zoals:

De races van de tien vergelijkbare schaatsers zijn gekozen omdat hun prestaties en race-opbouw lijken op die van jou. We hebben gekeken naar je persoonlijke records (PR's), je rondetijden, leeftijd en eindtijd.

Beantwoord de vraag volledig en gebruik eenvoudige taal.

Gebruik de volgende informatie over het model als achtergrondinformatie:
- Het model voorspelt je 3000m tijd door tien vergelijkbare schaatsers te zoeken. Hierbij wordt gekeken naar leeftijd (±5 jaar), persoonlijke records op 500m, 1000m, 1500m, de eindtijd van eerdere races (±5 sec), en pacing per ronde.
- We passen een track-correctie toe afhankelijk van de ijsbaan waarop je gereden hebt, zoals vastgelegd in het trackcorrectie schema.
- De uiteindelijke voorspelde rondetijden komen voort uit het gemiddelde relatieve pacing patroon van de vergelijkbare schaatsers, aangepast naar jouw voorspelde eindtijd.
- Als je jouw 1000m PR verbetert, heeft dit invloed op de selectie van vergelijkbare schaatsers en daarmee op de voorspelling van je 3000m tijd.
- Het model gebruikt een Case-Based Reasoning aanpak: geen puur statistisch model, maar leren van eerdere vergelijkbare prestaties.

Context:
{context}

Vraag:
{question}

Antwoord:
"""
)

In [6]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)

  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)


In [7]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Path to the Chroma database
VECTOR_STORE_PATH_PREDICTIONS = "/Users/leviwarren/Downloads/cbr3000/chroma_predictions.db"

# Initialize the embedding model (ensure it matches the one used during embedding)
embedding_model_predictions = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load the vector store from the specified path
vector_store_predictions = Chroma(
    persist_directory=VECTOR_STORE_PATH_PREDICTIONS,
    embedding_function=embedding_model_predictions
)


  embedding_model_predictions = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  vector_store_predictions = Chroma(


In [8]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Path to the Chroma database
VECTOR_STORE_PATH_INFO = "/Users/leviwarren/Downloads/cbr3000/chroma_personal_info.db"

# Initialize the embedding model (ensure it matches the one used during embedding)
embedding_model_personal_info  = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load the vector store from the specified path
vector_store_personal_info = Chroma(
    persist_directory=VECTOR_STORE_PATH_INFO,
    embedding_function=embedding_model_personal_info 
)

In [9]:
# Build LangChain RAG
retriever_predictions = vector_store_predictions.as_retriever(search_kwargs={"k": 3})
retriever_personal_info = vector_store_personal_info.as_retriever(search_kwargs={"k": 3})

In [10]:
from langchain.retrievers import EnsembleRetriever
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever_predictions, retriever_personal_info], weights=[0.5, 0.5]
)

In [11]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=ensemble_retriever,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": custom_prompt}
)

In [14]:
from langchain.tools import tool

@tool
def get_prediction_by_name(name: str) -> dict:
    """Retrieve the prediction results for a specific athlete by name, including pacing and similar cases."""
    return data.get(name, {"error": f"No prediction found for {name}"})


In [15]:
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

tools = [get_prediction_by_name]

agent_chain = initialize_agent(
    tools,
    llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)


  agent_chain = initialize_agent(


In [17]:
from langchain.tools import tool
@tool
def get_lap_times(track: str, name: str) -> dict:
    """
    Retrieve the lap times (r_0 to r_7) for a specific Track and Name.
    """
    # Filter the DataFrame for the specific Track and Name
    filtered_data = info[(info['Track'] == track) & (info['Name'] == name)]
    
    if filtered_data.empty:
        return {"error": f"No data found for Track: {track} and Name: {name}"}
    
    # Select the columns r_0 to r_7
    columns_of_interest = [f"r_{i}" for i in range(8)]
    lap_times = filtered_data[columns_of_interest].to_dict(orient='records')
    
    return {"lap_times": lap_times}

In [18]:
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

tools = [get_lap_times]

agent_chain2 = initialize_agent(
    tools,
    llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

In [19]:
question = "Wat zijn mijn rondetijden op de Track UT? Mijn naam is Lianne van Loon."
result = agent_chain2.run(question)
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `get_lap_times` with `{'track': 'UT', 'name': 'Lianne van Loon'}`


[0m[36;1m[1;3m{'lap_times': [{'r_0': 2266, 'r_1': 3755, 'r_2': 3940, 'r_3': 4098, 'r_4': 4160, 'r_5': 4383, 'r_6': 4442, 'r_7': 4561}, {'r_0': 2199, 'r_1': 3470, 'r_2': 3682, 'r_3': 3829, 'r_4': 3959, 'r_5': 4120, 'r_6': 4196, 'r_7': 4183}]}[0m[32;1m[1;3mHier zijn je rondetijden op de Track UT, Lianne van Loon:

1. Set 1:
   - Ronde 0: 2266 ms
   - Ronde 1: 3755 ms
   - Ronde 2: 3940 ms
   - Ronde 3: 4098 ms
   - Ronde 4: 4160 ms
   - Ronde 5: 4383 ms
   - Ronde 6: 4442 ms
   - Ronde 7: 4561 ms

2. Set 2:
   - Ronde 0: 2199 ms
   - Ronde 1: 3470 ms
   - Ronde 2: 3682 ms
   - Ronde 3: 3829 ms
   - Ronde 4: 3959 ms
   - Ronde 5: 4120 ms
   - Ronde 6: 4196 ms
   - Ronde 7: 4183 ms

Als je meer informatie nodig hebt, laat het me weten![0m

[1m> Finished chain.[0m
Hier zijn je rondetijden op de Track UT, Lianne van Loon:

1. Set 1:
   - Ronde 0

In [16]:
question = "Wat is mijn voorspelde tijd op de 3000m? Mijn naam is Anne den Uil?"
result = agent_chain.run(question)
print(result)

  result = agent_chain.run(question)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `get_prediction_by_name` with `{'name': 'Anne den Uil'}`


[0m[36;1m[1;3m{'predicted_time': 28442.046252135562, 'predicted_paces': [nan, nan, nan, nan, nan, nan, nan, nan], 'similar_cases': [{'r_0_n': 2291, 'r_1_n': 3621, 'r_2_n': 3712, 'r_3_n': 3823, 'r_4_n': 3928, 'r_5_n': 3964, 'r_6_n': 4037, 'r_7_n': 4093, 'r_500_0_n': 1253, 'r_500_1_n': 3262, 'r_1000_0_n': 2164, 'r_1000_1_n': 3309, 'r_1000_2_n': 3460, 'PersonID': 'ce460584-c871-478f-821a-49054c058f68', 'endtime_n': 29469, 'Track_n': 'LE', 'age_perf_p': 25.4109589041096}, {'r_0_n': 2319, 'r_1_n': 3612, 'r_2_n': 3747, 'r_3_n': 3871, 'r_4_n': 3901, 'r_5_n': 3989, 'r_6_n': 4035, 'r_7_n': 4033, 'r_500_0_n': 1230, 'r_500_1_n': 3215, 'r_1000_0_n': 2102, 'r_1000_1_n': 3295, 'r_1000_2_n': 3625, 'PersonID': '50824234-51a9-4d8e-ae24-def837d916f0', 'endtime_n': 29507, 'Track_n': 'EN', 'age_perf_p': 24.1178082191781}, {'r_0_n': 2336, 'r_1_n': 3598, 'r_2_n': 3665, 'r_3_n

In [20]:
question = "Welke factoren bepalen de vergelijkbare schaatsers?"
result = chain.invoke({"question": question, "chat_history": []})
print(result["answer"])  # Dit geeft alleen het antwoord

De vergelijkbare schaatsers worden bepaald aan de hand van verschillende factoren die relevant zijn voor jouw prestaties. Hier zijn de belangrijkste factoren:

1. **Leeftijd**: We kijken naar schaatsers die ongeveer dezelfde leeftijd hebben als jij, met een verschil van maximaal 5 jaar. Dit helpt om te zorgen dat we vergelijkbare fysieke mogelijkheden en ervaring in overweging nemen.

2. **Persoonlijke Records (PR's)**: We vergelijken je persoonlijke records op de afstanden 500m, 1000m, en 1500m. Schaatsers met PR's die in de buurt liggen van die van jou worden geselecteerd. Bijvoorbeeld, als jouw 1000m PR 8569 seconden is, zoeken we andere schaatsers met vergelijkbare tijden.

3. **Eindtijd van eerdere races**: We houden rekening met de eindtijden van jouw eerdere races, en selecteren schaatsers wiens eindtijden binnen 5 seconden van jouw tijden liggen. Dit zorgt ervoor dat we schaatsers kiezen die vergelijkbare raceprestaties hebben laten zien.

4. **Pacing per ronde**: We analyseren

In [13]:
for doc in result["source_documents"]:
    print(doc.page_content)

aendtime_1500: 15549.7094226163
Athlete: Henk den Uil
Predicted Time: 32142.2
Athlete: Joost van het Kaar
Predicted Time: 29122.260317572658
Athlete: Ilse Zijsveld
Predicted Time: 29060.958517590196


In [None]:
questions = [
    "Hoe wordt mijn 3000m voorspeld?",
    "Welke factoren bepalen de vergelijkbare schaatsers?",
    "Wat gebeurt er als ik mijn 1000m tijd verbeter?",
    "Op welke rondes verlies ik het meeste snelheid?",
]