In [1]:
# imports

import os
from tqdm import tqdm
from dotenv import load_dotenv
import numpy as np
import pickle
from pinecone import Pinecone
from testing import Tester
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CONSTANTS
# QUERY: the question text that prefixes a pricing prompt. It is not referenced
# elsewhere in the visible notebook; description() below matches on the tail of
# this same phrase.
QUERY = "How much does this cost to the nearest dollar?\n\n"
# STORE: name of the Pinecone index opened via pc.Index(STORE).
STORE = "products"

In [3]:
# Read the .env file into os.environ; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# os.environ only accepts strings, so copying an unset key with
# `os.environ[k] = os.getenv(k)` dies with "TypeError: str expected, not NoneType"
# (and is a no-op when the key is set). Check for the required credentials
# explicitly instead, and name the ones that are absent.
REQUIRED_ENV_VARS = ('GROQ_API_KEY', 'HF_TOKEN', 'PINECONE_API_KEY')
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_env_vars)
        + ". Add them to your .env file or export them before running this notebook."
    )

In [4]:
# Read the pickled test set from disk and bind it to `test`

with open('test.pkl', mode='rb') as fh:
    test = pickle.load(fh)

In [5]:
# Initialize Pinecone: build a client from the PINECONE_API_KEY environment
# variable (a KeyError here means the key is not set).
pinecone_api = os.environ['PINECONE_API_KEY']

pc = Pinecone(api_key=pinecone_api)

# Connect to the existing index whose name is held in STORE; `index` is the
# handle used for fetches below and passed to the agents as `collection`.
index = pc.Index(STORE)


In [12]:
# Ids to request from the index: doc_0 through doc_19999
ids = [f"doc_{n}" for n in range(20_000)]

vectors, documents, prices = [], [], []

# Pull the records down 1000 ids per request
for start in tqdm(range(0, len(ids), 1000)):
    fetched = index.fetch(ids=ids[start:start + 1000]).vectors

    for record in fetched.values():
        meta = record.metadata
        vectors.append(record.values)
        # NOTE: despite its name, `documents` collects the "category" metadata
        # field (the "documents" field is not read here).
        documents.append(meta["category"])
        prices.append(meta["price"])


100%|██████████| 20/20 [03:01<00:00,  9.05s/it]


# Random Forest

We will now train a Random Forest model.

Can you spot the difference from what we did in Week 6? In Week 6 we used the word2vec model to form vectors; this time we'll use the vectors we already have in Pinecone, from the SentenceTransformer model.

In [13]:
# This next line takes an hour on my M1 Mac!
# Fits a 100-tree random forest on the vectors fetched from Pinecone, with the
# metadata prices as the regression target. random_state=42 fixes the seed;
# n_jobs=-1 asks scikit-learn to use all available CPU cores.
# NOTE(review): `prices` is rebound to test-set prices in a later cell, so this
# cell must run before that one (or after re-running the fetch cell).

rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(vectors, prices)

In [14]:
# Save the fitted random forest to random_forest_model.pkl in the working
# directory; the same file name is read back by joblib.load below.

joblib.dump(rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [6]:
# Rebind rf_model to the estimator stored in random_forest_model.pkl
rf_model = joblib.load('random_forest_model.pkl')

In [7]:
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent

In [8]:
# Instantiate the three pricing agents; each is later called via .price(text).
# Only the frontier agent is handed the Pinecone index handle (as `collection`).
# NOTE(review): running this cell emitted a `modal.Cls.lookup` deprecation
# message, presumably from one of these constructors - confirm in agents/.
specialist = SpecialistAgent()
frontier = FrontierAgent(collection=index)
random_forest = RandomForestAgent()

c:\Users\Muhammad Nasir\Desktop\AutoDealFinder\autoenv\Lib\asyncio\events.py:80: DeprecationError: 2025-01-27: `modal.Cls.lookup` is deprecated and will be removed in a future release. It can be replaced with `modal.Cls.from_name`.

See https://modal.com/docs/guide/modal-1-0-migration for more information.
  self._context.run(self._callback, *self._args)


In [9]:
def description(item):
    r"""Extract the product description from an item's prompt.

    The prompt is expected to have the form
    ``"...to the nearest dollar?\n\n<description>\n\nPrice is $..."``; the text
    between the question and the price line is returned.

    Both splits are limited to the first occurrence of their marker, so a
    description that itself repeats the question phrase is returned whole
    instead of being cut off at the repeat.

    Args:
        item: any object with a ``prompt`` string attribute.

    Returns:
        The description text as a ``str``.

    Raises:
        IndexError: if the prompt does not contain the question phrase.
    """
    question_tail = "to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    # maxsplit=1 keeps everything after the first question in one piece.
    after_question = item.prompt.split(question_tail, 1)[1]
    # Everything before the first price marker is the description.
    return after_question.split(price_marker, 1)[0]

In [10]:
def rf(item):
    """Price ``item`` with the random-forest agent, using only its description text."""
    text = description(item)
    return random_forest.price(text)

In [13]:
# Tester.test(rf, test)

In [11]:
# Free-text product description used to spot-check each agent's .price() below
product = "Quadcast HyperX condenser mic for high quality audio for podcasting"

In [12]:
# One estimate per line, in the order: specialist, frontier, random forest
for pricer in (specialist, frontier, random_forest):
    print(pricer.price(product))

189.0
139.99
305.90720000000033


In [13]:
# Price test items 1000-1249 with each agent, keeping the true price alongside.
# NOTE: this rebinds `prices`, which earlier held the Pinecone metadata prices.
specialists, frontiers, random_forests, prices = [], [], [], []

for item in tqdm(test[1000:1250]):
    blurb = description(item)
    specialists.append(specialist.price(blurb))
    frontiers.append(frontier.price(blurb))
    random_forests.append(random_forest.price(blurb))
    prices.append(item.price)

100%|██████████| 250/250 [16:07<00:00,  3.87s/it]


In [14]:
# Lowest and highest of the three estimates for each item
estimate_trios = list(zip(specialists, frontiers, random_forests))
mins = [min(trio) for trio in estimate_trios]
maxes = [max(trio) for trio in estimate_trios]

# Feature matrix: the three raw estimates plus their per-item min and max
X = pd.DataFrame(
    dict(
        Specialist=specialists,
        Frontier=frontiers,
        RandomForest=random_forests,
        Min=mins,
        Max=maxes,
    )
)

# Target: the true prices, as a Series
y = pd.Series(prices)

In [15]:
# Fit a linear blend of the agent estimates (plus Min/Max) to the true prices
np.random.seed(42)

lr = LinearRegression().fit(X, y)

feature_columns = list(X.columns)

# Report the learned weight for every feature, then the intercept
for idx, feature in enumerate(feature_columns):
    print(f"{feature}: {lr.coef_[idx]:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

Specialist: 0.79
Frontier: 0.02
RandomForest: -0.05
Min: 0.18
Max: 0.11
Intercept=5.86


In [16]:
# Persist the fitted linear regression to ensemble_model.pkl in the working directory
joblib.dump(lr, 'ensemble_model.pkl')

['ensemble_model.pkl']

In [17]:
from agents.ensemble_agent import EnsembleAgent
# The ensemble agent is handed the same Pinecone index handle as the frontier agent.
# NOTE(review): presumably it loads ensemble_model.pkl saved above - confirm in
# agents/ensemble_agent.py.
ensemble = EnsembleAgent(collection=index)

c:\Users\Muhammad Nasir\Desktop\AutoDealFinder\autoenv\Lib\asyncio\events.py:80: DeprecationError: 2025-01-27: `modal.Cls.lookup` is deprecated and will be removed in a future release. It can be replaced with `modal.Cls.from_name`.

See https://modal.com/docs/guide/modal-1-0-migration for more information.
  self._context.run(self._callback, *self._args)


In [18]:
# Spot-check the ensemble on the same sample product priced by the individual agents above
ensemble.price(product)

np.float64(200.26786886819036)