# ClutchAI Sandbox

__GOAL__ 

Create an Agentic AI assistant for Yahoo Fantasy Basketball League.

__ReACT ClutchAI Agent Components__

Live Data - LangChain Tools:
1. Add YahooFantasy API as a Tool
2. Add Josh Llyod Youtube Transcripts to VectorDB
3. Add Tool to webscrape basketball monster data and hashtag data
4. Add nba_api as a Tool

Static Data - VectorDB:

5. Add Youtube Tool to get transcript from any video
6. Add sample weekly summary to VectorDB as example
7. Add articles online




__Test Cases__
- Give me a weekly summary from last week
- Who is more likely to win in week A vs week B
- Who is hot on the waiver wire?
- Is player A for player B a good trade?

#### 0. Environment Setup

In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
import json

# Load .env file from the project root (parent directory of notebook/)
# Resolve the path to get the absolute path to the .env file
env_file_location = Path('..').resolve()
load_dotenv(dotenv_path=env_file_location / ".env")
print(f"Loading .env from: {env_file_location / '.env'}\n")

# Set env variables
YAHOO_CONSUMER_KEY = os.environ.get('YAHOO_CLIENT_ID', "<INSERT_>")
YAHOO_CONSUMER_SECRET = os.environ.get('YAHOO_CLIENT_SECRET', "<INSERT>")
YAHOO_LEAGUE_ID = 58930
GAME_CODE = "nba"
GAME_ID=466

Loading .env from: /Users/matt/Code/ClutchAI/.env



In [2]:
# Show which env variables were loaded
env_vars_to_check = ['YAHOO_CLIENT_ID', 'YAHOO_CLIENT_SECRET', 'OPENAI_API_KEY', 'LANGSMITH_API_KEY']
print("Environment variables loaded:")
for var in env_vars_to_check:
    value = os.environ.get(var)
    if value:
        print(f"  ✓ {var}: {value[:10]} + ...")
    else:
        print(f"  ✗ {var}: NOT SET")

Environment variables loaded:
  ✓ YAHOO_CLIENT_ID: dj0yJmk9Ml + ...
  ✓ YAHOO_CLIENT_SECRET: 967c74034e + ...
  ✓ OPENAI_API_KEY: sk-proj-4A + ...
  ✓ LANGSMITH_API_KEY: lsv2_pt_7c + ...


#### 1. Connect to YahooFantasy League Data

In [4]:
from yfpy.query import YahooFantasySportsQuery

In [None]:
query = YahooFantasySportsQuery(
    league_id=YAHOO_LEAGUE_ID,
    game_code="nba",
    game_id= GAME_ID,
    env_var_fallback = True,
    env_file_location = env_file_location,
    save_token_data_to_env_file = True,
)

In [6]:
#Data to pull from Yahoo Fantasy
league_meta = query.get_league_metadata()
league_meta_dict = json.loads(league_meta.to_json())
print(len(league_meta))
league_meta_dict


31


{'allow_add_to_dl_extra_pos': 1,
 'current_week': 7,
 'draft_status': 'postdraft',
 'edit_key': '2025-12-05',
 'end_date': '2026-04-05',
 'end_week': 23,
 'felo_tier': 'gold',
 'game_code': 'nba',
 'iris_group_chat_id': None,
 'is_cash_league': 0,
 'is_highscore': False,
 'is_plus_league': 0,
 'is_pro_league': 0,
 'league_id': '58930',
 'league_key': '466.l.58930',
 'league_type': 'private',
 'league_update_timestamp': 1764924473,
 'logo_url': 'https://yahoofantasysports-res.cloudinary.com/image/upload/t_s192sq/fantasy-logos/56479288575_ec9899.jpg',
 'matchup_week': 7,
 'name': 'TK Fiji Fantasy 2024/25',
 'num_teams': 12,
 'renew': '454_14696',
 'renewed': None,
 'roster_type': 'date',
 'scoring_type': 'headone',
 'season': 2025,
 'start_date': '2025-10-21',
 'start_week': 1,
 'url': 'https://basketball.fantasysports.yahoo.com/nba/58930',
 'weekly_deadline': 'intraday'}

In [7]:
# Test get_team_stats_by_week API directly
# Let's see what the actual response looks like for different weeks

team_id = 6  # KATmandu Climbers
weeks_to_test = ["current", 1, 2, 3, 4, 5]

print("Testing get_team_stats_by_week for different weeks:")
print("=" * 70)

for week in weeks_to_test:
    print(f"\nWeek: {week}")
    print("-" * 70)
    try:
        data = query.get_team_stats_by_week(team_id, week)
        print(f"✓ Success! Got data for week {week}")
        
        # Try to see what fields are available
        if hasattr(data, '__dict__'):
            print(f"  Available attributes: {list(data.__dict__.keys())[:10]}...")  # First 10
            # Check if team_projected_points exists
            if hasattr(data, 'team_projected_points'):
                print(f"  ✓ team_projected_points exists: {data.team_projected_points}")
            else:
                print(f"  ✗ team_projected_points does NOT exist")
            
            # Check if team_points exists
            if hasattr(data, 'team_points'):
                print(f"  ✓ team_points exists: {data.team_points}")
            else:
                print(f"  ✗ team_points does NOT exist")
                
            # Try to convert to dict to see all fields
            try:
                data_dict = data.__dict__
                print(f"  All fields: {list(data_dict.keys())}")
            except:
                pass
                
        elif isinstance(data, dict):
            print(f"  Data is a dict with keys: {list(data.keys())[:10]}...")
            if 'team_projected_points' in data:
                print(f"  ✓ team_projected_points exists: {data['team_projected_points']}")
            else:
                print(f"  ✗ team_projected_points does NOT exist")
        else:
            print(f"  Data type: {type(data)}")
            print(f"  Data: {str(data)[:200]}...")
            
    except KeyError as e:
        print(f"✗ KeyError: {e}")
        print(f"  This means the API response is missing a required field")
    except Exception as e:
        print(f"✗ Error: {type(e).__name__}: {e}")

print("\n" + "=" * 70)
print("Test complete!")


Testing get_team_stats_by_week for different weeks:

Week: current
----------------------------------------------------------------------
✗ KeyError: 'team_projected_points'
  This means the API response is missing a required field

Week: 1
----------------------------------------------------------------------
✗ KeyError: 'team_projected_points'
  This means the API response is missing a required field

Week: 2
----------------------------------------------------------------------
✗ KeyError: 'team_projected_points'
  This means the API response is missing a required field

Week: 3
----------------------------------------------------------------------
✗ KeyError: 'team_projected_points'
  This means the API response is missing a required field

Week: 4
----------------------------------------------------------------------
✗ KeyError: 'team_projected_points'
  This means the API response is missing a required field

Week: 5
---------------------------------------------------------------

In [12]:
# Test get_team_stats_by_week with MULTIPLE teams and weeks
# First, get all teams in the league

POINTS_LEAGUE_ID = 229522 

query = YahooFantasySportsQuery(
    league_id=POINTS_LEAGUE_ID,
    game_code="nba",
    game_id= GAME_ID,
    env_var_fallback = True,
    env_file_location = env_file_location,
    save_token_data_to_env_file = True,
)

print(f"Testing with {LEAGUE_TYPE} league (ID: {YAHOO_LEAGUE_ID})")
print("=" * 70)
print("Step 1: Getting all teams in the league...")
print("=" * 70)
try:
    teams = query.get_league_teams()
    print(f"✓ Found {len(teams)} teams")
    
    # Extract team IDs and names
    team_info = []
    for team in teams:
        try:
            team_id = team.team_id
            team_name = getattr(team, 'name', f'Team {team_id}')
            team_info.append((team_id, team_name))
            print(f"  Team {team_id}: {team_name}")
        except:
            pass
    
    print(f"\nFound {len(team_info)} teams to test")
    
except Exception as e:
    print(f"✗ Error getting teams: {e}")
    # Fallback to testing team IDs 1-12
    team_info = [(i, f"Team {i}") for i in range(1, 13)]
    print(f"Using fallback team IDs: {[t[0] for t in team_info]}")

print("\n" + "=" * 70)
print("Step 2: Testing get_team_stats_by_week for different teams and weeks")
print("=" * 70)

weeks_to_test = ["current", 1, 2, 3, 4, 5, 6, 7, 8]
successful_combinations = []

# Test first 6 teams with all weeks
teams_to_test = team_info[:6] if len(team_info) >= 6 else team_info

for team_id, team_name in teams_to_test:
    print(f"\n{'='*70}")
    print(f"Testing Team {team_id} ({team_name})")
    print(f"{'='*70}")
    
    for week in weeks_to_test:
        try:
            data = query.get_team_stats_by_week(team_id, week)
            print(f"✓ SUCCESS! Team {team_id}, Week {week}")
            successful_combinations.append((team_id, team_name, week))
            
            # Check what fields are available
            if hasattr(data, '__dict__'):
                data_dict = data.__dict__
                fields = list(data_dict.keys())
                print(f"  Fields: {fields}")
                
                # Check for key fields
                if 'team_projected_points' in fields or hasattr(data, 'team_projected_points'):
                    print(f"  ✓ team_projected_points EXISTS!")
                    try:
                        val = data.team_projected_points if hasattr(data, 'team_projected_points') else data_dict.get('team_projected_points')
                        print(f"    Value: {val}")
                    except:
                        pass
                else:
                    print(f"  ✗ team_projected_points does NOT exist")
                
                if 'team_points' in fields or hasattr(data, 'team_points'):
                    print(f"  ✓ team_points EXISTS!")
                    try:
                        val = data.team_points if hasattr(data, 'team_points') else data_dict.get('team_points')
                        print(f"    Value: {val}")
                    except:
                        pass
                else:
                    print(f"  ✗ team_points does NOT exist")
                    
        except KeyError as e:
            # Only print if it's not the expected team_projected_points error
            if 'team_projected_points' not in str(e):
                print(f"  ✗ KeyError (unexpected): {e}")
        except Exception as e:
            # Only print unexpected errors
            if 'team_projected_points' not in str(e):
                print(f"  ✗ Error: {type(e).__name__}: {e}")

print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
if successful_combinations:
    print(f"✓ Found {len(successful_combinations)} successful combinations:")
    for team_id, team_name, week in successful_combinations:
        print(f"  Team {team_id} ({team_name}), Week {week}")
else:
    print("✗ No successful combinations found - all failed with KeyError for team_projected_points")
    print("\nThis suggests the yfpy library expects team_projected_points but the API")
    print("doesn't always return it. We may need to handle this differently.")
print("=" * 70)


Testing with POINTS league (ID: 229522)
Step 1: Getting all teams in the league...
✓ Found 10 teams
  Team 1: b'Cunning Ham Jerky'
  Team 2: b"Luka's LeGOAT Cheese Puffs"
  Team 3: b'The Banana Splitters'
  Team 4: b'Trust the Tiramisu'
  Team 5: b'Dwane(Pound) the rock (Cake)sy'
  Team 6: b"Jontay's Jalebi"
  Team 7: b'Day to Daynishes'
  Team 8: b'Just vibes'
  Team 9: b'THE HALIBAN HALVA'
  Team 10: b'Dubai Chocolateers'

Found 10 teams to test

Step 2: Testing get_team_stats_by_week for different teams and weeks

Testing Team 1 (b'Cunning Ham Jerky')
✓ SUCCESS! Team 1, Week current
✓ SUCCESS! Team 1, Week 1
✓ SUCCESS! Team 1, Week 2
✓ SUCCESS! Team 1, Week 3
✓ SUCCESS! Team 1, Week 4
✓ SUCCESS! Team 1, Week 5
✓ SUCCESS! Team 1, Week 6
✓ SUCCESS! Team 1, Week 7
✓ SUCCESS! Team 1, Week 8

Testing Team 2 (b"Luka's LeGOAT Cheese Puffs")
✓ SUCCESS! Team 2, Week current
✓ SUCCESS! Team 2, Week 1
✓ SUCCESS! Team 2, Week 2
✓ SUCCESS! Team 2, Week 3
✓ SUCCESS! Team 2, Week 4
✓ SUCCESS! Team

#### 2. Connecting LangChain Agent to Yahoo API

In [16]:
from langchain_core.tools import tool
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage


In [7]:
#Creat YahooFantasy Agent
class YahooFantasyAgent:
    def __init__(self):
        self.YAHOO_CONSUMER_KEY = os.environ.get('YAHOO_CLIENT_ID', "<INSERT_>")
        self.YAHOO_CONSUMER_SECRET = os.environ.get('YAHOO_CLIENT_SECRET', "<INSERT>")
        self.YAHOO_LEAGUE_ID = 58930
        self.GAME_CODE = "nba"
        self.GAME_ID=466

        self.query = YahooFantasySportsQuery(
            league_id=YAHOO_LEAGUE_ID,
            game_code="nba",
            game_id= GAME_ID,
            yahoo_consumer_key = YAHOO_CONSUMER_KEY,
            yahoo_consumer_secret = YAHOO_CONSUMER_SECRET,
            env_var_fallback = True,
            env_file_location = env_file_location,
            save_token_data_to_env_file = True,
        )

    def get_league_metadata(self) -> str:
        """Fetch Yahoo Fantasy league metadata."""
        try:
            data = self.query.get_league_metadata()
            return f'The Yahoo Fantasy league metadata in json format is: {data}'
        except Exception as e:
            return f"Failed to Yahoo Fantasy league metadata: {e}"
        
@tool("YahooLeagueMetDataTool", description="Get Yahoo League Metadata in json format from YPFS.")
def league_metadata_tool() -> str:
    """Get Yahoo League Metadata."""
    return YahooFantasyAgent().get_league_metadata()

In [8]:
#Create Clutch AI Agent
tools = [league_metadata_tool]
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

agent_v1 = create_agent(
    model = llm,
    tools = tools,
    system_prompt="You are a helpful assistant for a Yahoo Fantasy Sports league manager.",
)

In [9]:
#Test Agent
inputs = {"messages": [{"role": "user", "content": "What is the Yahoo Fantasy League Name?"}]}
agent_v1.invoke(inputs)

{'messages': [HumanMessage(content='What is the Yahoo Fantasy League Name?', additional_kwargs={}, response_metadata={}, id='0732860d-244f-44b6-bb5b-990cde41b7fd'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 67, 'total_tokens': 81, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CZ0IjtiHWHr1QUjJ9LQkNGMwmbpjw', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--18b03830-37ba-4409-9e68-ba2a49edd970-0', tool_calls=[{'name': 'YahooLeagueMetDataTool', 'args': {}, 'id': 'call_f7fLzHxSdQMlnYp2iy0Izota', 'type': 'tool_call'}], usage_metadata={'input_tokens': 67, 'output_tokens': 14, 'tot

In [10]:
#Stream Agent Response for Debug
for event in agent_v1.stream(inputs):
    print(event)

{'model': {'messages': [AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 67, 'total_tokens': 81, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CZ0InZoXDaWA6cD3N7JF8JVvk7cxZ', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--cd9a3d43-8692-4d8c-b5bb-26d49cbdca8f-0', tool_calls=[{'name': 'YahooLeagueMetDataTool', 'args': {}, 'id': 'call_G0vPVyZnLHGxuGZIPo2clSPw', 'type': 'tool_call'}], usage_metadata={'input_tokens': 67, 'output_tokens': 14, 'total_tokens': 81, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})]}}
{'tools': {'mes

#### 2. Adding Youtube Transcripts to VectorDB

In [11]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
# from youtube_transcript_api import YouTubeTranscriptApi
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [12]:
# Get Documents from YouTube
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=TB2QwCRMams&t=2s", 
    add_video_info=False,
    transcript_format=TranscriptFormat.CHUNKS,
    chunk_size_seconds=30
)
docs = loader.load()
docs[0]

Document(metadata={'source': 'https://www.youtube.com/watch?v=TB2QwCRMams&t=0s', 'start_seconds': 0, 'start_timestamp': '00:00:00'}, page_content="There are a lot of players who are either hurt, injury-prone, susceptible to tanking. So, what do you do in a fantasy draft? When do you take them? Michael Bolton, he's going to give you some answers. >> Thanks, Josh. It's Michael Bolton here, and it's time for another episode of the Locked On Fantasy Basketball Podcast. Let's get to it. >> Let's get to it. Indeed. You are Locked on Fantasy, your daily")

In [13]:
#Embedding YouTube Documents
vectorstore = Chroma.from_documents(
    documents=docs, 
    embedding=OpenAIEmbeddings()
)

retriever = vectorstore.as_retriever()

@tool("locked_on_retreiver", description="Retrieve contextual knowledge from Locked On Basketball.")
def retrieve_LockedOnKnowledge(query: str) -> str:
    """Retrieve contextual knowledge from Locked On Podcast YouTube transcripts or articles."""
    results = retriever.invoke(query)
    return "\n\n".join([r.page_content for r in results])

In [14]:
#Create ReACT Agent with YouTube Knowledge Retrieval and Yahoo Fantasy Tool
#Note: Temperature is a parameter that controls the “creativity” or randomness of the text generated.

llm = ChatOpenAI(model="gpt-4o-mini")
tools = [league_metadata_tool, retrieve_LockedOnKnowledge]

agent_v2 = create_agent(
    model = llm,
    tools = tools,
    system_prompt="You are a helpful assistant for a Yahoo Fantasy Sports league manager.",
)

In [21]:
#Test Agent
inputs = {"messages": [{"role": "user", "content": "Give me 3 key draft advice from Locked On Podcast that I should follow for my league."}]}
output_v2 = agent_v2.invoke(inputs)

In [23]:
output_v2

dict

In [22]:
print(output_v2)

{'messages': [HumanMessage(content='Give me 3 key draft advice from Locked On Podcast that I should follow for my league. Provide sources.', additional_kwargs={}, response_metadata={}, id='d75e4a65-1991-40ca-9787-cd5948ca70c5'), AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 106, 'total_tokens': 125, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CZ2mschdYzSy3zkJ20O9sVsFZdmg3', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--35ccf9ce-2f13-435b-ab6f-eeac204a8b74-0', tool_calls=[{'name': 'locked_on_retreiver', 'args': {'query': 'draft advice'}, 'id': 'call_3E8rKbt8Z0I1EEjjVlUgTJYN', 

#### 3. Add Tabular Data from Basketball Monster

In [1]:
import pandas as pd
import requests
from scrapy.selector import Selector

# Scrape the Basketball Monster player rankings table
def scrape_dashboard_tables(url):
    """
    Scrape player rankings table from Basketball Monster.
    
    Returns:
        pandas DataFrame with player rankings data
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        # Fetch the page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Try using pandas read_html first (easiest method)
        try:
            tables = pd.read_html(response.text)
            if tables:
                df = tables[0]  # Usually the first table is the main rankings
                print(f"✓ Successfully parsed table with {len(df)} rows using pandas")
                return df
        except Exception as e:
            print(f"pandas read_html failed: {e}, trying manual parsing...")
        
        # Fallback: Manual parsing with scrapy selector
        selector = Selector(text=response.text)
        
        # Find the rankings table
        table = selector.css('table').get()
        if table:
            # Try to parse with pandas again on the extracted table HTML
            df = pd.read_html(table)[0]
            print(f"✓ Successfully parsed table with {len(df)} rows using manual extraction")
            return df
        else:
            print("✗ Could not find rankings table in HTML")
            return None
            
    except requests.exceptions.RequestException as e:
        print(f"✗ Error fetching {url}: {e}")
        return None
    except Exception as e:
        print(f"✗ Error scraping table: {e}")
        return None

# Scrape the Basketball Monster Rankings
print("Scraping Basketball Monster player rankings...")
url = 'https://basketballmonster.com/playerrankings.aspx'
basketball_monster_rankings_df = scrape_dashboard_tables(url)

if basketball_monster_rankings_df is not None:
    print(f"\nTable shape: {basketball_monster_rankings_df.shape}")
    print(f"\nData types:")
else:
    print("Failed to scrape rankings table")

# Scrape the Hashtag Basketball Rankings
print("Scraping Hastag Monster player rankings...")
url = 'https://hashtagbasketball.com/fantasy-basketball-dynasty-rankings'
hashtag_rankings_df = scrape_dashboard_tables(url)

if hashtag_rankings_df is not None:
    print(f"\nTable shape: {hashtag_rankings_df.shape}")
    print(f"\nData types:")
else:
    print("Failed to scrape rankings table")

Scraping Basketball Monster player rankings...


  tables = pd.read_html(response.text)


✓ Successfully parsed table with 203 rows using pandas

Table shape: (203, 30)

Data types:
Scraping Hastag Monster player rankings...


  tables = pd.read_html(response.text)


✓ Successfully parsed table with 1 rows using pandas

Table shape: (1, 6)

Data types:


In [31]:
hashtag_rankings_df

Unnamed: 0,SET OF RANKINGS,FORECAST RANGE,STATS FROM,POSITION,FROM,NBA TEAM
0,Overall Contending Rebuilding Rookies Poin...,Next 5 seasons Next 3 seasons,2025-26 Regular Season,All PG SG SF PF C,Yahoo Fantrax,All Teams Atlanta Hawks Boston Celtics Broo...


In [23]:
hashtag_rankings_df

Unnamed: 0,SHOW,MIN GP,Z SCORE,CO%,AGE,POSITION,POS FROM,NBA TEAM,DATA AND RANKINGS FROM,BASED ON,TYPE
0,Top 30 50 100 Top 200 300 400 All,1 2 3 5 10 15 20,On Off,On Off,On Off,All PG SG SF PF C,Yahoo ESPN Fantrax Depth Chart,All ATL BKN BOS CHA CHI CLE DAL DEN D...,2025-26 Rest of Season Rankings (Projections u...,Total Averages,Standard H2H Minus 1


In [7]:
    print(f"\nFirst few rows:")
    rankings_df.head()


First few rows:


Unnamed: 0,Round,Rank,Value,Name,Team,Pos,Inj,g,m/g,p/g,...,USG,pV,3V,rV,aV,sV,bV,fg%V,ft%V,toV
0,1,1,1.34,Nikola Jokic,DEN,C,,14,34.5,29.1,...,28.6,1.95,0.11,2.91,3.48,1.22,0.2,3.26,0.78,-1.86
1,1,2,0.97,Victor Wembanyama,SAS,C,INJ 10g,12,34.7,26.2,...,30.9,1.48,-0.07,2.8,0.2,-0.03,5.16,0.25,0.84,-1.87
2,1,3,0.92,Shai Gilgeous-Alexander,OKC,G,,16,33.4,32.0,...,33.8,2.41,0.43,-0.31,1.35,0.9,0.36,0.98,2.02,0.14
3,1,4,0.77,Kawhi Leonard,LAC,F,INJ 5g,6,33.5,24.3,...,28.3,1.19,0.57,-0.05,-0.03,3.15,-0.01,0.3,1.71,0.11
4,1,5,0.76,Tyrese Maxey,PHI,G,P,14,40.3,31.9,...,30.1,2.4,2.1,-0.42,1.94,0.9,0.2,-0.91,1.4,-0.8


In [2]:
# Enhanced version with punt categories support
from urllib.parse import urlencode, urlparse, urlunparse
from io import StringIO

def scrape_basketball_monster_rankings_with_punt(base_url, punt_categories=None):
    """
    Scrape player rankings table from Basketball Monster with optional punt categories.
    
    Args:
        base_url: Base URL for Basketball Monster rankings (e.g., 'https://basketballmonster.com/playerrankings.aspx')
        punt_categories: List of categories to punt (e.g., ['FT%', 'TO', 'FG%'])
            Common categories: 'PTS', 'REB', 'AST', 'STL', 'BLK', '3PM', 'FG%', 'FT%', 'TO'
            Note: Category names should match Basketball Monster's format exactly
    
    Returns:
        pandas DataFrame with player rankings data
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    # Build URL with punt categories if provided
    url = base_url
    if punt_categories and len(punt_categories) > 0:
        # Join categories with comma and URL encode if needed
        punt_str = ','.join(str(cat) for cat in punt_categories)
        parsed_url = urlparse(base_url)
        query_params = urlencode({'Punt': punt_str})
        url = urlunparse((
            parsed_url.scheme,
            parsed_url.netloc,
            parsed_url.path,
            parsed_url.params,
            query_params,
            parsed_url.fragment
        ))
        print(f"Fetching rankings with punt categories: {punt_categories}")
        print(f"URL: {url}")
    
    try:
        # Fetch the page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Try using pandas read_html first (easiest method)
        try:
            tables = pd.read_html(StringIO(response.text))
            if tables:
                df = tables[0]  # Usually the first table is the main rankings
                print(f"✓ Successfully parsed table with {len(df)} rows using pandas")
                if punt_categories:
                    print(f"  Punt categories applied: {punt_categories}")
                return df
        except Exception as e:
            print(f"pandas read_html failed: {e}, trying manual parsing...")
        
        # Fallback: Manual parsing with scrapy selector
        selector = Selector(text=response.text)
        
        # Find the rankings table
        table = selector.css('table').get()
        if table:
            # Try to parse with pandas again on the extracted table HTML
            df = pd.read_html(StringIO(table))[0]
            print(f"✓ Successfully parsed table with {len(df)} rows using manual extraction")
            if punt_categories:
                print(f"  Punt categories applied: {punt_categories}")
            return df
        else:
            print("✗ Could not find rankings table in HTML")
            return None
            
    except requests.exceptions.RequestException as e:
        print(f"✗ Error fetching {url}: {e}")
        return None
    except Exception as e:
        print(f"✗ Error scraping table: {e}")
        import traceback
        traceback.print_exc()
        return None

# Test with punt categories
print("=" * 60)
print("Testing Basketball Monster rankings WITH punt categories")
print("=" * 60)

base_url = 'https://basketballmonster.com/playerrankings.aspx'

# Test 1: No punt categories (default rankings)
print("\n1. Fetching default rankings (no punt categories):")
df_default = scrape_basketball_monster_rankings_with_punt(base_url, punt_categories=None)

print("\n" + "-" * 60)

# Test 2: With punt categories (FT% and TO)
print("\n2. Fetching rankings with punt categories: ['FT%', 'TO']")
df_punt_ft_to = scrape_basketball_monster_rankings_with_punt(base_url, punt_categories=['FT%', 'TO'])
    

Testing Basketball Monster rankings WITH punt categories

1. Fetching default rankings (no punt categories):
✓ Successfully parsed table with 203 rows using pandas

------------------------------------------------------------

2. Fetching rankings with punt categories: ['FT%', 'TO']
Fetching rankings with punt categories: ['FT%', 'TO']
URL: https://basketballmonster.com/playerrankings.aspx?Punt=FT%25%2CTO
✓ Successfully parsed table with 203 rows using pandas
  Punt categories applied: ['FT%', 'TO']


In [4]:
df_default.columns

Index(['Round', 'Rank', 'Value', 'Name', 'Team', 'Pos', 'Inj', 'g', 'm/g',
       'p/g', '3/g', 'r/g', 'a/g', 's/g', 'b/g', 'fg%', 'fga/g', 'ft%',
       'fta/g', 'to/g', 'USG', 'pV', '3V', 'rV', 'aV', 'sV', 'bV', 'fg%V',
       'ft%V', 'toV'],
      dtype='object')

In [3]:
df_default

Unnamed: 0,Round,Rank,Value,Name,Team,Pos,Inj,g,m/g,p/g,...,USG,pV,3V,rV,aV,sV,bV,fg%V,ft%V,toV
0,1,1,1.35,Nikola Jokic,DEN,C,,17,34.8,29.6,...,29.0,2.02,0.37,2.75,3.46,1.17,0.15,3.11,0.83,-1.72
1,1,2,1.04,Tyler Herro,MIA,G,IN,2,30.9,26.5,...,25.3,1.52,-0.22,0.06,-0.04,2.12,-0.35,3.20,1.99,1.06
2,1,3,1.03,Shai Gilgeous-Alexander,OKC,G,,19,33.2,32.6,...,33.7,2.51,0.41,-0.38,1.39,1.15,0.19,1.50,2.26,0.23
3,1,4,1.01,Victor Wembanyama,SAS,C,INJ 7g,12,34.7,26.2,...,30.8,1.47,-0.06,2.78,0.19,0.01,5.39,0.27,0.91,-1.85
4,1,5,0.80,Tyrese Maxey,PHI,G,,17,39.9,32.2,...,30.3,2.45,2.13,-0.57,1.83,1.17,0.25,-0.63,1.42,-0.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,16,184,-0.40,Daniss Jenkins,DET,G,,13,19.1,10.0,...,21.3,-1.15,-0.55,-1.58,0.16,0.35,-1.13,0.18,-0.32,0.45
199,16,185,-0.40,Dylan Harper,SAS,G,,7,22.4,13.0,...,25.0,-0.67,-0.84,-0.79,0.00,-0.51,-1.28,-0.27,0.31,0.42
200,16,186,-0.40,Vince Williams Jr.,MEM,G,,14,22.2,7.9,...,19.2,-1.49,-0.50,-0.31,0.95,-0.18,-0.61,-1.22,0.10,-0.39
201,16,187,-0.41,T.J. McConnell,IND,G,,8,17.1,10.3,...,25.0,-1.11,-1.06,-1.28,0.19,-0.47,-1.28,0.10,0.04,1.20


#### 4. NBA API

In [4]:
from nba_api.stats.endpoints import scoreboardv2


#### 7. Add Articles to vectorDB

In [25]:
# Generic web scraping function using scrapy selectors
import requests
from scrapy.selector import Selector
from urllib.parse import urlparse
import time

def scrape_article_content(url, max_content_length=10000):
    """
    Generic function to scrape article content from any URL.
    
    Args:
        url: The URL to scrape (can be absolute or relative)
        max_content_length: Maximum length of content to return (default: 10000)
    
    Returns:
        Dictionary with 'url', 'title', 'content', and 'source' keys, or None if error
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        # Parse and normalize URL
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            # If no scheme, assume https
            url = f"https://{url}"
            parsed_url = urlparse(url)
        elif not parsed_url.netloc:
            # If relative URL, we need a base URL - return error
            raise ValueError(f"Invalid URL format: {url}")
        
        # Get the domain for source tracking
        domain = parsed_url.netloc
        
        # Fetch the page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Parse with scrapy selector
        selector = Selector(text=response.text)
        
        # Extract title - try multiple common patterns
        title = None
        title_selectors = [
            'h1::text',
            '.article-title::text',
            '.post-title::text',
            '.entry-title::text',
            'title::text',
            'meta[property="og:title"]::attr(content)',
            'meta[name="twitter:title"]::attr(content)'
        ]
        
        for selector_pattern in title_selectors:
            title = selector.css(selector_pattern).get()
            if title:
                title = title.strip()
                break
        
        # Fallback to page title tag
        if not title:
            title = selector.css('title::text').get()
            if title:
                # Clean up title (remove site name if present)
                title = title.split('|')[0].split('-')[0].strip()
        
        title = title or 'Untitled'
        
        # Extract article content - try multiple common content selectors
        content = None
        content_selectors = [
            'article p::text',
            'article::text',
            '.article-body p::text',
            '.article-content p::text',
            '.post-content p::text',
            '.entry-content p::text',
            '.content p::text',
            'main article p::text',
            'main p::text',
            '[role="article"] p::text'
        ]
        
        for selector_pattern in content_selectors:
            paragraphs = selector.css(selector_pattern).getall()
            if paragraphs:
                text_content = ' '.join([p.strip() for p in paragraphs if p.strip()])
                if len(text_content) > 200:  # Only use if substantial content
                    content = text_content
                    break
        
        # Fallback: try to get text from main content areas
        if not content:
            main_content = selector.css('main, article, .main-content, .content').get()
            if main_content:
                main_selector = Selector(text=main_content)
                content = ' '.join(main_selector.css('::text').getall())
        
        # Last resort: get all paragraph text from body
        if not content:
            # Use XPath to exclude common non-content elements
            paragraphs = selector.xpath('//body//p[not(ancestor::script|ancestor::style|ancestor::nav|ancestor::header|ancestor::footer|ancestor::aside)]//text()').getall()
            content = ' '.join([p.strip() for p in paragraphs if p.strip()])
        
        # Clean up content
        if content:
            content = ' '.join(content.split())  # Normalize whitespace
            content = content.strip()
        
        if not content or len(content) < 50:
            print(f"Warning: Minimal or no content extracted from {url}")
            return None
        
        # Limit content length
        if len(content) > max_content_length:
            content = content[:max_content_length] + "..."
        
        return {
            'url': url,
            'title': title,
            'content': content,
            'source': domain
        }
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Test the function with RotoWire
test_url = "https://www.rotowire.com/basketball/advice/"
print("Testing generic scraper with RotoWire...")
test_article = scrape_article_content(test_url)
if test_article:
    print(f"✓ Successfully scraped: {test_article['title']}")
    print(f"  Source: {test_article['source']}")
    print(f"  Content length: {len(test_article['content'])} characters")
    print(f"  Preview: {test_article['content'][:200]}...")
else:
    print("✗ Failed to scrape article")


Testing generic scraper with RotoWire...
✓ Successfully scraped: How Does Fantasy Basketball Work: Best Tips for Your 2025 League
  Source: www.rotowire.com
  Content length: 10003 characters
  Preview: Whether it's your first time playing fantasy sports or you're a seasoned fantasy football player looking for a new challenge, this guide aims to educate you about how to approach fantasy basketball. R...


In [5]:
# Test scraping Hashtag Basketball Dynasty Rankings with multiple approaches
test_url = "https://hashtagbasketball.com/fantasy-basketball-dynasty-rankings"
print("=" * 70)
print("Testing Hashtag Basketball Dynasty Rankings Scraping")
print("=" * 70)

def scrape_hashtag_dynasty_rankings(url):
    """
    Specialized scraper for Hashtag Basketball Dynasty Rankings page.
    Tries multiple methods to extract the rankings table.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        # Method 1: Try pandas read_html on all tables
        print("\n1. Trying pandas read_html on all tables...")
        from io import StringIO
        try:
            tables = pd.read_html(StringIO(response.text))
            print(f"   Found {len(tables)} table(s)")
            for i, df in enumerate(tables):
                print(f"   Table {i+1}: {df.shape} - columns: {list(df.columns)[:5]}...")
                if len(df) > 50:  # Likely the main rankings table
                    print(f"   ✓ Found main rankings table with {len(df)} rows")
                    return df, "pandas_read_html"
        except Exception as e:
            print(f"   ✗ Failed: {e}")
        
        # Method 2: Use scrapy selector to find tables with specific structure
        print("\n2. Trying scrapy selector to find rankings table...")
        selector = Selector(text=response.text)
        
        # Look for table elements
        tables = selector.css('table')
        print(f"   Found {len(tables)} table elements")
        
        # Try to find table with rankings data
        for i, table in enumerate(tables):
            table_html = table.get()
            try:
                df = pd.read_html(StringIO(table_html))[0]
                if len(df) > 50:
                    print(f"   ✓ Found rankings table {i+1} with {len(df)} rows")
                    return df, f"scrapy_table_{i+1}"
            except:
                continue
        
        # Method 3: Look for data in script tags or data attributes (JS-loaded content)
        print("\n3. Checking for JavaScript-loaded data...")
        scripts = selector.css('script::text').getall()
        data_scripts = [s for s in scripts if 'rankings' in s.lower() or 'player' in s.lower()]
        if data_scripts:
            print(f"   Found {len(data_scripts)} potentially relevant script tags")
            # Could parse JSON from scripts if needed
        
        # Method 4: Check page structure and key elements
        print("\n4. Analyzing page structure...")
        title = selector.css('h1::text, title::text').get()
        print(f"   Page title: {title}")
        
        # Look for specific elements that might contain rankings
        player_rows = selector.css('tr, .player-row, [data-rank]')
        print(f"   Found {len(player_rows)} potential player row elements")
        
        # Method 5: Try to extract data from visible text patterns
        print("\n5. Attempting text-based extraction...")
        page_text = selector.css('body::text').getall()
        text_content = ' '.join(page_text[:100])  # First 100 text elements
        if 'Wembanyama' in text_content or 'Jokic' in text_content:
            print("   ✓ Found player names in page text (page may need JS rendering)")
        
        return None, "all_methods_failed"
        
    except Exception as e:
        print(f"✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return None, "error"

# Run the scraper
df, method = scrape_hashtag_dynasty_rankings(test_url)

if df is not None:
    print(f"\n{'=' * 70}")
    print(f"✓ Successfully scraped using method: {method}")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"\n  First 5 rows:")
    print(df.head())
    print(f"\n{'=' * 70}")
else:
    print(f"\n{'=' * 70}")
    print("✗ Failed to extract rankings table")
    print("  Note: This page may require JavaScript rendering (Selenium needed)")
    print(f"{'=' * 70}")

Testing Hashtag Basketball Dynasty Rankings Scraping

1. Trying pandas read_html on all tables...
   Found 2 table(s)
   Table 1: (1, 6) - columns: ['SET OF RANKINGS', 'FORECAST RANGE', 'STATS FROM', 'POSITION', 'FROM']...
   Table 2: (716, 7) - columns: ['RANK', 'PLAYER', 'PLAYER.1', 'AGE', 'TEAM']...
   ✓ Found main rankings table with 716 rows

✓ Successfully scraped using method: pandas_read_html
  Shape: (716, 7)
  Columns: ['RANK', 'PLAYER', 'PLAYER.1', 'AGE', 'TEAM', 'POS', 'COMMENTS']

  First 5 rows:
  RANK                   PLAYER  \
0   #1        Victor Wembanyama   
1   #2  Shai Gilgeous-Alexander   
2   #3              Luka Doncic   
3   #4             Nikola Jokic   
4   #5          Cade Cunningham   

                                            PLAYER.1   AGE TEAM    POS  \
0  #1 Victor Wembanyama (SA, C) AGE: 21.9  12 GP ...  21.9   SA      C   
1  #2 Shai Gilgeous-Alexander (OKC, PG) AGE: 27.4...  27.4  OKC     PG   
2  #3 Luka Doncic (LAL, PG,SG) AGE: 26.7  13 GP 0...

In [6]:
# Store the scraped hashtag rankings dataframe
if df is not None:
    hashtag_dynasty_rankings_df = df.copy()
    print(f"✓ Stored Hashtag Dynasty Rankings DataFrame")
    print(f"  Shape: {hashtag_dynasty_rankings_df.shape}")
    print(f"  Columns: {list(hashtag_dynasty_rankings_df.columns)}")
    
    # Show a sample of the data
    print(f"\n  Sample data (first 10 rows):")
    print(hashtag_dynasty_rankings_df.head(10))
else:
    print("✗ No data to store. Check the scraping output above for details.")
    hashtag_dynasty_rankings_df = None


✓ Stored Hashtag Dynasty Rankings DataFrame
  Shape: (716, 7)
  Columns: ['RANK', 'PLAYER', 'PLAYER.1', 'AGE', 'TEAM', 'POS', 'COMMENTS']

  Sample data (first 10 rows):
  RANK                   PLAYER  \
0   #1        Victor Wembanyama   
1   #2  Shai Gilgeous-Alexander   
2   #3              Luka Doncic   
3   #4             Nikola Jokic   
4   #5          Cade Cunningham   
5   #6          Anthony Edwards   
6   #7              Evan Mobley   
7   #8             Cooper Flagg   
8   #9            Chet Holmgren   
9  #10    Giannis Antetokounmpo   

                                            PLAYER.1   AGE TEAM       POS  \
0  #1 Victor Wembanyama (SA, C) AGE: 21.9  12 GP ...  21.9   SA         C   
1  #2 Shai Gilgeous-Alexander (OKC, PG) AGE: 27.4...  27.4  OKC        PG   
2  #3 Luka Doncic (LAL, PG,SG) AGE: 26.7  13 GP 0...  26.7  LAL     PG,SG   
3  #4 Nikola Jokic (DEN, C) AGE: 30.8  17 GP 0.62...  30.8  DEN         C   
4  #5 Cade Cunningham (DET, PG,SG) AGE: 24.2  15 ...  24.2 

In [10]:
hashtag_dynasty_rankings_df["COMMENTS"][0]

"12 GP 0.502 FG% 0.857 FT% 1.7 3PM 26.2 PTS 12.9 REB 4.0 AST 1.0 STL 3.6 BLK 3.6 TO  Fun fact: if you only used Wemby's defensive categories (REB, STL, BLK), he still ranks among the top five players in 9-cat rankings."

In [None]:
# Convert scraped articles to LangChain Documents for vector storage
from langchain_core.documents import Document

def articles_to_documents(articles, document_type='web_article'):
    """
    Convert scraped articles to LangChain Documents.
    
    Args:
        articles: List of article dictionaries with 'url', 'title', 'content', 'source'
        document_type: Type identifier for the documents (default: 'web_article')
    
    Returns:
        List of LangChain Document objects
    """
    docs = []
    for article in articles:
        if article and article.get('content'):
            doc = Document(
                page_content=article['content'],
                metadata={
                    'source': article.get('url', ''),
                    'title': article.get('title', ''),
                    'source_domain': article.get('source', ''),
                    'type': document_type
                }
            )
            docs.append(doc)
    return docs

# Convert scraped articles to documents
article_docs = articles_to_documents(scraped_articles, document_type='web_article')

print(f"Created {len(article_docs)} LangChain documents")
if article_docs:
    print(f"Sample document: {article_docs[0].metadata}")
    print(f"Content preview: {article_docs[0].page_content[:200]}...")
else:
    print("No documents created. Make sure you've scraped some articles first.")