# Sports Betting Data Cleaning Work Thru

### Package Imports

In [301]:
from OddsJamClient import OddsJamClient;
from dotenv import load_dotenv
import os, requests, datetime, json
import pandas as pd

### Load Env Vars

In [302]:
load_dotenv()
ODDSJAM_API_KEY = os.getenv("ODDSJAM_API_KEY")

### Initialize Clients

In [303]:
Client = OddsJamClient(ODDSJAM_API_KEY);
Client.UseV2();

### Get Games For Today

In [403]:
def get_games_from_league_as_dataframe(league="nba", sport="basketball", game_date=None):
    # Define the endpoint URL
    end_point = 'https://api-external.oddsjam.com/api/v2/games'
    
    # Set up the headers with the API key and content type
    headers = {
        # 'x-api-key': ODDSJAM_API_KEY,
        'Content-Type': 'application/json'
    }

    if not game_date : 
        game_date = datetime.datetime.now().strftime('%Y-%m-%d')
    else:
        # Parse the string into a datetime object
        parsed_date = datetime.datetime.strptime(game_date, '%m/%d/%Y')
        # Format the datetime object to the desired string format
        game_date = parsed_date.strftime('%Y-%m-%d')

    
    # Set up the data payload with the parameters
    data = {
        'league': league,
        'sport': sport,
        "include_team_info": True,
        "key": ODDSJAM_API_KEY,
        "game_date": game_date
    }
    
    # Make the POST request to the API
    response = requests.get(end_point, headers=headers, params=data)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Here you would typically convert the response to a DataFrame
        df = pd.DataFrame(response.json()["data"])
        # update the time
        df['start_date'] = pd.to_datetime(df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S')
        df = df.sort_values(by='start_date').reset_index(drop=True)
        # Now, we extract the team ids from the home_team_info and away_team_info dictionaries
        # and then drop these columns from the DataFrame
        df['home_team_id'] = df['home_team_info'].apply(lambda x: x['id'])
        df['away_team_id'] = df['away_team_info'].apply(lambda x: x['id'])

        # Drop the now-redundant '_team_info' columns
        df = df.drop(columns=['home_team_info', 'away_team_info'])
        return df
    else:
        print(f"Error: {response.status_code}")
        # You could also handle errors here


In [406]:
games_df = get_games_from_league_as_dataframe(league="NBA", sport="basketball", game_date="01/22/2024")



In [407]:
display(games_df)

Unnamed: 0,id,start_date,home_team,away_team,is_live,is_popular,tournament,status,sport,league,home_team_id,away_team_id
0,11464-30354-2024-01-22,2024-01-22 19:00:00,Orlando Magic,Cleveland Cavaliers,False,False,,unplayed,basketball,NBA,CC72CD00EB95,D5348BDFEBCC
1,25236-12967-2024-01-22,2024-01-22 19:00:00,Detroit Pistons,Milwaukee Bucks,False,False,,unplayed,basketball,NBA,5988658C6B9B,14682EF45C4D
2,48840-32634-2024-01-22,2024-01-22 19:00:00,Philadelphia 76ers,San Antonio Spurs,False,False,,unplayed,basketball,NBA,EDF03AD3C346,E89F51275352
3,22796-35775-2024-01-22,2024-01-22 19:30:00,Toronto Raptors,Memphis Grizzlies,False,False,,unplayed,basketball,NBA,417F4FFF4625,2C653B0A5BBF
4,35142-41811-2024-01-22,2024-01-22 20:00:00,Minnesota Timberwolves,Charlotte Hornets,False,False,,unplayed,basketball,NBA,FDAE71FA88C6,C65360931346
5,77646-14251-2024-01-22,2024-01-22 20:30:00,Dallas Mavericks,Boston Celtics,False,False,,unplayed,basketball,NBA,7165DAB9CAE4,BE2A4976ABA4
6,37132-24860-2024-01-22,2024-01-22 21:00:00,Phoenix Suns,Chicago Bulls,False,False,,unplayed,basketball,NBA,9BF9A5FD18B1,CA98E3A931AE
7,19957-31342-2024-01-22,2024-01-22 22:00:00,Sacramento Kings,Atlanta Hawks,False,False,,unplayed,basketball,NBA,04E36C744934,B59C1C735494


### Segment The Games Into Sessions 
- Sessions are defined by games grouped together that start within 2 hours of each other
- no two sessions will share the same game; they're all exclusive sessions.

In [348]:
def segment_games_into_sessions(games_df):
    # Ensure start_date is a datetime
    games_df['start_date'] = pd.to_datetime(games_df['start_date'])
    
    # Sort the games by start time
    sorted_games_df = games_df.sort_values(by='start_date').reset_index(drop=True)
    
    # List to hold all game sessions
    sessions = []
    # Temporary list to store games for the current session
    current_session = []
    # Start time of the current session
    session_start = None
    
    for _, game in sorted_games_df.iterrows():
        game_start = game['start_date']
        
        # If current session is empty or game starts within 2 hours of the session start
        if not current_session or (game_start - session_start).total_seconds() <= 2 * 3600:
            # Add game to current session
            current_session.append(game)
            # If this is the first game in the session, set the session start time
            if not session_start:
                session_start = game_start
        else:
            # If game starts more than 2 hours after the session start, save and reset the session
            sessions.append(current_session)
            current_session = [game]
            session_start = game_start
    
    # Add the last session if it exists
    if current_session:
        sessions.append(current_session)
    
    # Convert sessions to DataFrame list for better handling
    session_dfs = [pd.DataFrame(session) for session in sessions]
    
    return session_dfs

In [408]:
# Example usage:
# Assuming games_df is your DataFrame containing the games information.
segmented_sessions = segment_games_into_sessions(games_df)

# You can then access each session DataFrame with segmented_sessions[0], segmented_sessions[1], etc.
len(segmented_sessions)

2

### Get Players From A Specific Team

In [371]:
def get_players_from_specific_team(team_id: str, league="nba", sport="basketball") -> pd.DataFrame:
    # Define the endpoint URL
    end_point = 'https://api-external.oddsjam.com/api/v2/players/list'
    
    # Set up the headers with the API key
    headers = {
        'Content-Type': 'application/json'
    }
    
    # Set up the parameters with the team_id
    params = {
        'team': team_id,
        'page': 3,
        # "league": league,
        # "sport": sport,
        'key': ODDSJAM_API_KEY
    }
    
    # Make the GET request to the API
    response = requests.get(end_point, headers=headers, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Convert the JSON response to a DataFrame
        data = response.json()["data"]
        df = pd.DataFrame(data)
        print(f'total pages: {response.json()["total_pages"]}')
        return df
    else:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

### Testing That The Functionality Works As Envisioned
* Need to get the game_id from the segmented_sessions  

In [414]:
display(segmented_sessions[0])

Unnamed: 0,id,start_date,home_team,away_team,is_live,is_popular,tournament,status,sport,league,home_team_id,away_team_id
0,11464-30354-2024-01-22,2024-01-22 19:00:00,Orlando Magic,Cleveland Cavaliers,False,False,,unplayed,basketball,NBA,CC72CD00EB95,D5348BDFEBCC
1,25236-12967-2024-01-22,2024-01-22 19:00:00,Detroit Pistons,Milwaukee Bucks,False,False,,unplayed,basketball,NBA,5988658C6B9B,14682EF45C4D
2,48840-32634-2024-01-22,2024-01-22 19:00:00,Philadelphia 76ers,San Antonio Spurs,False,False,,unplayed,basketball,NBA,EDF03AD3C346,E89F51275352
3,22796-35775-2024-01-22,2024-01-22 19:30:00,Toronto Raptors,Memphis Grizzlies,False,False,,unplayed,basketball,NBA,417F4FFF4625,2C653B0A5BBF
4,35142-41811-2024-01-22,2024-01-22 20:00:00,Minnesota Timberwolves,Charlotte Hornets,False,False,,unplayed,basketball,NBA,FDAE71FA88C6,C65360931346
5,77646-14251-2024-01-22,2024-01-22 20:30:00,Dallas Mavericks,Boston Celtics,False,False,,unplayed,basketball,NBA,7165DAB9CAE4,BE2A4976ABA4
6,37132-24860-2024-01-22,2024-01-22 21:00:00,Phoenix Suns,Chicago Bulls,False,False,,unplayed,basketball,NBA,9BF9A5FD18B1,CA98E3A931AE


In [415]:
get_players_from_specific_team(segmented_sessions[1].iloc[0]['home_team_id'])

total pages: 3


In [374]:
temp = get_players_from_specific_team(segmented_sessions[1].iloc[0]['away_team_id'])

total pages: 4


In [375]:
temp

Unnamed: 0,id,player_name,first_name,last_name,team_name,team_id,number,position,age,height,weight,is_active,sport,league
0,923C0B6110CD,Matt Bushman,Matt,Bushman,Kansas City Chiefs,2D71E5BA64A5,49.0,TE,28,77,245,True,football,NFL
1,868B02CBC42F,Matt Dickerson,Matt,Dickerson,Kansas City Chiefs,2D71E5BA64A5,93.0,DT,28,77,292,True,football,NFL
2,7ECC86914BEF,Mecole Hardman Jr.,Mecole,Hardman Jr.,Kansas City Chiefs,2D71E5BA64A5,12.0,WR,25,70,187,True,football,NFL
3,658E9A4ADAA9,Mike Caliendo,Mike,Caliendo,Kansas City Chiefs,2D71E5BA64A5,66.0,G,26,76,301,True,football,NFL
4,6BE925261591,Mike Danna,Mike,Danna,Kansas City Chiefs,2D71E5BA64A5,51.0,DE,26,74,257,True,football,NFL
5,00071ACF8666,Mike Edwards,Mike,Edwards,Kansas City Chiefs,2D71E5BA64A5,21.0,S,27,70,205,True,football,NFL
6,6C2367F28691,Mike Pennel Jr.,Mike,Pennel Jr.,Kansas City Chiefs,2D71E5BA64A5,69.0,DT,32,76,332,True,football,NFL
7,0615B45C311E,Montrell Washington,Montrell,Washington,Kansas City Chiefs,2D71E5BA64A5,12.0,WR,24,70,170,True,football,NFL
8,269A362F5C64,Nazeeh Johnson,Nazeeh,Johnson,Kansas City Chiefs,2D71E5BA64A5,13.0,S,25,70,199,True,football,NFL
9,8B72DF4A5554,Neil Farrell,Neil,Farrell,Kansas City Chiefs,2D71E5BA64A5,92.0,DE,25,76,325,True,football,NFL


### Get All Available Player Props For Each Team, Given There's A PrizePicks Line

In [416]:
def get_player_odds(player_id: str, sportsbook: list, league="nba", sport="basketball") -> pd.DataFrame:
    # Define the endpoint URL
    end_point = 'https://api-external.oddsjam.com/api/v2/game-odds'
    
    # Set up the headers with the API key
    headers = {
        'Content-Type': 'application/json'
    }
    # Set up the parameters with the team_id
    params = {
        'player_id': player_id,
        "sportsbook": sportsbook,
        # "league": league,
        # "sport": sport,
        'key': ODDSJAM_API_KEY
    }
    
    # Make the GET request to the API
    response = requests.get(end_point, headers=headers, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Convert the JSON response to a DataFrame
        data = response.json()["data"]
        df = pd.DataFrame(data)
        return df
    else:
        print(f"Error: {response.reason}")
        print(response)
        return pd.DataFrame()

### Test That The Function Pulls Odds Correctly

In [377]:
odds = get_player_odds("64945FEDDEE3", ["PrizePicks", "Underdog Fantasy", "Pinnacle", "FanDuel", "Draftkings", "BetMGM"])

In [378]:
odds.shape

(1, 11)

In [380]:
books = odds.iloc[0].odds
books

[{'id': '66EA5D95A677',
  'sports_book_name': 'Underdog Fantasy',
  'name': 'Richie James Over 5.5',
  'price': -137.0,
  'timestamp': 1705799006.9842653,
  'bet_points': 5.5,
  'is_main': True,
  'is_live': False,
  'market_name': 'Player Receiving Yards',
  'market': 'player_receiving_yards',
  'home_rotation_number': None,
  'away_rotation_number': None,
  'deep_link_url': None,
  'player_id': '64945FEDDEE3',
  'selection': 'Richie James',
  'normalized_selection': 'richie_james',
  'selection_line': 'over',
  'selection_points': 5.5},
 {'id': 'E405402D4C66',
  'sports_book_name': 'Underdog Fantasy',
  'name': 'Richie James Under 5.5',
  'price': -137.0,
  'timestamp': 1705799006.98432,
  'bet_points': 5.5,
  'is_main': True,
  'is_live': False,
  'market_name': 'Player Receiving Yards',
  'market': 'player_receiving_yards',
  'home_rotation_number': None,
  'away_rotation_number': None,
  'deep_link_url': None,
  'player_id': '64945FEDDEE3',
  'selection': 'Richie James',
  'normal

In [381]:
books_df = pd.DataFrame(books)
display(books_df.head())
print(books_df.shape)

Unnamed: 0,id,sports_book_name,name,price,timestamp,bet_points,is_main,is_live,market_name,market,home_rotation_number,away_rotation_number,deep_link_url,player_id,selection,normalized_selection,selection_line,selection_points
0,66EA5D95A677,Underdog Fantasy,Richie James Over 5.5,-137.0,1705799000.0,5.5,True,False,Player Receiving Yards,player_receiving_yards,,,,64945FEDDEE3,Richie James,richie_james,over,5.5
1,E405402D4C66,Underdog Fantasy,Richie James Under 5.5,-137.0,1705799000.0,5.5,True,False,Player Receiving Yards,player_receiving_yards,,,,64945FEDDEE3,Richie James,richie_james,under,5.5
2,16341-13602-24-03:pinnacle:anytime_touchdown_s...,Pinnacle,Richie James Jr.,714.0,1705786000.0,,True,False,Anytime Touchdown Scorer,anytime_touchdown_scorer,318.0,317.0,,64945FEDDEE3,Richie James Jr.,richie_james_jr_,,
3,16341-13602-24-03:pinnacle:player_receiving_ya...,Pinnacle,Richie James Jr. Over 4.5,-124.0,1705786000.0,4.5,True,False,Player Receiving Yards,player_receiving_yards,318.0,317.0,,64945FEDDEE3,Richie James Jr.,richie_james_jr_,over,4.5
4,16341-13602-24-03:pinnacle:player_receiving_ya...,Pinnacle,Richie James Jr. Under 4.5,-106.0,1705786000.0,4.5,True,False,Player Receiving Yards,player_receiving_yards,318.0,317.0,,64945FEDDEE3,Richie James Jr.,richie_james_jr_,under,4.5


(37, 18)


### Create Cleaned Sportsbook Data Frame
* Each row will have PrizePicks and show the odds and the line for a couple other highly trusted Sportsbooks.
* This makes it seemless to compare PrizePicks Player Projections to the sentiment of the trusted market.

In [386]:
def filter_for_prizepicks(odds_df):
    # First, sort the DataFrame by the 'market' column, which acts as our unique key.
    sorted_odds_df = odds_df.sort_values(by='market')

    # Group by 'market' to organize our DataFrame into groups based on prediction types.
    grouped = sorted_odds_df.groupby('market')

    # Now, we will keep only the groups that contain at least one 'PrizePicks' entry.
    valid_groups = []
    for name, group in grouped:
        if 'PrizePicks' in group['sports_book_name'].values:
            valid_groups.append(group)
    if len(valid_groups) > 0:
        # Combine all valid groups back into a DataFrame.
        filtered_df = pd.concat(valid_groups)
        filtered_df = filtered_df.groupby(['market', 'sports_book_name']).apply(lambda x: x.reset_index(drop=True)).reset_index(drop=True)
    else:
        filtered_df = pd.DataFrame()
    
    return filtered_df

### Testing the functionality

In [389]:
filtered_odds_df = filter_for_prizepicks(books_df)

  filtered_df = filtered_df.groupby(['market', 'sports_book_name']).apply(lambda x: x.reset_index(drop=True)).reset_index(drop=True)


In [390]:
filtered_odds_df

Unnamed: 0,id,sports_book_name,name,price,timestamp,bet_points,is_main,is_live,market_name,market,home_rotation_number,away_rotation_number,deep_link_url,player_id,selection,normalized_selection,selection_line,selection_points
0,16341-13602-24-03:betmgm:player_receiving_yard...,BetMGM,Richie James Under 5.5,-120.0,1705799000.0,5.5,True,False,Player Receiving Yards,player_receiving_yards,,,https://sports.<STATE>.betmgm.com/en/sports?op...,64945FEDDEE3,Richie James,richie_james,under,5.5
1,16341-13602-24-03:betmgm:player_receiving_yard...,BetMGM,Richie James Over 5.5,-110.0,1705799000.0,5.5,True,False,Player Receiving Yards,player_receiving_yards,,,https://sports.<STATE>.betmgm.com/en/sports?op...,64945FEDDEE3,Richie James,richie_james,over,5.5
2,16341-13602-24-03:pinnacle:player_receiving_ya...,Pinnacle,Richie James Jr. Over 4.5,-124.0,1705786000.0,4.5,True,False,Player Receiving Yards,player_receiving_yards,318.0,317.0,,64945FEDDEE3,Richie James Jr.,richie_james_jr_,over,4.5
3,16341-13602-24-03:pinnacle:player_receiving_ya...,Pinnacle,Richie James Jr. Under 4.5,-106.0,1705786000.0,4.5,True,False,Player Receiving Yards,player_receiving_yards,318.0,317.0,,64945FEDDEE3,Richie James Jr.,richie_james_jr_,under,4.5
4,299D373734F8,PrizePicks,Richie James Under 8.5,-137.0,1705802000.0,8.5,True,False,Player Receiving Yards,player_receiving_yards,,,,64945FEDDEE3,Richie James,richie_james,under,8.5
5,E94DA1B15B5D,PrizePicks,Richie James Over 8.5,-137.0,1705802000.0,8.5,True,False,Player Receiving Yards,player_receiving_yards,,,,64945FEDDEE3,Richie James,richie_james,over,8.5
6,66EA5D95A677,Underdog Fantasy,Richie James Over 5.5,-137.0,1705799000.0,5.5,True,False,Player Receiving Yards,player_receiving_yards,,,,64945FEDDEE3,Richie James,richie_james,over,5.5
7,E405402D4C66,Underdog Fantasy,Richie James Under 5.5,-137.0,1705799000.0,5.5,True,False,Player Receiving Yards,player_receiving_yards,,,,64945FEDDEE3,Richie James,richie_james,under,5.5


### Organize The DF Into Something We Can Scan Easily

In [396]:
def create_cleaned_props_df(filtered_df):
    book_order = ["PrizePicks", "Underdog Fantasy", "Pinnacle", "FanDuel", "Draftkings", "BetMGM"]

    # Create a list to store our new rows
    cleaned_data = []

    # Group by 'market' and then by 'selection' within each group
    for market, market_df in filtered_df.groupby('market'):
        for selection in ['over', 'under']:
            row = {'Player Name': None, 'Selection': selection, 'Market Name': market}
            selection_present = False

            for book in book_order:
                book_df = market_df[(market_df['sports_book_name'] == book) & (market_df['selection_line'] == selection)]
                
                if not book_df.empty:
                    selection_present = True
                    # We assume that there's only one row for each book per selection
                    book_row = book_df.iloc[0]
                    row['Player Name'] = book_row['normalized_selection'].replace('_', ' ').title()
                    row[f'{book} Line Number'] = book_row['bet_points']
                    row[f'{book} Odds'] = book_row['price']
                else:
                    row[f'{book} Line Number'] = '-'
                    row[f'{book} Odds'] = '-'
            
            # If we found at least one entry for this selection, add the row to our list
            if selection_present:
                cleaned_data.append(row)

    # Convert the list of rows into a DataFrame
    cleaned_props_df = pd.DataFrame(cleaned_data)
    return cleaned_props_df

# Then you would call this function, passing the DataFrame you've obtained after applying the filter_for_prizepicks function:
# cleaned_props_df = create_cleaned_props_df(filtered_df)


In [398]:
cleaned_props_df = create_cleaned_props_df(filtered_odds_df)
display(cleaned_props_df)

Unnamed: 0,Player Name,Selection,Market Name,PrizePicks Line Number,PrizePicks Odds,Underdog Fantasy Line Number,Underdog Fantasy Odds,Pinnacle Line Number,Pinnacle Odds,FanDuel Line Number,FanDuel Odds,Draftkings Line Number,Draftkings Odds,BetMGM Line Number,BetMGM Odds
0,Richie James,over,player_receiving_yards,8.5,-137.0,5.5,-137.0,4.5,-124.0,-,-,-,-,5.5,-110.0
1,Richie James,under,player_receiving_yards,8.5,-137.0,5.5,-137.0,4.5,-106.0,-,-,-,-,5.5,-120.0


### For Each Player On A Team, Find All Player Props And Construct A Prop Line And Odds Comparion Table

In [402]:
def get_every_players_props(team_id):
    # Get all players from a specific team
    players = get_players_from_specific_team(team_id)
    
    # This will store all props for all players
    all_props = []

    # Iterate through each player
    for player in players:
        # Assume player['player_id'] is how we can access the player's ID
        player_props = get_player_odds(player['player_id'])
        
        # If there are props for the player, add to our list
        if player_props is not None and not player_props.empty:
            all_props.append(player_props)

    # Concatenate all the props into a single DataFrame
    all_props_df = pd.concat(all_props, ignore_index=True)

    # Group by 'market' and 'player_id', and reset the index
    grouped_props_df = all_props_df.sort_values(by=['market', 'player_id']).reset_index(drop=True)

    return grouped_props_df