# Sports Betting Data Cleaning Walkthrough

### Package Imports

In [301]:
from OddsJamClient import OddsJamClient;
from dotenv import load_dotenv
import os, requests, datetime, json
import pandas as pd

### Load Env Vars

In [302]:
# Load variables from a .env file into the process environment, then read the
# OddsJam API key from it. os.getenv returns None when the variable is unset.
load_dotenv()
ODDSJAM_API_KEY = os.getenv("ODDSJAM_API_KEY")

### Initialize Clients

In [303]:
# Build the OddsJam SDK client from the API key and call UseV2() on it
# (presumably this selects the v2 API -- confirm against OddsJamClient).
# NOTE(review): the helper functions visible in this notebook call the REST
# endpoints directly through `requests` and do not use this client.
Client = OddsJamClient(ODDSJAM_API_KEY);
Client.UseV2();

### Get Games For Today

In [344]:
def get_nba_games_as_dataframe(league="nba", sport="basketball", game_date=None):
    """Fetch the games of one league for one date from the OddsJam v2 ``/games`` endpoint.

    The defaults target the NBA, but ``league`` and ``sport`` are forwarded
    to the API unchanged (this notebook also calls it for the NFL).

    Args:
        league: Value sent as the ``league`` query parameter.
        sport: Value sent as the ``sport`` query parameter.
        game_date: Date as a ``MM/DD/YYYY`` string. When falsy, today's local
            date is used.

    Returns:
        A DataFrame with one row per game, sorted by ``start_date`` (rendered
        as ``YYYY-MM-DD HH:MM:SS`` strings), where the nested
        ``home_team_info`` / ``away_team_info`` columns are replaced by
        ``home_team_id`` / ``away_team_id``. An empty DataFrame is returned
        when the request does not answer with HTTP 200 or when the API
        reports no games for that date.

    Raises:
        ValueError: If ``game_date`` is given but is not ``MM/DD/YYYY``.
    """
    end_point = 'https://api-external.oddsjam.com/api/v2/games'

    # The API key travels as the `key` query parameter, not as a header.
    headers = {
        'Content-Type': 'application/json'
    }

    if not game_date:
        game_date = datetime.datetime.now().strftime('%Y-%m-%d')
    else:
        # Callers pass MM/DD/YYYY; the request is sent with YYYY-MM-DD.
        parsed_date = datetime.datetime.strptime(game_date, '%m/%d/%Y')
        game_date = parsed_date.strftime('%Y-%m-%d')

    params = {
        'league': league,
        'sport': sport,
        "include_team_info": True,
        "key": ODDSJAM_API_KEY,
        "game_date": game_date
    }

    # Make the GET request to the API
    response = requests.get(end_point, headers=headers, params=params)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # Same failure contract as the other fetch helpers in this notebook.
        return pd.DataFrame()

    df = pd.DataFrame(response.json()["data"])
    if df.empty:
        # No games on that date: there are no columns to post-process.
        return df

    # Normalise the start time to a plain, sortable string
    df['start_date'] = pd.to_datetime(df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S')
    df = df.sort_values(by='start_date').reset_index(drop=True)

    # Keep only the team ids from the nested team-info dictionaries
    df['home_team_id'] = df['home_team_info'].apply(lambda x: x['id'])
    df['away_team_id'] = df['away_team_info'].apply(lambda x: x['id'])

    # Drop the now-redundant '_team_info' columns
    df = df.drop(columns=['home_team_info', 'away_team_info'])
    return df


In [346]:
# Despite the function's name, league/sport are overridden here to fetch NFL games.
# game_date is passed in MM/DD/YYYY form, which the function converts before the request.
games_df = get_nba_games_as_dataframe(league="NFL", sport="football", game_date="01/21/2024")



In [347]:
display(games_df)

Unnamed: 0,id,start_date,home_team,away_team,is_live,is_popular,tournament,status,sport,league,home_team_id,away_team_id
0,21473-42288-24-03,2024-01-21 15:00:00,Detroit Lions,Tampa Bay Buccaneers,False,False,,unplayed,football,NFL,43412DC9CDCA,4E7DB4C57393
1,16341-13602-24-03,2024-01-21 18:30:00,Buffalo Bills,Kansas City Chiefs,False,False,,unplayed,football,NFL,0787D09E47B9,2D71E5BA64A5


### Segment The Games Into Sessions 
- A session opens with the earliest remaining game and includes every game that starts within 2 hours of that first game.
- Sessions are mutually exclusive: no game belongs to more than one session.

In [348]:
def segment_games_into_sessions(games_df, session_window_hours=2):
    """Split games into consecutive, mutually exclusive sessions by start time.

    Games are ordered by ``start_date``. A session opens with the earliest
    remaining game and takes in every following game that starts no more than
    ``session_window_hours`` after that opening game. The first game outside
    the window opens the next session.

    Args:
        games_df: DataFrame with a ``start_date`` column that
            ``pd.to_datetime`` can parse. It is not modified.
        session_window_hours: Maximum gap, in hours, between a session's
            first game and any other game in the same session.

    Returns:
        A list of DataFrames, one per session, in chronological order. Row
        labels are the positions in the time-sorted games. The list is empty
        when ``games_df`` has no rows.
    """
    # Convert on a derived frame so the caller's `start_date` column keeps
    # whatever type it had (the original assigned back into `games_df`).
    sorted_games_df = (
        games_df.assign(start_date=pd.to_datetime(games_df['start_date']))
        .sort_values(by='start_date')
        .reset_index(drop=True)
    )
    window_seconds = session_window_hours * 3600

    # Completed sessions, each a list of row Series
    sessions = []
    # Rows collected for the session currently being built
    current_session = []
    # Start time of the first game in the current session
    session_start = None

    for _, game in sorted_games_df.iterrows():
        game_start = game['start_date']

        # Close the running session once a game falls outside its window.
        if current_session and not (
            (game_start - session_start).total_seconds() <= window_seconds
        ):
            sessions.append(current_session)
            current_session = []

        # The first game of a session fixes the reference time for the window.
        if not current_session:
            session_start = game_start
        current_session.append(game)

    # Flush the trailing session, if any
    if current_session:
        sessions.append(current_session)

    # One DataFrame per session for easier downstream handling
    return [pd.DataFrame(session) for session in sessions]

In [349]:
# Example usage:
# games_df is the DataFrame of games fetched above; it must have a 'start_date' column.
segmented_sessions = segment_games_into_sessions(games_df)

# The result is a list of DataFrames, one per session: segmented_sessions[0], segmented_sessions[1], etc.
# Its length is the number of sessions found.
len(segmented_sessions)

2

### Get Players From A Specific Team

In [350]:
def get_players_from_specific_team(team_id: str, league="nba", sport="basketball") -> pd.DataFrame:
    """Return the player list for one team from the OddsJam v2 ``/players/list`` endpoint.

    Args:
        team_id: Team identifier, sent as the ``team`` query parameter.
        league: Accepted for signature compatibility; currently not sent.
        sport: Accepted for signature compatibility; currently not sent.

    Returns:
        A DataFrame built from the ``data`` field of the JSON response, or an
        empty DataFrame (after printing the status code) when the response
        status is not 200.
    """
    # The API key is passed as the `key` query parameter.
    response = requests.get(
        'https://api-external.oddsjam.com/api/v2/players/list',
        headers={'Content-Type': 'application/json'},
        params={'team': team_id, 'key': ODDSJAM_API_KEY},
    )

    # Guard clause: report the failure and hand back an empty frame.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    payload = response.json()
    return pd.DataFrame(payload["data"])

### Testing That The Functionality Works As Envisioned
* Need to get the team IDs (`home_team_id` / `away_team_id`) of a game from the segmented_sessions

In [351]:
display(segmented_sessions[1])

Unnamed: 0,id,start_date,home_team,away_team,is_live,is_popular,tournament,status,sport,league,home_team_id,away_team_id
1,16341-13602-24-03,2024-01-21 18:30:00,Buffalo Bills,Kansas City Chiefs,False,False,,unplayed,football,NFL,0787D09E47B9,2D71E5BA64A5


In [352]:
# Roster lookup for the home team of the first game in the second session
# (league/sport are left at their defaults; the function does not send them).
get_players_from_specific_team(segmented_sessions[1].iloc[0]['home_team_id'])

Unnamed: 0,id,player_name,first_name,last_name,team_name,team_id,number,position,age,height,weight,is_active,sport,league
0,4373AA4A99DA,AJ Epenesa,AJ,Epenesa,Buffalo Bills,0787D09E47B9,57,DE,25,78,260,True,football,NFL
1,699FB8870036,A.J. Klein,A.J.,Klein,Buffalo Bills,0787D09E47B9,52,LB,32,73,240,True,football,NFL
2,8700AC1341AE,Alec Anderson,Alec,Anderson,Buffalo Bills,0787D09E47B9,70,OT,24,77,305,True,football,NFL
3,AB17E73FC51C,Andrew Brown,Andrew,Brown,Buffalo Bills,0787D09E47B9,94,DT,28,75,296,True,football,NFL
4,23F50B2FDB63,Andy Isabella,Andy,Isabella,Buffalo Bills,0787D09E47B9,87,WR,27,69,188,True,football,NFL
5,95A36321FB28,Baylon Spector,Baylon,Spector,Buffalo Bills,0787D09E47B9,54,LB,25,72,233,True,football,NFL
6,9C2E2D1DBF32,Bryan Thompson,Bryan,Thompson,Buffalo Bills,0787D09E47B9,89,WR,24,75,200,True,football,NFL
7,9F72CB1D12CA,Cam Lewis,Cam,Lewis,Buffalo Bills,0787D09E47B9,39,CB,26,69,183,True,football,NFL
8,0DE3040D799C,Christian Benford,Christian,Benford,Buffalo Bills,0787D09E47B9,47,CB,23,73,205,True,football,NFL
9,716E83329EC9,Connor McGovern,Connor,McGovern,Buffalo Bills,0787D09E47B9,66,G,26,77,308,True,football,NFL


In [353]:
# Same lookup for the away team of that game; kept in `temp` for inspection below.
temp = get_players_from_specific_team(segmented_sessions[1].iloc[0]['away_team_id'])

In [354]:
temp

Unnamed: 0,id,player_name,first_name,last_name,team_name,team_id,number,position,age,height,weight,is_active,sport,league
0,E5C34D6B9C9F,Austin Reiter,Austin,Reiter,Kansas City Chiefs,2D71E5BA64A5,61,C,32,75,301,True,football,NFL
1,E56402C27338,BJ Thompson,BJ,Thompson,Kansas City Chiefs,2D71E5BA64A5,53,DE,24,78,243,True,football,NFL
2,C81EE2171A61,Blaine Gabbert,Blaine,Gabbert,Kansas City Chiefs,2D71E5BA64A5,9,QB,34,77,235,True,football,NFL
3,5E20D4B3B659,Blake Bell,Blake,Bell,Kansas City Chiefs,2D71E5BA64A5,81,TE,32,78,252,True,football,NFL
4,AF4F0AEC9C2D,Bryan Cook,Bryan,Cook,Kansas City Chiefs,2D71E5BA64A5,6,S,24,73,206,True,football,NFL
5,3011666496C1,Cam Jones,Cam,Jones,Kansas City Chiefs,2D71E5BA64A5,44,LB,24,74,227,True,football,NFL
6,EB160C59A1AB,Chamarri Conner,Chamarri,Conner,Kansas City Chiefs,2D71E5BA64A5,27,S,23,72,206,True,football,NFL
7,06C2533FCD93,Charles Omenihu,Charles,Omenihu,Kansas City Chiefs,2D71E5BA64A5,90,DE,26,77,280,True,football,NFL
8,34ABD726CED5,Chris Jones,Chris,Jones,Kansas City Chiefs,2D71E5BA64A5,95,DT,29,78,310,True,football,NFL
9,B10598F82D4A,Chris Oladokun,Chris,Oladokun,Kansas City Chiefs,2D71E5BA64A5,13,QB,26,74,195,True,football,NFL


### Get All Available Player Props For Each Team, Given There's A PrizePicks Line

In [355]:
def get_player_odds(player_id: str, sportsbook: list, league="nba", sport="basketball") -> pd.DataFrame:
    """Query the OddsJam v2 ``/game-odds`` endpoint for one player.

    Args:
        player_id: Player identifier, sent as the ``player_id`` query parameter.
        sportsbook: Sportsbook names, sent as the ``sportsbook`` query parameter.
        league: Accepted for signature compatibility; currently not sent.
        sport: Accepted for signature compatibility; currently not sent.

    Returns:
        A DataFrame built from the ``data`` field of the JSON response, or an
        empty DataFrame (after printing the failure reason and the response
        object) when the response status is not 200.
    """
    query = {
        'player_id': player_id,
        "sportsbook": sportsbook,
        # The API key is passed as the `key` query parameter.
        'key': ODDSJAM_API_KEY,
    }
    response = requests.get(
        'https://api-external.oddsjam.com/api/v2/game-odds',
        headers={'Content-Type': 'application/json'},
        params=query,
    )

    # Guard clause: report the failure and hand back an empty frame.
    if response.status_code != 200:
        print(f"Error: {response.reason}")
        print(response)
        return pd.DataFrame()

    payload = response.json()
    return pd.DataFrame(payload["data"])

### Test That The Function Pulls Odds Correctly

In [356]:
# Hard-coded player id, queried against the listed sportsbooks.
# NOTE(review): this id is not among the roster rows displayed above -- confirm which player it is.
odds = get_player_odds("668BC2EDB642", ["PrizePicks", "Underdog Fantasy", "Pinnacle", "FanDuel", "Draftkings", "BetMGM"])

In [357]:
odds.shape

(1, 11)

In [358]:
# Pull the nested `odds` value out of the first (and here only) returned game row,
# then echo the full frame. In the output below that value is an empty list.
books = odds.iloc[0].odds
odds

Unnamed: 0,id,start_date,home_team,away_team,is_live,is_popular,tournament,status,sport,league,odds
0,16341-13602-24-03,2024-01-21T23:30:00+00:00,Buffalo Bills,Kansas City Chiefs,False,False,,unplayed,football,NFL,[]


In [359]:
# Turn the nested odds entries into their own DataFrame and inspect it.
# With the empty list above, the resulting frame has shape (0, 0).
books_df = pd.DataFrame(books)
display(books_df.head())
print(books_df.shape)

(0, 0)


### Create Cleaned Sportsbook Data Frame
* Each row will have PrizePicks and show the odds and the line for a couple of other highly trusted sportsbooks.
* This makes it seamless to compare PrizePicks player projections to the sentiment of the trusted market.

In [327]:
def filter_for_prizepicks(odds_df):
    """Keep only the odds rows whose market also has a PrizePicks line.

    Args:
        odds_df: DataFrame with a ``market`` column (the grouping key) and a
            ``sports_book_name`` column. It may be empty, including an empty
            frame with no columns at all.

    Returns:
        The rows of ``odds_df``, sorted by ``market``, restricted to markets
        in which at least one row has ``sports_book_name == 'PrizePicks'``.
        Original row labels are kept. A column-less empty DataFrame is
        returned when ``odds_df`` is empty or no market has a PrizePicks row.
    """
    # An empty frame may not even have a 'market' column to sort by.
    if odds_df.empty:
        return pd.DataFrame()

    # Sort by 'market' so rows of the same market end up next to each other.
    sorted_odds_df = odds_df.sort_values(by='market')

    # Markets that contain at least one PrizePicks entry. Rows without a
    # market are never selected (groupby skipped them in the original too).
    is_prizepicks = sorted_odds_df['sports_book_name'] == 'PrizePicks'
    prizepicks_markets = sorted_odds_df.loc[is_prizepicks, 'market'].dropna().unique()
    if len(prizepicks_markets) == 0:
        return pd.DataFrame()

    return sorted_odds_df[sorted_odds_df['market'].isin(prizepicks_markets)]

### Testing the functionality

In [328]:
# Keep only the markets for which PrizePicks offers a line.
filtered_odds_df = filter_for_prizepicks(books_df)

In [329]:
filtered_odds_df

In [None]:
def test_other_books_presence(original_df, filtered_df):
    """Report which non-PrizePicks sportsbooks quote each filtered market.

    Args:
        original_df: Unfiltered odds DataFrame with ``market`` and
            ``sports_book_name`` columns.
        filtered_df: Odds DataFrame after filtering. It may be an empty frame
            without a ``market`` column.

    Returns:
        A dict mapping each market found in ``filtered_df`` to the list of
        sportsbook names other than ``'PrizePicks'`` that have rows for that
        market in ``original_df``. Markets with no other sportsbook are left
        out, so an empty dict means none of the filtered markets is quoted
        by any other book (or that ``filtered_df`` holds no markets).
    """
    # A column-less empty frame contains no markets to check.
    if 'market' not in filtered_df.columns:
        return {}

    other_books = {}
    for market in filtered_df['market'].unique():
        in_market = original_df['market'] == market
        books_in_market = original_df.loc[in_market, 'sports_book_name'].unique()
        non_prizepicks = [book for book in books_in_market if book != 'PrizePicks']
        if non_prizepicks:
            other_books[market] = non_prizepicks

    return other_books


# The `test_` prefix is only a naming choice: this helper takes DataFrames, not
# fixtures, so opt it out of pytest collection.
test_other_books_presence.__test__ = False


In [None]:
# Usage:
# 'books_df' is the DataFrame before filtering and 'filtered_odds_df' is after filtering.
missing_books = test_other_books_presence(books_df, filtered_odds_df)
# The dict maps each filtered market to the non-PrizePicks books that also quote it;
# an empty dict means no other sportsbook has a line for any of those markets.
print(missing_books)

['player_minutes_played']
{}
