In [12]:
import pandas as pd
import os
from datetime import datetime
from dotenv import load_dotenv
from arize.pandas.logger import Client
from arize.utils.types import ModelTypes, Environments, Schema, Metrics


# Load variables from a local .env file (if present) into the process
# environment before the credentials are read below.
load_dotenv()

# Arize credentials are taken from the environment; each is None when unset.
arize_space_key = os.environ.get('ARIZE_SPACE_KEY')
arize_api_key = os.environ.get('ARIZE_API_KEY')

# Shared client used for every upload to Arize in this script.
arize_client = Client(
    api_key=arize_api_key,
    space_key=arize_space_key,
)

def extract_team_nickname(full_name):
    """Return the last whitespace-separated word of a team's full name.

    For example ``"Charlotte Hornets"`` becomes ``"Hornets"``. Only the final
    word is kept, so a multi-word nickname is reduced to its last word.

    Args:
        full_name: The team name as a string.

    Returns:
        The final word of ``full_name``. An empty string is returned when
        ``full_name`` is not a string (e.g. a missing CSV cell read as NaN)
        or contains no non-whitespace characters, instead of raising.
    """
    if not isinstance(full_name, str):
        return ""
    words = full_name.split()
    return words[-1] if words else ""
# def generate_prediction_id(home_team, visitor_team, game_date):
#     # Ensure game_date is a datetime object
#     if not isinstance(game_date, datetime):
#         # Parse game_date assuming it's a string in a known format, e.g., 'YYYY-MM-DD'
#         # Adjust the format string if your input format is different
#         game_date = datetime.strptime(game_date, '%Y-%m-%d')

#     # Set the time to midnight (00:00:00)
#     game_date = game_date.replace(hour=0, minute=0, second=0, microsecond=0)

#     # Format the date and time as 'YYYY-MM-DD 00:00:00'
#     formatted_datetime = game_date.strftime('%Y-%m-%d %H:%M:%S')

#     return f"{home_team}-{visitor_team}-{formatted_datetime}"
def generate_prediction_id(home_team, visitor_team, game_date):
    """Build a prediction ID of the form ``<home>-<visitor>-<date>``.

    Each argument is rendered with its default string formatting and the
    three pieces are joined with hyphens, in the order given.
    """
    return "{}-{}-{}".format(home_team, visitor_team, game_date)
# Column mapping for the upload: which DataFrame column carries the
# prediction ID and which carries the actual label.
_actual_schema_columns = {
    "prediction_id_column_name": "prediction_id",
    "actual_label_column_name": "actual_winner",
}
actual_schema = Schema(**_actual_schema_columns)


# Load the CSV files
game_stats_path = './game_stats.csv'
predictions_path = './predictions.csv'

# Read the CSV files into pandas DataFrames
game_stats_df = pd.read_csv(game_stats_path)
predictions_df = pd.read_csv(predictions_path)

# Standardize the date format in both DataFrames (plain date objects, no time)
game_stats_df['game_date'] = pd.to_datetime(game_stats_df['game_date']).dt.date
predictions_df['date'] = pd.to_datetime(predictions_df['date']).dt.date

# Reduce the full team names to nicknames so they can be compared with the
# team names stored in predictions_df
game_stats_df['team_full_name_home'] = game_stats_df['team_full_name_home'].apply(extract_team_nickname)
game_stats_df['team_full_name_visitor'] = game_stats_df['team_full_name_visitor'].apply(extract_team_nickname)

# Lower-case the nickname columns once instead of on every loop iteration
home_nicknames = game_stats_df['team_full_name_home'].str.lower()
visitor_nicknames = game_stats_df['team_full_name_visitor'].str.lower()

# Reset the actual winner for every prediction; it is filled in below
predictions_df['actual_winner'] = None

# Make sure the prediction_id column exists and can hold strings. IDs already
# present in the CSV (e.g. from a previous run) are kept.
if 'prediction_id' in predictions_df.columns:
    predictions_df['prediction_id'] = predictions_df['prediction_id'].astype(object)
else:
    predictions_df['prediction_id'] = None

# Iterate through predictions_df and update with actual winner
for index, row in predictions_df.iterrows():
    # Extract prediction details
    prediction_date = row['date']
    home_team = row['Home Team']  # Assumes this is a nickname
    visitor_team = row['Visitor Team']  # Assumes this is a nickname

    # A missing team cell is read as NaN (a float); it cannot match any game
    if not (isinstance(home_team, str) and isinstance(visitor_team, str)):
        print(f"No matching game found for date: {prediction_date}, Home Team: {home_team}, Visitor Team: {visitor_team}")
        continue

    # Find the corresponding game in game_stats_df. regex=False matches the
    # team name literally; na=False treats missing nicknames as non-matching.
    game = game_stats_df[
        (game_stats_df['game_date'] == prediction_date)
        & home_nicknames.str.contains(home_team.lower(), regex=False, na=False)
        & visitor_nicknames.str.contains(visitor_team.lower(), regex=False, na=False)
    ]

    if game.empty:
        print(f"No matching game found for date: {prediction_date}, Home Team: {home_team}, Visitor Team: {visitor_team}")
        continue

    matched_game = game.iloc[0]
    home_score = matched_game['game_home_team_score']
    visitor_score = matched_game['game_visitor_team_score']

    # Missing or equal scores do not identify a winner (NBA games cannot end
    # tied), so do not record one; previously the visitor was recorded.
    if pd.isna(home_score) or pd.isna(visitor_score) or home_score == visitor_score:
        print(f"No final score available for date: {prediction_date}, Home Team: {home_team}, Visitor Team: {visitor_team}")
        continue

    # Determine whether the home or visitor team won
    winning_team = home_team if home_score > visitor_score else visitor_team

    # Update the actual_winner and prediction_id in predictions_df. The date is
    # converted to a Timestamp, so it is rendered as 'YYYY-MM-DD 00:00:00'.
    predictions_df.at[index, 'actual_winner'] = winning_team
    predictions_df.at[index, 'prediction_id'] = generate_prediction_id(
        home_team, visitor_team, pd.to_datetime(matched_game['game_date'])
    )

# Only rows with a resolved winner carry an actual label and prediction ID, so
# only those are sent. The index is reset so the uploaded frame has a default
# 0..n-1 index after filtering.
actuals_df = predictions_df[predictions_df['actual_winner'].notna()].reset_index(drop=True)

if actuals_df.empty:
    response = None
    print("No actual results were resolved; nothing was logged to Arize.")
else:
    response = arize_client.log(
        dataframe=actuals_df,
        schema=actual_schema,
        model_id="nick-nba-game-predictor",
        model_version="1.0.0",
        model_type=ModelTypes.BINARY_CLASSIFICATION,
        environment=Environments.PRODUCTION,
    )

    # If successful, the server will return a status_code of 200
    if response.status_code != 200:
        print(
            f"❌ logging failed with response code {response.status_code}, {response.text}"
        )
    else:
        print(
            f"Step 4 ✅: You have successfully logged {len(actuals_df)} data points to Arize!"
        )

# Save the updated DataFrame, overwriting the existing predictions.csv file
predictions_df.to_csv(predictions_path, index=False)

# Display the first few rows of the updated predictions DataFrame for verification
print(predictions_df.head())


arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjox/spaces/U3BhY2U6NTc2Mw==/models/modelName/nick-nba-game-predictor?selectedTab=dataIngestion
Step 4 ✅: You have successfully logged 31 data points to Arize!
         date Start (ET) Visitor Team  PTS Home Team  PTS.1  Unnamed: 6  \
0  2023-12-08      7:00p      Raptors  NaN   Hornets    NaN         NaN   
1  2023-12-08      7:00p      Pistons  NaN     Magic    NaN         NaN   
2  2023-12-08      7:00p        Hawks  NaN     76ers    NaN         NaN   
3  2023-12-08      7:30p       Knicks  NaN   Celtics    NaN         NaN   
4  2023-12-08      7:30p      Wizards  NaN      Nets    NaN         NaN   

   Unnamed: 7  Attend.               Arena  Notes predicted_winner  \
0         NaN      NaN     Spectrum Center    NaN          Raptors   
1         NaN      NaN        Amway Center    NaN            Magic   
2         NaN      NaN  Wells Fargo Center  