In [7]:
import pandas as pd

# Load the dataset
input_file = "CopyOfData.csv"  # Replace with the path to your file
output_file = "basketball_games_with_season.csv"  # Output file path

# Read the data into a DataFrame
df = pd.read_csv(input_file)

# Convert the `date` column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Function to calculate the season
def get_season(date):
    year = date.year
    if date.month >= 10:  # October to December belong to the current year's season
        return f"{year}-{year + 1}"
    else:  # January to June belong to the previous year's season
        return f"{year - 1}-{year}"

# Apply the season calculation to the DataFrame
df['season'] = df['date'].apply(get_season)

# Save the updated DataFrame back to a new CSV file
df.to_csv(output_file, index=False)

print(f"Dataset updated with 'season' column and saved to {output_file}")

Dataset updated with 'season' column and saved to basketball_games_with_season.csv


In [8]:
import pandas as pd

# Load the dataset
input_file = "basketball_games_with_season.csv"  # Replace with your file path
output_file = "basketball_games_with_win_percentage.csv"

# Read the data into a DataFrame
df = pd.read_csv(input_file)

# Convert the `date` column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Sort the data by season and date
df = df.sort_values(by=['season', 'date']).reset_index(drop=True)

# Function to calculate win percentages for a single season
def calculate_win_percentage_for_season(season_df):
    # Initialize team stats for the season
    team_stats = {}
    
    # Function to calculate dynamic win percentage
    def calculate_win_percentage(row):
        nonlocal team_stats

        # Extract teams
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        winner = row['winner']

        # Initialize teams in the stats dictionary if not already present
        for team in [home_team, away_team]:
            if team not in team_stats:
                team_stats[team] = {'wins': 0, 'games': 0}

        # Update games played for both teams
        team_stats[home_team]['games'] += 1
        team_stats[away_team]['games'] += 1

        # Update wins for the winner
        if winner == home_team:
            team_stats[home_team]['wins'] += 1
        elif winner == away_team:
            team_stats[away_team]['wins'] += 1

        # Calculate win percentage for both teams
        home_win_pct = (team_stats[home_team]['wins'] / team_stats[home_team]['games']) * 100
        away_win_pct = (team_stats[away_team]['wins'] / team_stats[away_team]['games']) * 100

        return pd.Series([home_win_pct, away_win_pct])

    # Apply the calculation to the season's DataFrame
    season_df[['homeWinPct', 'awayWinPct']] = season_df.apply(calculate_win_percentage, axis=1)
    return season_df

# Apply the win percentage calculation for each season
df = df.groupby('season', group_keys=False).apply(calculate_win_percentage_for_season)

# Save the updated dataset
df.to_csv(output_file, index=False)

print(f"Dataset updated with dynamic win percentages (resetting each season) and saved to {output_file}")

Dataset updated with dynamic win percentages (resetting each season) and saved to basketball_games_with_win_percentage.csv


  df = df.groupby('season', group_keys=False).apply(calculate_win_percentage_for_season)


In [9]:
import pandas as pd

# Load the dataset
input_file = "basketball_games_with_win_percentage.csv"  # Replace with your file path
output_file = "NBADatawithWinandPlayoff.csv"

# Read the data into a DataFrame
df = pd.read_csv(input_file)

# Convert the `date` column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Sort the data by season and date
df = df.sort_values(by=['season', 'date']).reset_index(drop=True)

# Function to classify playoff games
def classify_playoff_games(season_df):
    # Get the current season
    season = season_df['season'].iloc[0]
    
    # Determine total regular-season games for the season
    if season == "2020-21":
        total_regular_season_games = 1080  # 72 games per team
    elif season == "2019-20":
        total_regular_season_games = 971  # COVID-shortened season
    else:
        total_regular_season_games = 1230  # Full regular season (82 games per team)
    
    # Add a game number column for the season
    season_df['gameNumber'] = range(1, len(season_df) + 1)
    # Classify games as playoffs if they occur after the regular season
    season_df['isPlayoffGame'] = season_df['gameNumber'] > total_regular_season_games
    return season_df

# Apply the playoff classification for each season
df = df.groupby('season', group_keys=False).apply(classify_playoff_games)

# Save the updated dataset
df.to_csv(output_file, index=False)

print(f"Dataset updated with 'isPlayoffGame' column and saved to {output_file}")

  df = pd.read_csv(input_file)
  df = df.groupby('season', group_keys=False).apply(classify_playoff_games)


Dataset updated with 'isPlayoffGame' column and saved to NBADatawithWinandPlayoff.csv


In [10]:
import pandas as pd

# Load the dataset
input_file = "NBADatawithWinandPlayoff.csv"  # Replace with your file path
output_file = "nbsDataAllScrapedInfo.csv"

# Read the data into a DataFrame
df = pd.read_csv(input_file)

# Convert the `date` column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Sort the data by season and date
df = df.sort_values(by=['season', 'date']).reset_index(drop=True)

# Function to calculate win/loss records dynamically
def calculate_win_loss(season_df):
    # Initialize dictionaries to track wins and losses
    win_counts = {}
    loss_counts = {}

    # Columns to store wins and losses
    season_df['homeWins'] = 0
    season_df['homeLosses'] = 0
    season_df['awayWins'] = 0
    season_df['awayLosses'] = 0

    # Iterate over each game in the season
    for idx, row in season_df.iterrows():
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        winner = row['winner']

        # Initialize win/loss counts for teams if not already present
        if home_team not in win_counts:
            win_counts[home_team] = 0
            loss_counts[home_team] = 0
        if away_team not in win_counts:
            win_counts[away_team] = 0
            loss_counts[away_team] = 0

        # Update win/loss counts based on the winner
        if winner == home_team:
            win_counts[home_team] += 1
            loss_counts[away_team] += 1
        elif winner == away_team:
            win_counts[away_team] += 1
            loss_counts[home_team] += 1

        # Assign current win/loss counts to the DataFrame
        season_df.at[idx, 'homeWins'] = win_counts[home_team]
        season_df.at[idx, 'homeLosses'] = loss_counts[home_team]
        season_df.at[idx, 'awayWins'] = win_counts[away_team]
        season_df.at[idx, 'awayLosses'] = loss_counts[away_team]

    return season_df

# Apply the win/loss calculation for each season
df = df.groupby('season', group_keys=False).apply(calculate_win_loss)

# Save the updated dataset
df.to_csv(output_file, index=False)

print(f"Dataset updated with dynamic win/loss columns and saved to {output_file}")

  df = pd.read_csv(input_file)
  df = df.groupby('season', group_keys=False).apply(calculate_win_loss)


Dataset updated with dynamic win/loss columns and saved to nbsDataAllScrapedInfo.csv
