In [18]:
import os
import pandas as pd

# Define paths
advanced_team_stats_path = 'advancedteamstats'
last_5_seasons_records_path = 'last5seasonsrecords'

# Helper function to load and preprocess data
def load_and_preprocess(folder_path, additional_columns=None):
    all_data = []
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)
            # Extract team and year from filename
            team_name = file.split('_')[0]
            year = file.split('_')[-1].split('.')[0]
            df['Team'] = team_name
            df['Year'] = year
            
            # Pivot the `Category` column into individual columns
            if 'Category' in df.columns and 'Team' not in df.columns:
                df = df.pivot(index=['Team', 'Year'], columns='Category', values='Lg').reset_index()
            
            # Select additional columns if specified
            if additional_columns:
                df = df[['Team', 'Year'] + additional_columns]
            
            all_data.append(df)
    return pd.concat(all_data, ignore_index=True)

# Load advanced team stats
advanced_team_data = load_and_preprocess(
    advanced_team_stats_path,
    additional_columns=['NetRtg', 'W/L%', 'ORtg', 'DRtg']
)

# Load last 5 seasons' records
last_5_seasons_data = load_and_preprocess(
    last_5_seasons_records_path,
    additional_columns=['W/L%', 'Win Conf', 'Win Finals']
)

# Merge datasets on Team and Year
merged_data = pd.merge(
    advanced_team_data, last_5_seasons_data,
    on=['Team', 'Year'], how='inner'
)

# Display merged dataset structure
print(merged_data.head())

# Example: Simple Predictor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Select features and target
X = merged_data[['NetRtg', 'ORtg', 'DRtg', 'W/L%']]
y = merged_data['Win Finals']  # Assuming `Win Finals` is binary (1 for champions, 0 otherwise)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

# Feature importance
importances = pd.Series(model.feature_importances_, index=X.columns)
print("Feature Importances:")
print(importances.sort_values(ascending=False))


KeyError: "['NetRtg', 'W/L%', 'ORtg', 'DRtg'] not in index"

In [23]:
import os
import pandas as pd

def load_and_preprocess(folder_path, additional_columns=None):
    all_data = []

    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)

        # Handle files with metrics in `Category` and values in `TeamR`
        if 'Category' in df.columns and 'TeamR' in df.columns:
            # Create a mapping from `Category` to `TeamR`
            category_map = df.set_index('Category')['TeamR'].to_dict()

            # Add additional columns based on the mapping
            for col in additional_columns:
                if col in category_map:
                    df[col] = df['Category'].map(lambda x: category_map.get(x, None))

        # Add Year and Team columns if available
        df['Team'] = df['Team'].iloc[0] if 'Team' in df.columns else file_name.split('_')[0]
        df['Year'] = df['Year'].iloc[0] if 'Year' in df.columns else file_name.split('_')[1]

        # Keep only relevant columns
        if additional_columns:
            relevant_columns = ['Team', 'Year'] + additional_columns
            df = df[df.columns.intersection(relevant_columns)]

        all_data.append(df)

    # Combine all data into a single DataFrame
    return pd.concat(all_data, ignore_index=True)

# Define paths and columns
advanced_team_stats_path = "advancedteamstats"
additional_columns = ['NetRtg', 'W/L%', 'ORtg', 'DRtg', 'Lg Rank']

# Load and preprocess
advanced_team_data = load_and_preprocess(
    advanced_team_stats_path,
    additional_columns=additional_columns
)

# Display results
print(advanced_team_data.head())


   Lg Rank  Year    Team NetRtg   ORtg   DRtg
0      1.0  2024  Boston     64     64     64
1     30.0  2024  Boston     18     18     18
2      1.0  2024  Boston     66     66     66
3      1.0  2024  Boston     16     16     16
4     29.0  2024  Boston  11.34  11.34  11.34


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Function to load and preprocess data
def load_and_preprocess(folder_path, additional_columns=None):
    import os
    all_data = []
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)

            # Add year and team name from file name if not in columns
            if 'Year' not in df.columns:
                df['Year'] = file.split("_")[-1].split(".")[0]
            if 'Team' not in df.columns:
                df['Team'] = file.split("_")[0]

            # Handle mapping for advanced stats
            if 'Category' in df.columns:
                category_map = df.set_index('Category')['TeamR'].to_dict()
                if additional_columns:
                    for col in additional_columns:
                        if col in category_map:
                            df[col] = df['Category'].map(lambda x: category_map.get(x, None))

            all_data.append(df)

    return pd.concat(all_data, ignore_index=True)


# Paths to data folders
advanced_team_stats_path = "advancedteamstats"
last_5_seasons_path = "last5seasonsrecords"

# Load datasets
advanced_team_data = load_and_preprocess(
    advanced_team_stats_path, additional_columns=["NetRtg", "ORtg", "DRtg"]
)
records_data = load_and_preprocess(last_5_seasons_path, additional_columns=["W/L%"])

# Merge datasets
final_data = pd.merge(
    advanced_team_data,
    records_data,
    on=["Team", "Year"],
    how="inner"
)

# Select features and target
features = ["NetRtg", "ORtg", "DRtg", "W/L%"]
target = "Win Finals"

# Drop rows with missing data in selected features or target
final_data.dropna(subset=features + [target], inplace=True)

# Define X and y
X = final_data[features]
y = final_data[target]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load conference standings and advanced stats directly
last_season_records_path = "last5seasonsrecords"  # Replace with actual path
advanced_team_stats_path = "advancedteamstats"   # Replace with actual path

def load_data():
    # Load conference standings
    eastern_standings = pd.read_csv(f"{last_season_records_path}/NBA_2024_Eastern_Conference_Standings.csv")
    western_standings = pd.read_csv(f"{last_season_records_path}/NBA_2024_Western_Conference_Standings.csv")
    standings = pd.concat([eastern_standings, western_standings], ignore_index=True)

    # Ensure the column name is consistent
    standings.rename(columns={"Eastern Conference": "Team", "Western Conference": "Team"}, inplace=True)

    # Load advanced stats
    advanced_stats = pd.read_csv(f"{advanced_team_stats_path}/Boston_Celtics_2024_Team_Misc_Stats.csv")
    advanced_stats.rename(columns={"Team.": "Team"}, inplace=True)

    # Debug column names
    print(standings.columns)
    print(advanced_stats.columns)

    # Merge data on relevant columns
    data = standings.merge(advanced_stats, how="inner", left_on="Team", right_on="Team")

    return data

data = load_data()
print(data.head())


# Define features and target
features = ["W/L%", "NetRtg", "Lg Rank"]  # Adjust based on columns available
target = "Win Finals"  # Predict whether a team wins the finals

# Clean and process data
data = data[features + [target]].dropna()  # Drop rows with missing values
X = data[features]
y = data[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Index(['Conference', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS'], dtype='object')
Index(['Category', 'TeamR', 'Lg Rank', 'Year', 'Team'], dtype='object')
Empty DataFrame
Columns: [Conference, Team, W, L, W/L%, GB, PS/G, PA/G, SRS, Category, TeamR, Lg Rank, Year]
Index: []


KeyError: "['NetRtg', 'Win Finals'] not in index"

In [2]:
print(data.columns)


Index(['Conference', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS',
       'Category', 'TeamR', 'Lg Rank', 'Year'],
      dtype='object')
