In [20]:
# ParlAid V1

import os, joblib, warnings
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

warnings.filterwarnings("ignore")

INPUT_PATH = "/Users/mayanktyagi/Downloads/nba_stats_recent_years.csv"  #change the input path to whatever path your computer uses since u have the frontend
df = pd.read_csv(INPUT_PATH)

# Convert main stat column (points)
df['PTS'] = pd.to_numeric(df.get('PTS', df.columns[0]), errors='coerce')
df['stat'] = df['PTS'].fillna(df['PTS'].mean())

# Handle minutes
min_col = None
for c in df.columns:
    if c.lower() == 'min' or 'min' in c.lower() or 'minutes' in c.lower():
        min_col = c
        break

def parse_minutes(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    if ':' in s:
        try:
            mm, ss = s.split(':')
            return float(mm) + float(ss)/60.0
        except:
            return np.nan
    s2 = ''.join(ch for ch in s if (ch.isdigit() or ch=='.' or ch=='-'))
    try:
        return float(s2) if s2 != '' else np.nan
    except:
        return np.nan

if min_col:
    df['minutes'] = df[min_col].apply(parse_minutes)
else:
    df['minutes'] = 30.0
df['minutes'] = df['minutes'].fillna(30.0)

# Ensure identifiers exist
if 'Player' not in df.columns: df['Player'] = 'player_' + df.index.astype(str)
if 'Team' not in df.columns: df['Team'] = 'TEAM'
df.rename(columns={'Player': 'player', 'Team': 'team'}, inplace=True)

# Handle date or create synthetic one
date_col = None
for c in df.columns:
    if 'date' in c.lower():
        date_col = c
        break
if date_col:
    df['date'] = pd.to_datetime(df[date_col], errors='coerce')
else:
    df['date'] = pd.to_datetime('2021-01-01') + pd.to_timedelta(np.arange(len(df)) % 365, unit='D')

df = df.sort_values(['player', 'date']).reset_index(drop=True)
df['stat_roll_3'] = df.groupby('player')['stat'].transform(lambda x: x.shift(1).rolling(3, min_periods=1).mean()).fillna(df['stat'].mean())
df['stat_roll_7'] = df.groupby('player')['stat'].transform(lambda x: x.shift(1).rolling(7, min_periods=1).mean()).fillna(df['stat'].mean())
df['stat_roll_30'] = df.groupby('player')['stat'].transform(lambda x: x.shift(1).rolling(30, min_periods=1).mean()).fillna(df['stat'].mean())
df['stat_per_min'] = df['stat'] / df['minutes']

possible_feats = ['stat_roll_3','stat_roll_7','stat_roll_30','stat_per_min','minutes',
                  'FGM','FGA','3PM','3PA','FTM','FTA','TRB','AST','STL','BLK','TO','PF']
feature_cols = [c for c in possible_feats if c in df.columns]

# Convert numbers properly (strip commas, %, etc.)
def clean_numeric(df_in):
    df_out = df_in.copy()
    for c in df_out.columns:
        df_out[c] = pd.to_numeric(df_out[c].astype(str).str.replace('%','').str.replace(',',''), errors='coerce').fillna(0.0)
    return df_out

X = clean_numeric(df[feature_cols])
y = df['stat'].astype(float).fillna(df['stat'].mean())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=50, max_depth=8, random_state=42)
model.fit(X_train, y_train)

preds = model.predict(X_test)
rmse = mean_squared_error(y_test, preds, squared=False)
r2 = r2_score(y_test, preds)

os.makedirs("parlaid_artifacts", exist_ok=True)
joblib.dump(model, "parlaid_artifacts/rf_quick_model.joblib")
joblib.dump(feature_cols, "parlaid_artifacts/feature_columns.joblib")

df.head(200).to_csv("parlaid_artifacts/parlaid_clean_preview.csv", index=False)

print(" Quick model trained and saved.")
print(f"Test RMSE: {rmse:.4f}, R2: {r2:.4f}")
print("Features used:", feature_cols[:10])

recent_data = df.groupby('player').tail(1).copy()
X_pred = clean_numeric(recent_data[feature_cols])
recent_data['predicted_next_stat'] = model.predict(X_pred)

predicted_path = "parlaid_artifacts/predicted_next_game_stats.csv"
recent_data.to_csv(predicted_path, index=False)
print(f" Saved player predictions → {predicted_path}")

print("\n Building parlay recommendations...")

df_preds = pd.read_csv(predicted_path)

# each player's season avg
season_avg = df_preds.groupby('player')['stat'].mean().reset_index()
season_avg.rename(columns={'stat':'season_avg'}, inplace=True)
df_preds = df_preds.merge(season_avg, on='player', how='left')

# Improvement vs average
df_preds['improvement'] = df_preds['predicted_next_stat'] - df_preds['season_avg']
df_preds['percent_diff'] = 100 * df_preds['improvement'] / df_preds['season_avg'].replace(0, np.nan)

# Confidence score
df_preds['recent_consistency'] = np.exp(-np.abs(df_preds['stat'] - df_preds['season_avg']) / df_preds['season_avg'])
df_preds['confidence'] = (0.6 * df_preds['recent_consistency'] + 0.4 * (df_preds['percent_diff'] / 100)).clip(0, 1)

# Filter and rank
parlay_candidates = df_preds[(df_preds['improvement'] > 0) & (df_preds['confidence'] > 0.5)]
parlay_top = parlay_candidates.sort_values('confidence', ascending=False).head(10)

print("\n Top 10 Parlay Picks — Predicted to Beat Season Average:")
print(parlay_top[['player','team','predicted_next_stat','season_avg','improvement','confidence']])

# Save results
df_preds.to_csv("parlaid_artifacts/all_player_predictions_with_confidence.csv", index=False)
parlay_top.to_csv("parlaid_artifacts/parlay_recommendations.csv", index=False)

avg_conf = parlay_top['confidence'].mean() if not parlay_top.empty else 0
avg_diff = parlay_top['percent_diff'].mean() if not parlay_top.empty else 0

print("\n Saved outputs:")
print(" - All player predictions → parlai_artifacts/all_player_predictions_with_confidence.csv")
print(" - Top parlay picks        → parlai_artifacts/parlay_recommendations.csv")
print(f" Avg Confidence: {avg_conf:.2f} | Avg Improvement: {avg_diff:.1f}%")

 Quick model trained and saved.
Test RMSE: 26.7143, R2: 0.9800
Features used: ['stat_roll_3', 'stat_roll_7', 'stat_roll_30', 'stat_per_min', 'minutes', 'FGM', 'FGA', '3PM', '3PA', 'FTM']
 Saved player predictions → parlaid_artifacts/predicted_next_game_stats.csv

 Building parlay recommendations...

 Top 10 Parlay Picks — Predicted to Beat Season Average:
                 player team  predicted_next_stat  season_avg  improvement  \
14   Andre Jackson, Jr.  MIL           148.160000  125.000000    23.160000   
303       Norman Powell  LAC           693.113897  595.261477    97.852420   
250         Kris Murray  POR           437.402722  376.000000    61.402722   
184       James Johnson  BRK           391.247451  340.000000    51.247451   
370    Tristan Thompson  CLE           182.920000  161.000000    21.920000   
34         Blake Wesley  SAS           300.036976  268.000000    32.036976   
249           Kris Dunn  UTA           398.448181  357.000000    41.448181   
33      Bismack Bi

In [24]:
#Parlaid V2
# NBA Player Performance Predictor (this one js got Points, Rebounds, Assists; imma add more later and advance my model later)
import os, joblib, warnings
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
warnings.filterwarnings("ignore")

INPUT_PATH = "/Users/mayanktyagi/Downloads/nba_stats_recent_years.csv" #change the input path to whatever path your computer uses since u have the frontend
df = pd.read_csv(INPUT_PATH)

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(str).str.replace(',', '').str.replace('%', '')
    try:
        df[col] = pd.to_numeric(df[col], errors='ignore')
    except:
        pass

for key in ['Player', 'Team']:
    if key not in df.columns:
        df[key] = f'{key}_unknown'

df.rename(columns={'Player':'player','Team':'team'}, inplace=True)

def parse_minutes(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    if ':' in s:
        try:
            mm, ss = s.split(':'); return float(mm) + float(ss)/60.0
        except: return np.nan
    try: return float(s)
    except: return np.nan

min_col = None
for c in df.columns:
    if 'min' in c.lower(): min_col = c; break
df['minutes'] = df[min_col].apply(parse_minutes) if min_col else 30.0
df['minutes'] = df['minutes'].fillna(30.0)

date_col = None
for c in df.columns:
    if 'date' in c.lower(): date_col = c; break
if date_col:
    df['date'] = pd.to_datetime(df[date_col], errors='coerce')
else:
    df['date'] = pd.to_datetime('2021-01-01') + pd.to_timedelta(np.arange(len(df))%365, unit='D')

# target stats (will add more later)
target_stats = ['PTS', 'TRB', 'AST']
available_targets = [t for t in target_stats if t in df.columns]
if not available_targets:
    raise ValueError("No PTS/TRB/AST columns found in your data!")

possible_feats = ['minutes','FGM','FGA','3PM','3PA','FTM','FTA','TRB','AST','STL','BLK','TO','PF']
feature_cols = [c for c in possible_feats if c in df.columns]
X_base = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

df = df.sort_values(['player','date']).reset_index(drop=True)
for stat in available_targets:
    df[f'{stat}_roll_3'] = df.groupby('player')[stat].transform(lambda x: x.shift(1).rolling(3,min_periods=1).mean())
    df[f'{stat}_roll_7'] = df.groupby('player')[stat].transform(lambda x: x.shift(1).rolling(7,min_periods=1).mean())

results = {}
models = {}
os.makedirs("parlaid_artifacts", exist_ok=True)

for stat in available_targets:
    y = df[stat].fillna(df[stat].mean())
    X = pd.concat([X_base, df[[f'{stat}_roll_3', f'{stat}_roll_7']]], axis=1).fillna(0.0)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=60, max_depth=10, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)

    results[stat] = {'RMSE': rmse, 'R2': r2}
    models[stat] = model
    joblib.dump(model, f"parlaid_artifacts/{stat}_rf_model.joblib")

print("\n=== Model Performance Summary ===")
for k, v in results.items():
    print(f"{k}: RMSE={v['RMSE']:.2f}, R2={v['R2']:.3f}")

recent_data = df.groupby('player').tail(1).reset_index(drop=True)
predictions = []
for stat in available_targets:
    X_pred = pd.concat([recent_data[feature_cols],
                        recent_data[[f'{stat}_roll_3', f'{stat}_roll_7']]], axis=1).fillna(0.0)
    preds = models[stat].predict(X_pred)
    recent_data[f'pred_{stat}'] = preds
    predictions.append(preds)

recent_data['total_pred'] = sum(predictions)

#Top 10 predicted performers
top10 = recent_data[['player','team'] + [f'pred_{s}' for s in available_targets] + ['total_pred']].sort_values('total_pred', ascending=False).head(10)
print("\n=== Top 10 Predicted Player Performances ===")
print(top10.to_string(index=False))

# --- Save to file ---
top10_path = "parlaid_artifacts/top10_predictions.csv"
top10.to_csv(top10_path, index=False)
print(f"\nSaved Top 10 parlay predictions to {top10_path}")


=== Model Performance Summary ===
PTS: RMSE=374.87, R2=0.319
TRB: RMSE=143.82, R2=0.302
AST: RMSE=107.63, R2=0.360

=== Top 10 Predicted Player Performances ===
                 player team    pred_PTS   pred_TRB   pred_AST  total_pred
  Giannis Antetokounmpo  MIL 2003.300000 845.320000 449.492311 3298.112311
           Nikola Jokic  DEN 1956.850000 769.033333 563.116667 3289.000000
            Luka Doncic  DAL 1971.200000 624.734921 544.966667 3140.901587
           Jayson Tatum  BOS 1950.150000 609.484921 380.394722 2940.029643
             Trae Young  ATL 1988.416667 304.797713 605.500000 2898.714380
          Pascal Siakam  TOR 1814.520000 633.926587 412.103172 2860.549760
          Anthony Davis  LAL 1806.783333 772.433333 239.969188 2819.185854
Shai Gilgeous-Alexander  OKC 1932.650000 417.208889 433.772073 2783.630962
         Paolo Banchero  ORL 1775.564444 616.929365 371.127500 2763.621310
           Kevin Durant  PHX 1847.666667 514.075952 386.868357 2748.610977

Saved Top 10