In [95]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

In [96]:
pitching = pd.read_csv("../projectData/Pitching.csv")
people = pd.read_csv("../projectData/People.csv")
hof = pd.read_csv("../projectData/HallOfFame.csv")

In [97]:
pitching["IP"] = pitching["IPouts"] / 3
career = pitching.groupby("playerID").agg({
    "W": "sum",
    "L": "sum",
    "G": "sum",
    "GS": "sum",
    "CG": "sum",
    "SHO": "sum",
    "SV": "sum",
    "IPouts": "sum",
    "IP": "sum",
    "H": "sum",
    "ER": "sum",
    "HR": "sum",
    "BB": "sum",
    "SO": "sum",
    "IBB": "sum",
    "WP": "sum",
    "HBP": "sum",
    "BK": "sum",
    "BFP": "sum",
    "GF": "sum",
    "R": "sum",
    "SH": "sum",
    "SF": "sum",
    "GIDP": "sum"
}).reset_index()

In [98]:
# ERA
career["ERA_career"] = 9 * career["ER"] / career["IP"]

# WHIP
career["WHIP"] = (career["BB"] + career["H"]) / career["IP"]

# K/9, BB/9, HR/9
career["K9"]   = 9 * career["SO"] / career["IP"]
career["BB9"]  = 9 * career["BB"] / career["IP"]
career["HR9"]  = 9 * career["HR"] / career["IP"]

# K% and BB%
career["K_pct"]  = career["SO"] / career["BFP"]
career["BB_pct"] = career["BB"] / career["BFP"]

In [99]:
people["status"] = people["finalGame"].apply(
    lambda x: "retired" if pd.notna(x) else "active"
)


In [100]:
career = career.merge(
    people[["playerID", "nameGiven", "nameLast", "debut", "status"]],
    on="playerID",
    how="left"
)

career = career.dropna(subset=["ERA_career"])
career

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,IPouts,IP,...,WHIP,K9,BB9,HR9,K_pct,BB_pct,nameGiven,nameLast,debut,status
0,aardsda01,16,18,331,0,0,0,69,1011,337.000000,...,1.421365,9.080119,4.887240,1.094955,0.230508,0.124068,David Allan,Aardsma,2004-04-06,retired
1,aasedo01,66,60,448,91,22,5,82,3328,1109.333333,...,1.390024,5.200421,3.707632,0.722055,0.135518,0.096617,Donald William,Aase,1977-07-26,retired
2,abadfe01,9,29,406,6,0,0,2,1064,354.666667,...,1.322368,7.409774,3.197368,1.141917,0.192994,0.083278,Fernando Antonio,Abad,2010-07-28,retired
3,abbeybe01,22,40,79,65,52,0,1,1704,568.000000,...,1.545775,2.551056,3.042254,0.285211,0.062695,0.074766,Bert Wood,Abbey,1892-06-14,retired
4,abbeych01,0,0,1,0,0,0,0,6,2.000000,...,3.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Charles S.,Abbey,1893-08-16,retired
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10739,zulueyo01,0,0,12,0,0,0,0,49,16.333333,...,1.346939,11.020408,3.857143,1.102041,0.289855,0.101449,Yosver Jose,Zulueta,2024-06-25,active
10740,zumayjo01,13,12,171,0,0,0,5,629,209.666667,...,1.349762,9.014308,4.893482,0.772655,0.230516,0.125137,Joel Martin,Zumaya,2006-04-03,retired
10741,zuniggu01,0,0,17,0,0,0,2,59,19.666667,...,1.322034,7.322034,3.661017,2.288136,0.190476,0.095238,Guillermo Enrique,Zuñiga,2023-05-02,active
10742,zuverge01,32,36,265,31,9,2,40,1927,642.333333,...,1.343539,3.124546,2.844318,0.784639,0.081209,0.073926,George,Zuverink,1951-04-21,retired


In [101]:
active = career[career['status'] == 'active']

In [102]:
retired = career[career['status'] == 'retired'].copy()

In [103]:
retired = retired.merge(
    hof[["playerID", "inducted"]],
    on="playerID",
    how="left"
)

In [104]:
retired["inducted"] = retired["inducted"].map({"Y": 1, "N": 0})
retired["inducted"] = retired["inducted"].fillna(0)  # Not voted = 0

In [105]:
logistic_features = [
    "IP","W","SO","H","ERA_career","G","HR9","BB","WHIP",
    "GS","L","BB9","K9","HR","SV"
]

In [106]:
X_train_log = retired[logistic_features].replace([np.inf, -np.inf], np.nan).fillna(0)
y_train_log = retired["inducted"]

# Optional: scale features (helps with logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_log)

In [107]:
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
logreg.fit(X_train_scaled, y_train_log)


In [108]:
y_pred = logreg.predict(X_train_scaled)
y_prob = logreg.predict_proba(X_train_scaled)[:,1]

print(classification_report(y_train_log, y_pred))
print("ROC-AUC:", roc_auc_score(y_train_log, y_prob))

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.93     11575
         1.0       0.05      0.72      0.09       114

    accuracy                           0.86     11689
   macro avg       0.52      0.79      0.51     11689
weighted avg       0.99      0.86      0.92     11689

ROC-AUC: 0.8949676025917925


In [110]:
X_active_log = active[logistic_features].replace([np.inf, -np.inf], np.nan).fillna(0)
X_active_scaled = scaler.transform(X_active_log)

active["hof_prob_logistic"] = logreg.predict_proba(X_active_scaled)[:,1]

top_active_log = active.sort_values("hof_prob_logistic", ascending=False)
top_active_log[["nameGiven","nameLast","hof_prob_logistic"]].head(20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  active["hof_prob_logistic"] = logreg.predict_proba(X_active_scaled)[:,1]


Unnamed: 0,nameGiven,nameLast,hof_prob_logistic
9935,Justin Brooks,Verlander,0.974737
8588,Maxwell Martin,Scherzer,0.949455
5044,Clayton Edward,Kershaw,0.939105
5081,Craig Michael,Kimbrel,0.93641
4700,Kenley Geronimo,Jansen,0.913598
1802,Gerrit Alan,Cole,0.813998
10488,Patrick Ian-Cashel,Wisdom,0.77349
8445,Christopher,Sale,0.75495
1622,Albertin Aroldis,Chapman,0.674997
9775,Justin Matthew,Turner,0.649934


In [111]:
top_active_log[top_active_log['nameLast'] =='Ohtani']

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,IPouts,IP,...,K9,BB9,HR9,K_pct,BB_pct,nameGiven,nameLast,debut,status,hof_prob_logistic
7143,ohtansh01,38,19,86,86,1,1,0,1445,481.666667,...,11.360554,3.232526,0.990311,0.311635,0.088672,Shohei,Ohtani,2018-03-29,active,0.21623
