In [1]:
import pandas as pd

years = [23, 24, 25]

# Build one pooled frame covering every season, then one-hot encode it.
#
# The encoding is done once on the pooled frame rather than once per year:
# with drop_first=True the dropped reference level is whichever category sorts
# first *in the frame being encoded*. Encoding year by year would therefore
# pick a different baseline (and emit different columns) for any season that
# is missing a category, and the subsequent concat would fill the gaps with
# NaN and turn the integer dummies into floats.
yearly_frames = [pd.read_csv(f"df_pre_dummies/df_{year}.csv") for year in years]

# A single concat over the list avoids repeatedly copying a growing frame and
# avoids concatenating onto an empty DataFrame.
df_all_years = pd.concat(yearly_frames, ignore_index=True)

# drop_first=True removes one level per categorical so the dummies are not
# perfectly collinear with the regression intercept.
df_final = pd.get_dummies(
    df_all_years,
    columns=["EventType", "Surface"],
    prefix=["EventType", "Surface"],
    dtype=int,
    drop_first=True,
)

display(df_final)


Unnamed: 0,playercode,EventId,iscritto,Year,date_tournament,EventName,EventCountry,TotPrizeMoney,Same_Nationality,Rank,ha_pts_def,EventType_500,Surface_Grass,Surface_Hard
0,mv14,301,1,2023,2023-01-09,Auckland,New Zealand,642735,0,54,1,0,0,1
1,mv14,375,0,2023,2023-02-06,Montpellier,France,562815,0,52,0,0,0,1
2,mv14,8998,0,2023,2023-01-09,Adelaide 2,Australia,642735,0,54,1,0,0,1
3,mv14,9158,0,2023,2023-02-06,Cordoba,Argentina,642735,0,52,0,0,0,0
4,mv14,424,0,2023,2023-02-06,Dallas,"TX, U.S.A.",737170,0,52,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53551,j0dz,314,0,2025,2025-07-14,Gstaad,Switzerland,596035,0,555,0,0,0,0
53552,j0dz,7480,0,2025,2025-07-14,Los Cabos,Mexico,889890,0,555,0,0,0,1
53553,j0dz,316,0,2025,2025-07-14,Bastad,Sweden,596035,0,555,0,0,0,0
53554,n0db,414,0,2025,2025-05-19,Hamburg,Germany,2158560,0,477,0,1,0,0


In [2]:
import statsmodels.api as sm
import numpy as np

# Pooled linear probability model over all seasons.
# Work on a copy so the derived column below is not written into df_final.
df_final_copy = df_final.copy()

# Dependent variable: the 0/1 "iscritto" column
# (presumably whether the player entered the event - TODO confirm).
y = df_final_copy["iscritto"]

# Prize money enters in logs; log1p keeps a zero purse finite.
df_final_copy["TotPrizeMoney_log"] = np.log1p(df_final_copy["TotPrizeMoney"])

# Regressors. Rank is used in levels (no log transform is applied to it).
X_fin = df_final_copy[
    [
        "Rank",
        "TotPrizeMoney_log",
        "Same_Nationality",
        "ha_pts_def",
        "Surface_Grass",
        "Surface_Hard",
        "EventType_500",
    ]
]

# statsmodels' OLS does not add an intercept on its own.
X_fin = sm.add_constant(X_fin)
model = sm.OLS(y, X_fin)

# Standard errors are clustered by player, since each player contributes many
# rows. The clustering is requested through cov_type / cov_kwds only; `fit`
# has no `clustered` parameter, so that stray keyword has been removed.
result = model.fit(
    cov_type="cluster",
    cov_kwds={"groups": df_final_copy["playercode"]},
)

print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               iscritto   R-squared:                       0.135
Model:                            OLS   Adj. R-squared:                  0.135
Method:                 Least Squares   F-statistic:                     9020.
Date:                dom, 18 gen 2026   Prob (F-statistic):               0.00
Time:                        20:00:33   Log-Likelihood:                -5538.6
No. Observations:               53556   AIC:                         1.109e+04
Df Residuals:                   53548   BIC:                         1.116e+04
Df Model:                           7                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.1733      0.09

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Out-of-sample check: fit on the 2023-2024 seasons, evaluate on 2025.
df = df_final.copy()

# Rename the rank column if it still carries the raw SQL expression as its
# header. This is a no-op when the column is already called "Rank".
df = df.rename(columns={"IFNULL(`Rank`, 601)": "Rank"})

# Log prize money; log1p keeps a zero purse finite.
df["TotPrizeMoney_log"] = np.log1p(df["TotPrizeMoney"])

# Split by season so the test year is never seen during fitting.
train_df = df[df["Year"].isin([2023, 2024])].copy()
test_df  = df[df["Year"] == 2025].copy()

y_train = train_df["iscritto"]
y_test  = test_df["iscritto"]

X_cols = [
    "Rank",
    "TotPrizeMoney_log",
    "Same_Nationality",
    "ha_pts_def",
    "Surface_Grass",
    "Surface_Hard",
    "EventType_500"
]

X_train = train_df[X_cols]
X_test  = test_df[X_cols]

# Add the intercept column. has_constant="add" forces the column to be added
# even if some feature happens to have zero variance within a subset; the
# default ("skip") would silently omit it in that case and leave the train and
# test design matrices with different columns.
X_train = sm.add_constant(X_train, has_constant="add")
X_test  = sm.add_constant(X_test, has_constant="add")

model = sm.OLS(y_train, X_train)

# Standard errors clustered by player (many rows per player).
result = model.fit(
    cov_type="cluster",
    cov_kwds={"groups": train_df["playercode"]}
)

print(result.summary())

# Continuous fitted value and its 0/1 classification at a 0.5 cut-off.
test_df["y_hat"] = result.predict(X_test)
test_df["y_hat_binary"] = (test_df["y_hat"] >= 0.5).astype(int)

# Accuracy is scored on the thresholded prediction. MSE and MAE are scored on
# the continuous prediction: on 0/1 labels versus 0/1 predictions both would
# collapse to (1 - accuracy) and add no information.
accuracy = accuracy_score(y_test, test_df["y_hat_binary"])
mse = mean_squared_error(y_test, test_df["y_hat"])
mae = mean_absolute_error(y_test, test_df["y_hat"])

print("Test MSE:", mse)
print("Test MAE:", mae)
print("Test Accuracy:", accuracy)


                            OLS Regression Results                            
Dep. Variable:               iscritto   R-squared:                       0.139
Model:                            OLS   Adj. R-squared:                  0.139
Method:                 Least Squares   F-statistic:                     6096.
Date:                dom, 18 gen 2026   Prob (F-statistic):               0.00
Time:                        20:01:01   Log-Likelihood:                -3685.7
No. Observations:               36720   AIC:                             7387.
Df Residuals:                   36712   BIC:                             7455.
Df Model:                           7                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.0889      0.11