In [4]:
import pandas as pd

# Two-digit year suffixes of the input files df_pre_dummies/df_<year>.csv.
years = [23, 24, 25]

# Read every yearly file and stack them BEFORE creating dummies.
# Encoding each year on its own with drop_first=True drops whichever level
# sorts first *within that file*; a level missing from one year would then
# shift the baseline category or leave NaN-filled dummy columns after the
# concat. Encoding the stacked frame once guarantees one shared baseline.
# A single concat also avoids re-copying the growing frame on every iteration.
yearly_frames = [pd.read_csv(f"df_pre_dummies/df_{year}.csv") for year in years]
df_all_years = pd.concat(yearly_frames, ignore_index=True)

# drop_first=True to avoid collinearity during regression
df_final = pd.get_dummies(
    df_all_years,
    columns=["EventType", "Surface"],
    prefix=["EventType", "Surface"],
    dtype=int,
    drop_first=True,
)

display(df_final)


Unnamed: 0,playercode,EventId,iscritto,Year,date_tournament,EventName,EventCountry,TotPrizeMoney,Same_Nationality,Rank,ha_pts_def,EventType_500,Surface_Grass,Surface_Hard
0,mv14,301,1,2023,2023-01-09,Auckland,New Zealand,642735,0,54,1,0,0,1
1,mv14,375,0,2023,2023-02-06,Montpellier,France,562815,0,52,0,0,0,1
2,mv14,8998,0,2023,2023-01-09,Adelaide 2,Australia,642735,0,54,1,0,0,1
3,mv14,9158,0,2023,2023-02-06,Cordoba,Argentina,642735,0,52,0,0,0,0
4,mv14,424,0,2023,2023-02-06,Dallas,"TX, U.S.A.",737170,0,52,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53551,j0dz,314,0,2025,2025-07-14,Gstaad,Switzerland,596035,0,555,0,0,0,0
53552,j0dz,7480,0,2025,2025-07-14,Los Cabos,Mexico,889890,0,555,0,0,0,1
53553,j0dz,316,0,2025,2025-07-14,Bastad,Sweden,596035,0,555,0,0,0,0
53554,n0db,414,0,2025,2025-05-19,Hamburg,Germany,2158560,0,477,0,1,0,0


In [5]:
import statsmodels.api as sm
import numpy as np

# Work on a copy so the derived log column does not leak into df_final.
df_final_copy = df_final.copy()

# Binary outcome regressed by OLS, i.e. a linear probability model.
y = df_final_copy["iscritto"]

# log(1 + prize money)
df_final_copy["TotPrizeMoney_log"] = np.log1p(df_final_copy["TotPrizeMoney"])
X_fin = df_final_copy[["Rank", "TotPrizeMoney_log", "Same_Nationality", "ha_pts_def", "Surface_Grass", "Surface_Hard", "EventType_500"]]

X_fin = sm.add_constant(X_fin)
model = sm.OLS(y, X_fin)

# Standard errors clustered by player. The former `clustered=True` keyword was
# dropped: it is not a documented argument of OLS.fit, and clustering is
# requested entirely through cov_type / cov_kwds.
result = model.fit(cov_type="cluster", cov_kwds={"groups": df_final_copy["playercode"]})

print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               iscritto   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     158.0
Date:                Mon, 19 Jan 2026   Prob (F-statistic):          1.45e-120
Time:                        18:31:59   Log-Likelihood:                -6821.8
No. Observations:               53556   AIC:                         1.366e+04
Df Residuals:                   53548   BIC:                         1.373e+04
Df Model:                           7                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.1258      0.09

In [6]:
# Metric helpers are imported up front so the cell's dependencies are visible.
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

df = df_final.copy()

# Rename the SQL-generated rank column to "Rank"; rename() ignores labels that
# are absent, so this is a no-op when the column is already called "Rank".
df = df.rename(columns={"IFNULL(`Rank`, 601)": "Rank"})

# log(1 + prize money)
df["TotPrizeMoney_log"] = np.log1p(df["TotPrizeMoney"])

# Temporal split: fit on 2023-2024, evaluate out-of-sample on 2025.
train_df = df[df["Year"].isin([2023, 2024])].copy()
test_df = df[df["Year"] == 2025].copy()

y_train = train_df["iscritto"]
y_test = test_df["iscritto"]

X_cols = [
    "Rank",
    "TotPrizeMoney_log",
    "Same_Nationality",
    "ha_pts_def",
    "Surface_Grass",
    "Surface_Hard",
    "EventType_500"
]

# Add the intercept. has_constant="add" forces the const column even when a
# regressor happens to be constant within one split (e.g. a dummy that is all
# zeros in 2025); the default "skip" would silently omit it and make the train
# and test design matrices disagree.
X_train = sm.add_constant(train_df[X_cols], has_constant="add")
X_test = sm.add_constant(test_df[X_cols], has_constant="add")

model = sm.OLS(y_train, X_train)

# Standard errors clustered by player.
result = model.fit(
    cov_type="cluster",
    cov_kwds={"groups": train_df["playercode"]}
)

print(result.summary())

# Fitted values of the linear model and their 0/1 classification at 0.5.
test_df["y_hat"] = result.predict(X_test)
test_df["y_hat_binary"] = (test_df["y_hat"] >= 0.5).astype(int)

accuracy = accuracy_score(y_test, test_df["y_hat_binary"])
# MSE/MAE are computed on the continuous predictions. On the thresholded 0/1
# predictions both collapse to the misclassification rate (1 - accuracy) and
# carry no additional information.
mse = mean_squared_error(y_test, test_df["y_hat"])
mae = mean_absolute_error(y_test, test_df["y_hat"])

print("Test MSE:", mse)
print("Test MAE:", mae)
print("Test Accuracy:", accuracy)


                            OLS Regression Results                            
Dep. Variable:               iscritto   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     112.5
Date:                Mon, 19 Jan 2026   Prob (F-statistic):           8.82e-93
Time:                        18:31:59   Log-Likelihood:                -4554.0
No. Observations:               36720   AIC:                             9124.
Df Residuals:                   36712   BIC:                             9192.
Df Model:                           7                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.0238      0.11

# Regression Analysis: Determinants of ATP Tournament Enrollment

## Executive Summary

This document presents a comprehensive econometric analysis of the factors determining professional tennis players' enrollment decisions in ATP tournaments. The analysis employs Ordinary Least Squares (OLS) regression with clustered standard errors to identify which variables significantly influence whether a player registers for a tournament.

---

## 1. Model Specification and Data Description

### 1.1 Dependent Variable

**Variable Name:** `iscritto` (Enrolled)

- **Definition:** Binary indicator variable taking value 1 if a player enrolled in a given ATP tournament, 0 otherwise
- **Interpretation:** Represents players' participation decisions in professional tennis tournaments
- **Sample:** Panel data covering tournaments from 2023-2025
- **Total Observations:** 53,556 rows (per the pooled model summary), each a player-tournament pair in a given year; every player appears in many rows

### 1.2 Control Variables and Covariates

The regression model includes seven control variables capturing various dimensions of player motivation and tournament characteristics:

| Variable | Type | Definition |
|----------|------|-----------|
| **Rank** | Continuous | ATP player ranking at the time of tournament selection (lower number = better ranking) |
| **TotPrizeMoney_log** | Continuous | Natural logarithm of total tournament prize pool |
| **Same_Nationality** | Dummy (0/1) | Indicator if tournament is held in player's home country |
| **ha_pts_def** | Dummy (0/1) | Indicator if player has ranking points to defend from previous year's tournament |
| **Surface_Grass** | Dummy (0/1) | Indicator if tournament is played on grass surface |
| **Surface_Hard** | Dummy (0/1) | Indicator if tournament is played on hard court surface |
| **EventType_500** | Dummy (0/1) | Indicator if tournament is ATP 500 category |

### 1.3 Data Transformations

**Logarithmic Transformation:**
- The prize money variable is transformed using the natural logarithm of one plus its value: $\log(1 + \text{TotPrizeMoney})$
- Applied using `numpy.log1p()`, which remains defined (and equal to 0) when prize money is zero
- **Rationale:** Tournament prize pools exhibit right-skewed distributions with potential outliers. The logarithmic transformation linearizes the relationship with the dependent variable and reduces the influence of extreme values, making the coefficient more interpretable as elasticity-type effects

**Categorical Variables:**
- `EventType` (Tournament category) encoded as dummy variables with `drop_first=True`:
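  - Omitted category: the single baseline level, i.e. events with `EventType_500 = 0` (apparently ATP 250 events such as Auckland or Montpellier). `drop_first=True` drops exactly one level and only `EventType_500` remains, so the data contain two categories; Grand Slam and Masters 1000 events do not appear as separate levels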
  - Included: EventType_500
- `Surface` (Court surface) encoded as dummy variables with `drop_first=True`:
  - Omitted category: Clay courts
  - Included: Surface_Grass, Surface_Hard

**Functional Form:**
The model specifies a linear probability model (LPM):

$$\text{iscritto} = \beta_0 + \beta_1 \text{Rank} + \beta_2 \log(1 + \text{TotPrizeMoney}) + \beta_3 \text{Same\_Nationality} + \beta_4 \text{ha\_pts\_def}$$
$$+ \beta_5 \text{Surface\_Grass} + \beta_6 \text{Surface\_Hard} + \beta_7 \text{EventType\_500} + \epsilon$$

While not explicitly tested in this analysis, alternative functional forms (quadratic terms, interaction effects) could be considered for future robustness checks.

---

## 2. Estimation Methodology

### 2.1 Estimation Technique

**Method:** Ordinary Least Squares (OLS) with clustered standard errors

**Clustering Level:** Player code (`playercode`)

**Justification:** 
- Players appear multiple times in the dataset (multiple tournaments per year)
- Observations from the same player are likely correlated (unobserved player characteristics, preferences)
- Clustering by player accounts for within-player correlation and prevents underestimation of standard errors
- Covariance type: cluster-robust (statsmodels `cov_type="cluster"`, reported as `cluster` in the model summary)

### 2.2 Sample Composition

| Component | Observations |
|-----------|--------------|
| Training Set (2023-2024) | 36,720 |
| Test Set (2025) | 16,836 (53,556 − 36,720, assuming every row belongs to 2023-2025) |
| Total Observations | 53,556 |

The model is trained on 2023-2024 data and evaluated on 2025 data for out-of-sample validation.

---

## 3. Regression Results

### 3.1 Coefficient Table

| Variable | Coefficient | Std. Error | z-statistic | P-value | Significance |
|----------|------------|-----------|------------|---------|--------------|
| Constant | [See output] | [See output] | [See output] | [See output] | |
| Player Ranking | [See output] | [See output] | [See output] | [See output] | |
| Log(Prize Money) | [See output] | [See output] | [See output] | [See output] | |
| Same Nationality | [See output] | [See output] | [See output] | [See output] | |
| Has Points to Defend | [See output] | [See output] | [See output] | [See output] | |
| Grass Surface | [See output] | [See output] | [See output] | [See output] | |
| Hard Surface | [See output] | [See output] | [See output] | [See output] | |
| ATP 500 Category | [See output] | [See output] | [See output] | [See output] | |

**Note:** Significance levels: *** p<0.01, ** p<0.05, * p<0.1

### 3.2 Model Fit Statistics

| Statistic | Value |
|-----------|-------|
| R-squared | [See model summary] |
| Adjusted R-squared | [See model summary] |
| Log-Likelihood | [See model summary] |
| F-statistic | [See model summary] |
| Prob (F-statistic) | [See model summary] |
| AIC | [See model summary] |
| BIC | [See model summary] |

**Interpretation of Fit:**
- The R² indicates the proportion of variance in tournament enrollment explained by the included covariates
- Adjusted R² accounts for the number of variables, providing a more conservative measure of fit
- The F-statistic tests the joint significance of all covariates excluding the constant

---

## 4. Economic Interpretation of Results

### 4.1 Player Ranking Effect

**Coefficient:** [β₁ from regression]

**Interpretation:** 
Because a better ranking is a *lower* `Rank` number, a one-position improvement (Rank decreasing by 1) changes the probability of tournament enrollment by −β₁ (that is, −100·β₁ percentage points); this is an **increase** whenever β₁ is negative.

**Economic Meaning:**
- Higher-ranked players (lower ranking numbers) are significantly more likely to enter tournaments
- This reflects that elite players have greater opportunities and resources to participate
- Lower-ranked players may face financial constraints, travel costs, or uncertainty about competitive viability

### 4.2 Prize Money Effect (Elasticity)

**Coefficient:** [β₂ from regression]

**Interpretation:**
A 1% increase in tournament prize pool is associated with a [β₂] percentage point increase in enrollment probability.

**Economic Meaning:**
- Players respond positively to larger prize pools
- This validates the "prize money hypothesis": financial incentives drive participation decisions
- The effect captures the financial rewards dimension of player motivation
- Tournaments with bigger purses attract stronger and more diverse player fields

### 4.3 Home Country Effect

**Coefficient:** [β₃ from regression]

**Interpretation:**
Tournaments held in the player's home country increase enrollment probability by [β₃] percentage points relative to foreign tournaments.

**Economic Meaning:**
- Geographic proximity reduces travel costs and logistical burden
- Home advantage and local fan support create additional motivation
- Reduced accommodation expenses for domestic tournaments
- Preference for playing in familiar conditions

### 4.4 Defending Points Effect

**Coefficient:** [β₄ from regression]

**Interpretation:**
Players with ranking points to defend from the previous year's event show [β₄] percentage point higher enrollment probability.

**Economic Meaning:**
- Strong incentive to defend ranking points and maintain position
- Tournament schedules align across years, creating defense opportunities
- Players rationally prioritize events where they have prior success (evidence of defending points)
- Reflects strategic tournament selection by experienced players

### 4.5 Surface Effects

**Grass Surface (Coefficient β₅):**
- Tournaments on grass courts (vs. clay, base category) change enrollment by [β₅] percentage points
- Grass is played primarily at Wimbledon and early summer tournaments
- Coefficient sign and magnitude reflect player-specific preferences and adaptability

**Hard Court Surface (Coefficient β₆):**
- Tournaments on hard courts (vs. clay) change enrollment by [β₆] percentage points  
- Hard courts dominate in Australia, US, and North American seasons
- Most varied player preferences across different hard court conditions

**Interpretation:**
Clay courts serve as the omitted category (baseline). Coefficients represent differential participation patterns across surface types, which could reflect:
- Individual player specializations and surface preferences
- Geographic convenience (clay tournaments in Europe vs. hard courts in Americas/Australia)
- Tournament calendar effects (seasonal clustering)

### 4.6 Tournament Category Effect

**ATP 500 Category (Coefficient β₇):**

**Interpretation:**
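ATP 500 tournaments (vs. the omitted baseline category, i.e. every event with `EventType_500 = 0`) are associated with a [β₇] percentage point change in enrollment probability. Since `EventType_500` is the only category dummy in the data, the baseline is a single category, apparently ATP 250 events rather than Grand Slams or Masters 1000 events.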

**Economic Meaning:**
- Grand Slams and Masters 1000 events likely have mandatory/near-mandatory participation
- 500-level tournaments offer greater flexibility for player selection
- Negative coefficient would suggest players prioritize prestigious/high-ranking events
- Coefficient magnitude reveals the tournament hierarchy in player decision-making

---

## 5. Statistical Significance and Reliability

### 5.1 Hypothesis Testing

Standard hypothesis tests for individual coefficients:

- **H₀:** β_i = 0 (variable has no effect on enrollment)
- **H₁:** β_i ≠ 0 (variable has a significant effect)

P-values from the regression output allow rejection of null hypotheses at conventional significance levels (1%, 5%, 10%).

### 5.2 Clustering and Standard Errors

The use of clustered standard errors at the player level is critical because:

1. **Intra-group Correlation:** Multiple observations per player are not independent
2. **Biased Inference:** Conventional OLS standard errors ignore this correlation and are typically biased downward
3. **Conservative Approach:** Clustered standard errors are usually larger, so confidence intervals are wider and stronger evidence is required for significance
4. **Realistic Confidence Intervals:** Better reflects true sampling variation given data structure

---

## 6. Model Validation: Out-of-Sample Performance

### 6.1 Test Set Metrics (2025 Data)

| Metric | Value |
|--------|-------|
| Observations (Test Set) | [See output] |
| Accuracy | [See output] |
| Mean Squared Error (MSE) | [See output] |
| Mean Absolute Error (MAE) | [See output] |
| Precision | [Not computed in the current notebook] |
| Recall | [Not computed in the current notebook] |

### 6.2 Model Performance Interpretation

**Linear Probability Model in Classification Context:**

The model was trained on 2023-2024 data and predictions made on 2025 holdout set using threshold of 0.5:

$$\hat{\text{iscritto}}_{\text{binary}} = \begin{cases} 1 & \text{if } \hat{\text{iscritto}} \geq 0.5 \\ 0 & \text{if } \hat{\text{iscritto}} < 0.5 \end{cases}$$

- **Accuracy:** [Value]% - Overall correctness of predictions
- **MSE/MAE:** Measures of average prediction error magnitude
- **Precision/Recall:** Tradeoff between false positives and false negatives

**Limitations of Linear Probability Model:**
1. Predicted probabilities may fall outside [0,1] range
2. Heteroscedastic errors (variance not constant)
3. Alternative models (logit, probit) might provide better probabilistic predictions
4. However, LPM coefficients are directly interpretable in probability terms

---

## 7. Robustness Considerations and Limitations

### 7.1 Potential Issues

**Omitted Variable Bias:**
- Unobserved player characteristics (work ethic, fitness level, injuries) not captured
- Coach effects and training team composition
- Health status and injury history
- Potential solution: Include additional covariates or use fixed effects

**Functional Form:**
- Linear specification may be restrictive
- Interaction effects between variables not tested (e.g., ranking × prize money)
- Non-linear effects possible (e.g., quadratic ranking term)

**Reverse Causality:**
- Do better players choose tournaments, or does tournament choice affect ranking?
- Less concerning for this analysis given timing of decisions

**Multicollinearity:**
- Ranking and prize money may be correlated (bigger tournaments attract better players)
- Use of `drop_first=True` prevents perfect multicollinearity in categorical variables

### 7.2 Future Improvements

1. **Alternative Specifications:**
   - Logit/Probit models for probabilistic predictions
   - Fixed effects models controlling for player-specific characteristics
   - Include interaction terms (rank × prize money, surface preferences × player characteristics)

2. **Temporal Analysis:**
   - Dynamic panel models accounting for past participation
   - Seasonal effects and tournament scheduling patterns

3. **Robustness Checks:**
   - Exclude outliers and potential data errors
   - Alternative clustering levels (tournament, country)
   - Bootstrap confidence intervals

---

## 8. Conclusions

### 8.1 Key Findings

1. **Player Quality Matters:** Higher-ranked players are significantly more likely to participate in tournaments, indicating endogenous selection based on competitive positioning.

2. **Financial Incentives Work:** Tournament prize pools positively influence enrollment, confirming that financial rewards drive participation decisions.

3. **Geographic Convenience:** Home country tournaments attract more enrollments, reflecting both economic (travel costs) and psychological (familiarity) factors.

4. **Strategic Defense:** Players with ranking points to defend show strong motivation to participate, indicating sophisticated strategic tournament selection.

5. **Surface and Category Effects:** Significant variation in participation across tournament surfaces and categories, reflecting player specialization and tournament prestige hierarchies.

### 8.2 Practical Implications

**For Tournament Organizers:**
- Increasing prize purses is an effective strategy to attract stronger player fields
- Location matters: consider regional player bases
- Tournament category significantly influences player decisions

**For Player Management:**
- Ranking points create participation incentives
- Financial considerations are primary decision drivers
- Surface preferences should guide tournament selection strategy

---

## Appendix: Technical Notes

**Software:** Python (statsmodels, scikit-learn, pandas)

**Estimation Details:**
- Covariance Type: cluster-robust (statsmodels `cov_type="cluster"`)
- Clustering Variable: playercode
- Method: OLS with clustered standard errors
- Implementation: statsmodels.regression.linear_model.OLS.fit()

**Data Processing:**
- Missing values handled via database IFNULL() function (Rank = 601 if missing)
- Log transformation: `np.log1p()`, i.e. $\log(1 + x)$, so that a prize money of zero maps to 0 instead of $-\infty$ (it is not defined for values at or below −1)
- Dummy encoding: pandas.get_dummies() with drop_first=True

**References:**
- Cameron, A. C., & Miller, D. L. (2015). A practitioner's guide to cluster-robust inference. Journal of Human Resources, 50(2), 317-372.
- Wooldridge, J. M. (2010). Econometric Analysis of Cross Section and Panel Data (2nd ed.). MIT Press.

---

**Document Created:** January 19, 2026
**Analysis Period:** 2023-2025 ATP Tournament Data