In [128]:
# load packages


import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [91]:
# read in data


model_df = pd.read_csv("cleaned_atp_data.csv", index_col=0)
model_df.head()

Unnamed: 0,best_of,grand_slam,surface,seed_diff,rank_points_diff,binned_age,binned_opp_age,avg_fantasy_pts,avg_ace,avg_df,avg_svpt,avg_FirstIn,avg_FirstWon,avg_SecondWon,avg_SvGms,avg_bpSaved,avg_bpFaced,h2h_fantasy_pts,fantasy_pts
0,3,0,Hard,49.0,-2689.0,prime,prime,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,16.290966,12.423053,4.176324,6.722741,14.412773,25.0
1,3,0,Hard,0.0,250.0,prime,prime,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,16.290966,12.423053,4.176324,6.722741,14.412773,21.5
2,3,0,Hard,0.0,547.0,prime,prime,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,16.290966,12.423053,4.176324,6.722741,14.412773,22.0
3,3,0,Hard,43.0,-654.0,prime,prime,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,16.290966,12.423053,4.176324,6.722741,14.412773,15.0
4,3,0,Hard,-46.0,378.0,prime,prime,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,16.290966,12.423053,4.176324,6.722741,14.412773,14.5


In [92]:
# dummy variables


# saving numeric variables
num_vars = ['best_of', 'grand_slam', # tourney info
       'seed_diff', 'rank_points_diff', # player info
       'avg_fantasy_pts', 'avg_ace', 'avg_df', 'avg_svpt', 'avg_FirstIn', 'avg_FirstWon', 'avg_SecondWon', 'avg_SvGms', 'avg_bpSaved', 'avg_bpFaced', # stat avgs
       'h2h_fantasy_pts'] # h2h avgs

# creating dummy variables for surface, player handedness, opponent handedness
model_df = pd.get_dummies(data=model_df, columns=['surface', 'binned_age', 'binned_opp_age'])

model_df.head()

Unnamed: 0,best_of,grand_slam,seed_diff,rank_points_diff,avg_fantasy_pts,avg_ace,avg_df,avg_svpt,avg_FirstIn,avg_FirstWon,...,surface_Grass,surface_Hard,binned_age_old,binned_age_prime,binned_age_veterans,binned_age_young,binned_opp_age_old,binned_opp_age_prime,binned_opp_age_veterans,binned_opp_age_young
0,3,0,49.0,-2689.0,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,...,0,1,0,1,0,0,0,1,0,0
1,3,0,0.0,250.0,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,...,0,1,0,1,0,0,0,1,0,0
2,3,0,0.0,547.0,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,...,0,1,0,1,0,0,0,1,0,0
3,3,0,43.0,-654.0,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,...,0,1,0,1,0,0,0,1,0,0
4,3,0,-46.0,378.0,14.412773,5.368847,3.271028,80.410592,48.639875,34.669159,...,0,1,0,1,0,0,0,1,0,0


In [93]:
# creating train-test split


X = model_df.drop(['fantasy_pts'], axis = 1)
y = model_df['fantasy_pts']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [123]:
# simple linear regression


# fit and predict a linear model
lm = LinearRegression()
lm.fit(X_train, y_train)
y_train_pred = lm.predict(X_train)
y_pred = lm.predict(X_test)

# train MSE
print("Train R-Squared:", r2_score(y_train_pred, y_train))
print("Train MSE:", mean_squared_error(y_train_pred, y_train))
print()

# test MSE
print("Test R-Squared:", r2_score(y_pred, y_test))
print("Test MSE:", mean_squared_error(y_pred, y_test))

Train R-Squared: 0.08570303569650595
Train MSE: 61.08275157804766

Test R-Squared: 0.0895967112609708
Test MSE: 61.50857023246887


In [126]:
# lasso regression


# lasso with 5 fold cross-validation
model = LassoCV(cv=5, random_state=1, max_iter=10000)

# fit these models and record accuracy
model.fit(X_train, y_train)

# run lasso on the best alpha value
lasso_best = Lasso(alpha=model.alpha_)
lasso_best.fit(X_train, y_train)
y_train_pred = lasso_best.predict(X_train)
y_pred = lasso_best.predict(X_test)

# train MSE
print("Train R-Squared:", r2_score(y_train_pred, y_train))
print("Train MSE:", mean_squared_error(y_train_pred, y_train))
print()

# test MSE
print("Test R-Squared:", r2_score(y_pred, y_test))
print("Test MSE:", mean_squared_error(y_pred, y_test))

Train R-Squared: -0.4532447075487036
Train MSE: 65.70810653754728

Test R-Squared: -0.43710479050516504
Test MSE: 65.88960574049463


In [127]:
# pre-pruned decision tree regressor


# grid search for best parameters
parameters = {'max_depth': [2,4,6,8,10,12],
             'min_samples_split': [2,3,4],
             'min_samples_leaf': [1,2]}
tree = DecisionTreeRegressor()
gscv = GridSearchCV(estimator=tree,param_grid=parameters)
gscv.fit(X_train, y_train)

# fit model with the best parameters
model = gscv.best_estimator_
model.fit(X_train, y_train)

# train MSE
y_train_pred = model.predict(X_train)
print("Train R-Squared:", r2_score(y_train_pred, y_train))
print("Train MSE:", mean_squared_error(y_train_pred, y_train))
print()

# test MSE
y_pred = model.predict(X_test)
print("Test R-Squared:", r2_score(y_pred, y_test))
print("Test MSE:", mean_squared_error(y_pred, y_test))

Train R-Squared: 0.5758678477244839
Train MSE: 38.088294815687874

Test R-Squared: 0.5524108622026782
Test MSE: 40.49056198770195


In [132]:
# random forest regressor


# fit a random forest and predict
rfor = RandomForestRegressor()
rfor.fit(X_train, y_train)


# train MSE
y_train_pred = rfor.predict(X_train)
print("Train R-Squared:", r2_score(y_train_pred, y_train))
print("Train MSE:", mean_squared_error(y_train_pred, y_train))
print()

# test MSE
y_pred = rfor.predict(X_test)
print("Test R-Squared:", r2_score(y_pred, y_test))
print("Test MSE:", mean_squared_error(y_pred, y_test))

Train R-Squared: 0.9563690695870484
Train MSE: 4.690625109570441

Test R-Squared: 0.6651205851835698
Test MSE: 31.42408366408846
