# UVA men's soccer project

In [1]:
import pandas as pd
import numpy as np
import sqlite3
from future_encoders import ColumnTransformer, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR,SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

np.random.seed(42)

In [2]:
#LOAD THE DATA
with sqlite3.connect('database.sqlite') as con:
    euSoccer = pd.read_sql_query("SELECT * FROM Player_Attributes",con=con)

### 1. Data Discovery, Cleaning, and Scaling

In [3]:
#drop the missing values and control the data set to be around 80,000 entries
euSoccer = euSoccer.dropna()

drop_indices = np.random.choice(euSoccer.index, 150000, replace=False)
euSoccer = euSoccer.drop(drop_indices)

#drop strange values within certain features
euSoccer_clean = euSoccer[euSoccer['attacking_work_rate'].isin(['medium','high','low'])]
euSoccerSub = euSoccer_clean[euSoccer_clean['defensive_work_rate'].isin(['medium','high','low'])]

In [4]:
#drop unrelated features and split out the y(rating)
#also drop the features that is gk
euSoccer_x = euSoccerSub.drop(["id","player_fifa_api_id","player_api_id","date","overall_rating","gk_diving","gk_handling", "gk_kicking", "gk_positioning", "gk_reflexes"],axis = 1)
euSoccer_y = euSoccerSub["overall_rating"].copy()

euSoccer_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29660 entries, 5 to 183973
Data columns (total 32 columns):
potential              29660 non-null float64
preferred_foot         29660 non-null object
attacking_work_rate    29660 non-null object
defensive_work_rate    29660 non-null object
crossing               29660 non-null float64
finishing              29660 non-null float64
heading_accuracy       29660 non-null float64
short_passing          29660 non-null float64
volleys                29660 non-null float64
dribbling              29660 non-null float64
curve                  29660 non-null float64
free_kick_accuracy     29660 non-null float64
long_passing           29660 non-null float64
ball_control           29660 non-null float64
acceleration           29660 non-null float64
sprint_speed           29660 non-null float64
agility                29660 non-null float64
reactions              29660 non-null float64
balance                29660 non-null float64
shot_power         

In [5]:
#check correlations
corr_matrix = euSoccerSub.corr()
corr_matrix["overall_rating"].sort_values(ascending=False)

overall_rating        1.000000
reactions             0.776383
potential             0.765024
short_passing         0.443724
ball_control          0.429318
vision                0.424666
long_passing          0.423216
shot_power            0.419368
penalties             0.385166
long_shots            0.381752
positioning           0.358530
volleys               0.353619
curve                 0.343878
crossing              0.342461
dribbling             0.342265
free_kick_accuracy    0.339648
finishing             0.321128
aggression            0.321013
strength              0.318978
stamina               0.316200
heading_accuracy      0.304755
jumping               0.261050
sprint_speed          0.251024
interceptions         0.248915
acceleration          0.240066
agility               0.230660
standing_tackle       0.163822
balance               0.153108
marking               0.135117
sliding_tackle        0.132117
gk_kicking            0.036335
gk_diving             0.036253
gk_posit

In [6]:
#split out the train set and test set
train_x, test_x, train_y, test_y = train_test_split(euSoccer_x, euSoccer_y, test_size=0.2, random_state=42)

In [7]:
#pipeline setup for train set
cat_attribs = ['preferred_foot','attacking_work_rate','defensive_work_rate']
euSoccer_num = euSoccer_x.drop(['preferred_foot','attacking_work_rate','defensive_work_rate'],axis = 1)
num_attribs = list(euSoccer_num)

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

train_x_prepared = full_pipeline.fit_transform(train_x)

In [8]:
#scale test set for numerical variables
scaler = StandardScaler()
scaler.fit_transform(train_x.drop(['preferred_foot','attacking_work_rate','defensive_work_rate'],axis = 1))

test_num = test_x.drop(['preferred_foot','attacking_work_rate','defensive_work_rate'],axis = 1)

mean = scaler.mean_
var = scaler.var_

test_num_prepared = (test_num - mean) / var

test_num_array = test_num_prepared.values

#oneHotEncode test set for catrgorical variables
test_cat = test_x[['preferred_foot','attacking_work_rate','defensive_work_rate']]

cat_encoder = OneHotEncoder()
test_cat_1hot = cat_encoder.fit_transform(test_cat)
test_cat_array = test_cat_1hot.toarray()

test_x_prepared = np.concatenate((test_num_array,test_cat_array),axis=1)

### 2. Fit Models and Error Analysis

##### Linear Regression

In [9]:
lreg=LinearRegression()
lreg.fit(train_x_prepared, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
#error analysis
pred_L_y=lreg.predict(train_x_prepared)
print("The R^2 of Linear Regression is:",round(r2_score(train_y, pred_L_y),4))

pred_L_rmse_y=lreg.predict(test_x_prepared)
rmse_lreg = np.sqrt(mean_squared_error(test_y, pred_L_rmse_y))
print("The RMSE of Linear Regression is:",round(rmse_lreg,4))

The R^2 of Linear Regression is: 0.7932
The RMSE of Linear Regression is: 6.343


##### Support Vector Regression (linear)

In [11]:
svrL=LinearSVR(epsilon=2.5,random_state=42)
svrL.fit(train_x_prepared, train_y)

LinearSVR(C=1.0, dual=True, epsilon=2.5, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=42, tol=0.0001, verbose=0)

In [12]:
#error analysis
print("The R^2 of SVR(Linear) is:",round(svrL.score(train_x_prepared, train_y),4))

pred_SL_y=svrL.predict(test_x_prepared)
rmse_svr = np.sqrt(mean_squared_error(test_y, pred_SL_y))
print("The RMSE of SVR(Linear) is:",round(rmse_svr,4))

The R^2 of SVR(Linear) is: 0.7928
The RMSE of SVR(Linear) is: 6.3425


##### Support Vector Regression (poly kernel)

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42)
rnd_search_cv.fit(train_x_prepared, train_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=4.745401188473625, gamma=0.07969454818643928 ..................
[CV] ... C=4.745401188473625, gamma=0.07969454818643928, total= 1.6min
[CV] C=4.745401188473625, gamma=0.07969454818643928 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min remaining:    0.0s


[CV] ... C=4.745401188473625, gamma=0.07969454818643928, total= 1.7min
[CV] C=4.745401188473625, gamma=0.07969454818643928 ..................
[CV] ... C=4.745401188473625, gamma=0.07969454818643928, total= 1.5min
[CV] C=8.31993941811405, gamma=0.015751320499779724 ..................
[CV] ... C=8.31993941811405, gamma=0.015751320499779724, total=  25.9s
[CV] C=8.31993941811405, gamma=0.015751320499779724 ..................
[CV] ... C=8.31993941811405, gamma=0.015751320499779724, total=  26.0s
[CV] C=8.31993941811405, gamma=0.015751320499779724 ..................
[CV] ... C=8.31993941811405, gamma=0.015751320499779724, total=  25.9s
[CV] C=2.560186404424365, gamma=0.002051110418843397 .................
[CV] .. C=2.560186404424365, gamma=0.002051110418843397, total=  21.6s
[CV] C=2.560186404424365, gamma=0.002051110418843397 .................
[CV] .. C=2.560186404424365, gamma=0.002051110418843397, total=  21.7s
[CV] C=2.560186404424365, gamma=0.002051110418843397 .................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 21.6min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a286c7400>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a286c7748>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [17]:
rnd_search_cv.best_estimator_

SVR(C=7.011150117432088, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.026070247583707663, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
pred_rbf_y = rnd_search_cv.best_estimator_.predict(test_x_prepared)
rmse_svrr = np.sqrt(mean_squared_error(test_y, pred_rbf_y))
print("The RMSE of SVR(rbf) is:",round(rmse_svrr,4))

The RMSE of SVR(rbf) is: 10.5035


In [21]:
param_distributions_poly = {"kernel":'poly',"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10), "degree":[2,3]}
rnd_search_cv_poly = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42)
rnd_search_cv_poly.fit(train_x_prepared, train_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=4.745401188473625, gamma=0.07969454818643928 ..................
[CV] ... C=4.745401188473625, gamma=0.07969454818643928, total= 1.6min
[CV] C=4.745401188473625, gamma=0.07969454818643928 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min remaining:    0.0s


[CV] ... C=4.745401188473625, gamma=0.07969454818643928, total= 1.6min
[CV] C=4.745401188473625, gamma=0.07969454818643928 ..................
[CV] ... C=4.745401188473625, gamma=0.07969454818643928, total= 1.7min
[CV] C=8.31993941811405, gamma=0.015751320499779724 ..................
[CV] ... C=8.31993941811405, gamma=0.015751320499779724, total=  27.5s
[CV] C=8.31993941811405, gamma=0.015751320499779724 ..................
[CV] ... C=8.31993941811405, gamma=0.015751320499779724, total=  31.0s
[CV] C=8.31993941811405, gamma=0.015751320499779724 ..................
[CV] ... C=8.31993941811405, gamma=0.015751320499779724, total=  27.1s
[CV] C=2.560186404424365, gamma=0.002051110418843397 .................
[CV] .. C=2.560186404424365, gamma=0.002051110418843397, total=  34.1s
[CV] C=2.560186404424365, gamma=0.002051110418843397 .................
[CV] .. C=2.560186404424365, gamma=0.002051110418843397, total=  22.4s
[CV] C=2.560186404424365, gamma=0.002051110418843397 .................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 22.8min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a286c7400>, 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a286c7748>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [22]:
rnd_search_cv_poly.best_estimator_

SVR(C=7.011150117432088, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.026070247583707663, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
#next steps:
#1. fit SVR with other kernels (using gridsearch to tuning hyper and kernel tricks)
#2. explore feature selection for more accurate result
#3. Find data with UVA men's soccer team to test our model