In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import validation_curve
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Data Cleaning

In [2]:
# import the data, read, examine it
soccer=pd.read_csv("Edited_European_Rosters-2.csv")
soccer.head()

Unnamed: 0,FullName,PlayerName,Affiliation,Champions League,League,League Nation,Jersey,Birth Date,Age,Height (meters),...,Continent,Games Played,2018/19 Games Played,Market Value (Euros),Accumulated Transfer Sums (Euros),Highest Market Value (Euros),Highest Market Value Date,Years Since Peak,NationalTeamCaps,MostRecentInjury
0,Kylian Sanmi Mbappé Lottin,Kylian Mbappe,Paris SG,Yes,Ligue 1,France,#7,12/20/1998,20,1.78,...,UEFA,5,29,200000000.0,145000000,200000000.0,12/17/2018,0,33,
1,Neymar da Silva Santos Júnior,Neymar,Paris SG,Yes,Ligue 1,France,#10,2/5/1992,27,1.75,...,CONMEBOL,4,17,180000000.0,310200000,180000000.0,1/24/2018,0,102,Hamstring Injury: Return unknown
2,,Mohamed Salah,Liverpool,Yes,Premier League,England,#11,6/15/1992,27,1.75,...,CAF,6,38,150000000.0,81000000,150000000.0,5/28/2018,0,67,
3,Eden Michael Hazard,Eden Hazard,Real Madrid,Yes,LaLiga,Spain,#7,1/7/1991,28,1.75,...,UEFA,1,37,150000000.0,135000000,150000000.0,10/17/2018,0,106,
4,Lionel Andrés Messi Cuccitini,Lionel Messi,FC Barcelona,Yes,LaLiga,Spain,#10,6/24/1987,32,1.7,...,CONMEBOL,8,34,150000000.0,0,180000000.0,1/1/2018,1,138,


In [3]:
soccer.shape

(4308, 27)

In [4]:
soccer.columns

Index(['FullName', 'PlayerName', 'Affiliation', 'Champions League', 'League',
       'League Nation', 'Jersey', 'Birth Date', 'Age', 'Height (meters)',
       'Position', 'Foot', 'Agent', 'Agent Yes/No', 'PlayerSponsor',
       'Nationality', 'Nation/League', 'Continent', 'Games Played',
       '2018/19 Games Played', 'Market Value (Euros)',
       'Accumulated Transfer Sums (Euros)', 'Highest Market Value (Euros)',
       'Highest Market Value Date', 'Years Since Peak', 'NationalTeamCaps',
       'MostRecentInjury'],
      dtype='object')

In [5]:
#shuffling
soccer=soccer.sample(frac=1, random_state=0)

In [6]:
# pulling only the columns we want to use
soccer1=soccer.iloc[:,[3,4,8,9,10,11,13,14,16,17,19,20,22,24,25]].copy()
soccer1.head()

Unnamed: 0,Champions League,League,Age,Height (meters),Position,Foot,Agent Yes/No,PlayerSponsor,Nation/League,Continent,2018/19 Games Played,Market Value (Euros),Highest Market Value (Euros),Years Since Peak,NationalTeamCaps
1119,No,Ligue 1,21,1.74,Forward,right,Yes,Other,Yes,UEFA,35,7500000.0,7500000.0,0,2
2734,No,Ligue 1,25,1.77,Midfielder,right,Yes,Other,No,CAF,33,1000000.0,1000000.0,0,1
1798,No,Serie A,24,1.87,Defender,left,Yes,Other,No,UEFA,3,3500000.0,4000000.0,1,8
481,No,Serie A,25,1.74,Midfielder,right,Yes,Other,Yes,UEFA,33,18000000.0,100000.0,0,1
559,Yes,Premier Liga,24,1.82,Midfielder,left,No,adidas,Yes,UEFA,0,16000000.0,16000000.0,0,25


In [7]:
soccer1.columns

Index(['Champions League', 'League', 'Age', 'Height (meters)', 'Position',
       'Foot', 'Agent Yes/No', 'PlayerSponsor', 'Nation/League', 'Continent',
       '2018/19 Games Played', 'Market Value (Euros)',
       'Highest Market Value (Euros)', 'Years Since Peak', 'NationalTeamCaps'],
      dtype='object')

In [8]:
#checking categorical variables for misspelled or incorrect values
print('Champions League:','\n',soccer1['Champions League'].value_counts(),'\n')
print('League:','\n',soccer1['League'].value_counts(),'\n')
print('Position:','\n',soccer1['Position'].value_counts(),'\n')
print('Foot:','\n',soccer1['Foot'].value_counts(),'\n')
print('Agent:','\n',soccer1['Agent Yes/No'].value_counts(),'\n')
print('Player Sponsor','\n',soccer1['PlayerSponsor'].value_counts(),'\n')
print('Nation/League','\n',soccer1['Nation/League'].value_counts(),'\n')
print('Continent','\n',soccer1['Continent'].value_counts(),'\n')

Champions League: 
 No     3668
Yes     640
Name: Champions League, dtype: int64 

League: 
 Serie A               547
Ligue 1               546
Bundesliga            533
Premier League        505
LaLiga                490
Eredivisie            486
Liga NOS              478
Premier Liga          413
Jupiler Pro League    310
Name: League, dtype: int64 

Position: 
 Defender       1434
Midfielder     1215
Forward        1136
Goalkeeper      494
Midfielder        1
Name: Position, dtype: int64 

Foot: 
 right    2888
left     1004
both      200
-           6
Name: Foot, dtype: int64 

Agent: 
 Yes    3618
No      690
Name: Agent Yes/No, dtype: int64 

Player Sponsor 
 Other     2576
None       675
Nike       513
adidas     456
Puma        88
Name: PlayerSponsor, dtype: int64 

Nation/League 
 No     2231
Yes    2077
Name: Nation/League, dtype: int64 

Continent 
 UEFA        3241
CONMEBOL     502
CAF          405
AFC           67
CONCACAF      66
OFC            5
Name: Continent, dtype: 

In [9]:
#checking for null values
soccer1.isnull().sum()

Champions League                  0
League                            0
Age                               0
Height (meters)                  83
Position                         28
Foot                            210
Agent Yes/No                      0
PlayerSponsor                     0
Nation/League                     0
Continent                        22
2018/19 Games Played              0
Market Value (Euros)             19
Highest Market Value (Euros)      1
Years Since Peak                  0
NationalTeamCaps                  0
dtype: int64

In [10]:
# checking that column types are all accurate
soccer1.dtypes

Champions League                 object
League                           object
Age                               int64
Height (meters)                 float64
Position                         object
Foot                             object
Agent Yes/No                     object
PlayerSponsor                    object
Nation/League                    object
Continent                        object
2018/19 Games Played              int64
Market Value (Euros)            float64
Highest Market Value (Euros)    float64
Years Since Peak                  int64
NationalTeamCaps                  int64
dtype: object

In [11]:
# dropping null and incorrect values
soccer1.dropna(inplace=True)
soccer1.drop(soccer1[soccer1['Foot']=='-'].index,inplace=True)

In [12]:
# creating features and target sets
x_soccer1=soccer1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,12,13,14]]
y_soccer1=soccer1.iloc[:,11]

In [13]:
x_soccer1.head()

Unnamed: 0,Champions League,League,Age,Height (meters),Position,Foot,Agent Yes/No,PlayerSponsor,Nation/League,Continent,2018/19 Games Played,Highest Market Value (Euros),Years Since Peak,NationalTeamCaps
1119,No,Ligue 1,21,1.74,Forward,right,Yes,Other,Yes,UEFA,35,7500000.0,0,2
2734,No,Ligue 1,25,1.77,Midfielder,right,Yes,Other,No,CAF,33,1000000.0,0,1
1798,No,Serie A,24,1.87,Defender,left,Yes,Other,No,UEFA,3,4000000.0,1,8
481,No,Serie A,25,1.74,Midfielder,right,Yes,Other,Yes,UEFA,33,100000.0,0,1
559,Yes,Premier Liga,24,1.82,Midfielder,left,No,adidas,Yes,UEFA,0,16000000.0,0,25


In [14]:
y_soccer1.head()

1119     7500000.0
2734     1000000.0
1798     3500000.0
481     18000000.0
559     16000000.0
Name: Market Value (Euros), dtype: float64

In [15]:
# encoding dummy variables
x_dummies=pd.get_dummies(x_soccer1)
x_dummies.head()

Unnamed: 0,Age,Height (meters),2018/19 Games Played,Highest Market Value (Euros),Years Since Peak,NationalTeamCaps,Champions League_No,Champions League_Yes,League_Bundesliga,League_Eredivisie,...,PlayerSponsor_Puma,PlayerSponsor_adidas,Nation/League_No,Nation/League_Yes,Continent_AFC,Continent_CAF,Continent_CONCACAF,Continent_CONMEBOL,Continent_OFC,Continent_UEFA
1119,21,1.74,35,7500000.0,0,2,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2734,25,1.77,33,1000000.0,0,1,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1798,24,1.87,3,4000000.0,1,8,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
481,25,1.74,33,100000.0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
559,24,1.82,0,16000000.0,0,25,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1


In [16]:
x_dummies.shape

(4041, 39)

In [17]:
x_dummies.columns

Index(['Age', 'Height (meters)', '2018/19 Games Played',
       'Highest Market Value (Euros)', 'Years Since Peak', 'NationalTeamCaps',
       'Champions League_No', 'Champions League_Yes', 'League_Bundesliga',
       'League_Eredivisie', 'League_Jupiler Pro League', 'League_LaLiga',
       'League_Liga NOS', 'League_Ligue 1', 'League_Premier League',
       'League_Premier Liga', 'League_Serie A', 'Position_Defender ',
       'Position_Forward ', 'Position_Goalkeeper', 'Position_Midfielder ',
       'Foot_both', 'Foot_left', 'Foot_right', 'Agent Yes/No_No',
       'Agent Yes/No_Yes', 'PlayerSponsor_Nike', 'PlayerSponsor_None',
       'PlayerSponsor_Other', 'PlayerSponsor_Puma', 'PlayerSponsor_adidas',
       'Nation/League_No', 'Nation/League_Yes', 'Continent_AFC',
       'Continent_CAF', 'Continent_CONCACAF', 'Continent_CONMEBOL',
       'Continent_OFC', 'Continent_UEFA'],
      dtype='object')

# Descriptive Analysis

In [18]:
soccer1.describe()

Unnamed: 0,Age,Height (meters),2018/19 Games Played,Market Value (Euros),Highest Market Value (Euros),Years Since Peak,NationalTeamCaps
count,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0
mean,25.647117,1.826043,16.475872,8080358.0,8963982.0,1.538233,11.769611
std,4.36,0.066506,13.27739,15080750.0,16462530.0,2.47694,20.550207
min,16.0,1.59,0.0,25000.0,25000.0,0.0,0.0
25%,22.0,1.78,0.0,800000.0,900000.0,0.0,0.0
50%,25.0,1.83,17.0,2500000.0,3000000.0,0.0,3.0
75%,29.0,1.87,29.0,8000000.0,10000000.0,2.0,13.0
max,42.0,2.04,46.0,200000000.0,200000000.0,15.0,176.0


In [19]:
x_dummies.describe()

Unnamed: 0,Age,Height (meters),2018/19 Games Played,Highest Market Value (Euros),Years Since Peak,NationalTeamCaps,Champions League_No,Champions League_Yes,League_Bundesliga,League_Eredivisie,...,PlayerSponsor_Puma,PlayerSponsor_adidas,Nation/League_No,Nation/League_Yes,Continent_AFC,Continent_CAF,Continent_CONCACAF,Continent_CONMEBOL,Continent_OFC,Continent_UEFA
count,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,...,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0,4041.0
mean,25.647117,1.826043,16.475872,8963982.0,1.538233,11.769611,0.845335,0.154665,0.127691,0.10344,...,0.021282,0.111111,0.522148,0.477852,0.015838,0.091561,0.015838,0.117298,0.001237,0.758228
std,4.36,0.066506,13.27739,16462530.0,2.47694,20.550207,0.361629,0.361629,0.333787,0.30457,...,0.14434,0.314309,0.499571,0.499571,0.124863,0.288442,0.124863,0.321815,0.035158,0.42821
min,16.0,1.59,0.0,25000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,1.78,0.0,900000.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,25.0,1.83,17.0,3000000.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,29.0,1.87,29.0,10000000.0,2.0,13.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,42.0,2.04,46.0,200000000.0,15.0,176.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Predictive Models

In [25]:
#splitting into train and test sets
X_train, X_test, y_train, y_test= train_test_split(x_dummies, y_soccer1, random_state=0, test_size=.2)

## k-NN Regression

In [26]:
# validation curve
neighbors_range=[2,3,4,5,6,8]
knn_train_scores, knn_test_scores=validation_curve(KNeighborsRegressor(),x_dummies,y_soccer1,param_name='n_neighbors',param_range=neighbors_range, cv=4)
print('train scores: ',knn_train_scores.mean(axis=1))
print('test scores: ',knn_test_scores.mean(axis=1))
print('neighbors range: ',neighbors_range)

train scores:  [0.96637561 0.95460112 0.94718689 0.94036028 0.93554984 0.92726844]
test scores:  [0.89741405 0.90602516 0.90825177 0.90937612 0.90894904 0.90678256]
neighbors range:  [2, 3, 4, 5, 6, 8]


In [27]:
#defining and fitting the model KNN
knn_reg=KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train,y_train)

KNeighborsRegressor()

In [28]:
# evaluating the model
print('KNN Regression acc on train: {:.2%}'.format(knn_reg.score(X_train, y_train)))
print('KNN Regression acc on test: {:.2%}'.format(knn_reg.score(X_test, y_test)))

KNN Regression acc on train: 93.75%
KNN Regression acc on test: 92.99%


## Linear Regression

In [29]:
# defining and fitting the model linear regression
lr=LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [30]:
# evaluating the model
print('LR acc on train: {:.2%}'.format(lr.score(X_train, y_train)))
print('LR acc on test: {:.2%}'.format(lr.score(X_test, y_test)))

LR acc on train: 88.32%
LR acc on test: 89.71%


## Ridge Regression

In [31]:
# validation curve
ridge_alpha_range=[1,10,20,50,100]
ridge_train_scores, ridge_test_scores=validation_curve(Ridge(random_state=0),x_dummies,y_soccer1,param_name='alpha',param_range=ridge_alpha_range, cv=4)
print('train scores: ',ridge_train_scores.mean(axis=1))
print('test scores: ',ridge_test_scores.mean(axis=1))
print('alpha range: ',ridge_alpha_range)

train scores:  [0.8866491  0.88660887 0.8865693  0.88643091 0.8861408 ]
test scores:  [0.88278708 0.88292992 0.88298273 0.88301123 0.88289771]
alpha range:  [1, 10, 20, 50, 100]


  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


In [32]:
# define and fit the model Ridge
ridge=Ridge(alpha=50,random_state=0)
ridge.fit(X_train, y_train)

  return linalg.solve(A, Xy, sym_pos=True,


Ridge(alpha=50, random_state=0)

In [33]:
# evaluate the model
print('Ridge R-square on train {:.2%}'.format(ridge.score(X_train, y_train)))
print('Ridge R-square on test {:.2%}'.format(ridge.score(X_test, y_test)))

Ridge R-square on train 88.30%
Ridge R-square on test 89.73%


## Lasso Regression

In [34]:
# validation curve
lasso_alpha_range=[1,10,20,50,100]
lasso_train_scores, lasso_test_scores=validation_curve(Lasso(max_iter=300000,random_state=0),x_dummies,y_soccer1,param_name='alpha',param_range=lasso_alpha_range, cv=4)
print('train scores: ',lasso_train_scores.mean(axis=1))
print('test scores: ',lasso_test_scores.mean(axis=1))
print('alpha range: ',lasso_alpha_range)

train scores:  [0.88665133 0.88665132 0.88665132 0.88665131 0.88665126]
test scores:  [0.88273896 0.88274002 0.88274119 0.88274469 0.8827505 ]
alpha range:  [1, 10, 20, 50, 100]


In [35]:
# define and fit the model lasso
lasso=Lasso(alpha=100, max_iter=300000,random_state=0)
lasso.fit(X_train, y_train)

Lasso(alpha=100, max_iter=300000, random_state=0)

In [36]:
# evaluate performance of model
print('lass on train {:.2%}'.format(lasso.score(X_train, y_train)))
print('lass on test {:.2%}'.format(lasso.score(X_test, y_test)))

lass on train 88.32%
lass on test 89.71%


## Decision Tree Regression

In [37]:
# validation curve
depth_range=[6,7,8,9,10,11]
dt_train_scores, dt_test_scores=validation_curve(DecisionTreeRegressor(random_state=0),x_dummies,y_soccer1,param_name='max_depth',param_range=depth_range, cv=4)
print('train scores: ',dt_train_scores.mean(axis=1))
print('test scores: ',dt_test_scores.mean(axis=1))
print('max depth range: ',depth_range)

train scores:  [0.95810179 0.97086341 0.97930278 0.98463459 0.98884054 0.99264951]
test scores:  [0.92526641 0.93017397 0.93158848 0.9270375  0.92316693 0.92527252]
max depth range:  [6, 7, 8, 9, 10, 11]


In [38]:
# define and fit the model decision tree
dt_reg=DecisionTreeRegressor(random_state=0, max_depth=8)
dt_reg.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=8, random_state=0)

In [39]:
# evaluating the model
print('dt_reg acc on train: {:.2%}'.format(dt_reg.score(X_train, y_train)))
print('dt_reg acc on test: {:.2%}'.format(dt_reg.score(X_test, y_test)))

dt_reg acc on train: 97.65%
dt_reg acc on test: 93.46%


In [40]:
# decision tree feature importances
dt_feat_imp=pd.DataFrame(index=x_dummies.columns, data=dt_reg.feature_importances_,columns=['importance'])
dt_feat_imp.sort_values('importance', ascending=False)

Unnamed: 0,importance
Highest Market Value (Euros),0.920096
Years Since Peak,0.038269
2018/19 Games Played,0.010109
League_Ligue 1,0.00652
Position_Forward,0.004476
Age,0.004007
PlayerSponsor_None,0.003672
Champions League_No,0.003156
NationalTeamCaps,0.001769
Continent_CONMEBOL,0.00169


In [41]:
# exporting the graphviz
export_graphviz(dt_reg, out_file='soccer_reg_tree.dot', feature_names=x_dummies.columns,impurity=True, filled=True)

## Random Forest Regression

In [42]:
# validation curve
estimator_range=[200,300,400,500,600]
rf_train_scores, rf_test_scores=validation_curve(RandomForestRegressor(random_state=0),x_dummies,y_soccer1,param_name='n_estimators',param_range=estimator_range, cv=4)
print('train scores: ',rf_train_scores.mean(axis=1))
print('test scores: ',rf_test_scores.mean(axis=1))
print('estimators range: ',estimator_range)

train scores:  [0.99377203 0.99393205 0.9939085  0.99387843 0.99389639]
test scores:  [0.95871417 0.95875655 0.9592309  0.95950715 0.95967964]
estimators range:  [200, 300, 400, 500, 600]


In [43]:
# define and fit the model random forest
rf_reg=RandomForestRegressor(n_estimators=600, random_state=0)
rf_reg.fit(X_train,y_train)

RandomForestRegressor(n_estimators=600, random_state=0)

In [44]:
# evaluate the model
print('rf_reg acc on train: {:.2%}'.format(rf_reg.score(X_train, y_train)))
print('rf_reg acc on test: {:.2%}'.format(rf_reg.score(X_test, y_test)))

rf_reg acc on train: 99.39%
rf_reg acc on test: 96.33%


In [45]:
# random forest feature importances
rf_feat_imp=pd.DataFrame(index=x_dummies.columns, data=rf_reg.feature_importances_,columns=['importance'])
rf_feat_imp.sort_values('importance', ascending=False)

Unnamed: 0,importance
Highest Market Value (Euros),0.8985289
Years Since Peak,0.03539236
2018/19 Games Played,0.01807035
Age,0.01079318
NationalTeamCaps,0.0071444
Height (meters),0.003267061
PlayerSponsor_None,0.003166388
League_Premier League,0.002205699
Champions League_Yes,0.00211898
League_Ligue 1,0.002010687


# comparing model performances on test set

In [46]:
print('KNN Regression acc on test: {:.2%}'.format(knn_reg.score(X_test, y_test)),'\n')
print('Linear Regression acc on test: {:.2%}'.format(lr.score(X_test, y_test)),'\n')
print('Ridge Regression acc on test {:.2%}'.format(ridge.score(X_test, y_test)),'\n')
print('Lasso Regression on test {:.2%}'.format(lasso.score(X_test, y_test)),'\n')
print('Decision Tree Regression acc on test: {:.2%}'.format(dt_reg.score(X_test, y_test)),'\n')
print('Random Forest Regression acc on test: {:.2%}'.format(rf_reg.score(X_test, y_test)),'\n')

KNN Regression acc on test: 92.99% 

Linear Regression acc on test: 89.71% 

Ridge Regression acc on test 89.73% 

Lasso Regression on test 89.71% 

Decision Tree Regression acc on test: 93.46% 

Random Forest Regression acc on test: 96.33% 



# Predictions 

In [47]:
x_dummies.columns

Index(['Age', 'Height (meters)', '2018/19 Games Played',
       'Highest Market Value (Euros)', 'Years Since Peak', 'NationalTeamCaps',
       'Champions League_No', 'Champions League_Yes', 'League_Bundesliga',
       'League_Eredivisie', 'League_Jupiler Pro League', 'League_LaLiga',
       'League_Liga NOS', 'League_Ligue 1', 'League_Premier League',
       'League_Premier Liga', 'League_Serie A', 'Position_Defender ',
       'Position_Forward ', 'Position_Goalkeeper', 'Position_Midfielder ',
       'Foot_both', 'Foot_left', 'Foot_right', 'Agent Yes/No_No',
       'Agent Yes/No_Yes', 'PlayerSponsor_Nike', 'PlayerSponsor_None',
       'PlayerSponsor_Other', 'PlayerSponsor_Puma', 'PlayerSponsor_adidas',
       'Nation/League_No', 'Nation/League_Yes', 'Continent_AFC',
       'Continent_CAF', 'Continent_CONCACAF', 'Continent_CONMEBOL',
       'Continent_OFC', 'Continent_UEFA'],
      dtype='object')

In [48]:
C_Pulisic=[21,1.72,25,66000000,0,34,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0]
D_Silva=[34,1.70,27,55000000,8,125,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1]
F_Cardoso=[26,1.87,30,2200000,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1]

## KNN

In [49]:
print('KNN Regression prediction on Pulisic: ', knn_reg.predict([C_Pulisic]).round(2),'\n')
print('KNN Regression prediction on Silva: ', knn_reg.predict([D_Silva]).round(2),'\n')
print('KNN Regression prediction on Cardoso: ', knn_reg.predict([F_Cardoso]).round(2),'\n')

KNN Regression prediction on Pulisic:  [58000000.] 

KNN Regression prediction on Silva:  [43000000.] 

KNN Regression prediction on Cardoso:  [1660000.] 



## Linear Regression

In [50]:
print('Linear Regression prediction on Pulisic: ', lr.predict([C_Pulisic]).round(2),'\n')
print('Linear Regression prediction on Silva: ', lr.predict([D_Silva]).round(2),'\n')
print('Linear Regression prediction on Cardoso: ', lr.predict([F_Cardoso]).round(2),'\n')

Linear Regression prediction on Pulisic:  [61122959.32] 

Linear Regression prediction on Silva:  [34272645.18] 

Linear Regression prediction on Cardoso:  [3897175.74] 



## Ridge Regression

In [51]:
print('Ridge Regression prediction on Pulisic: ', ridge.predict([C_Pulisic]).round(2),'\n')
print('Ridge Regression prediction on Silva: ', ridge.predict([D_Silva]).round(2),'\n')
print('Ridge Regression prediction on Cardoso: ', ridge.predict([F_Cardoso]).round(2),'\n')

Ridge Regression prediction on Pulisic:  [60541893.77] 

Ridge Regression prediction on Silva:  [34846697.4] 

Ridge Regression prediction on Cardoso:  [3995723.9] 



## Lasso Regression

In [52]:
print('Lasso Regression prediction on Pulisic: ', lasso.predict([C_Pulisic]).round(2),'\n')
print('Lasso Regression prediction on Silva: ', lasso.predict([D_Silva]).round(2),'\n')
print('Lasso Regression prediction on Cardoso: ', lasso.predict([F_Cardoso]).round(2),'\n')

Lasso Regression prediction on Pulisic:  [61122810.04] 

Lasso Regression prediction on Silva:  [34276321.36] 

Lasso Regression prediction on Cardoso:  [3900016.62] 



## Decision Tree Regression

In [53]:
print('Decision Tree Regression prediction on Pulisic: ', dt_reg.predict([C_Pulisic]).round(2),'\n')
print('Decision Tree Regression prediction on Silva: ', dt_reg.predict([D_Silva]).round(2),'\n')
print('Decision Tree Regression prediction on Cardoso: ', dt_reg.predict([F_Cardoso]).round(2),'\n')

Decision Tree Regression prediction on Pulisic:  [65000000.] 

Decision Tree Regression prediction on Silva:  [25000000.] 

Decision Tree Regression prediction on Cardoso:  [1529473.68] 



## Random Forest Regression

In [54]:
print('Random Forest Regression prediction on Pulisic: ', rf_reg.predict([C_Pulisic]).round(2),'\n')
print('Random Forest Regression prediction on Silva: ', rf_reg.predict([D_Silva]).round(2),'\n')
print('Random Forest Regression prediction on Cardoso: ', rf_reg.predict([F_Cardoso]).round(2),'\n')

Random Forest Regression prediction on Pulisic:  [64030000.] 

Random Forest Regression prediction on Silva:  [20228333.33] 

Random Forest Regression prediction on Cardoso:  [2090750.] 

