In [49]:
import sqlite3 #interact with relational datasets
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

In [None]:
#Read data from Database into pandas
connection = sqlite3.connect('./soccer/database.sqlite')
df = pd.read_sql_query('SELECT * FROM Player_Attributes',connection)

### EDA

In [5]:
df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [10]:
#Removing the useless columns
df.drop(['id','player_fifa_api_id','player_api_id','date'], axis=1, inplace=True)
df.head()

Unnamed: 0,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [12]:
#No of features available
df.shape

(180354, 38)

In [13]:
#check is any null value 
df.isnull().values.any()

False

In [16]:
#drop them
df.dropna(inplace=True)
df.head()

Unnamed: 0,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


### Preprocessing

In [18]:
#converting categorial features to numerials
df.preferred_foot = LabelEncoder().fit_transform(df.preferred_foot)
df.attacking_work_rate = LabelEncoder().fit_transform(df.attacking_work_rate)
df.defensive_work_rate  = LabelEncoder().fit_transform(df.defensive_work_rate)

df.head()

Unnamed: 0,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,67.0,71.0,1,4,14,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,67.0,71.0,1,4,14,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,62.0,66.0,1,4,14,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,61.0,65.0,1,4,14,48.0,43.0,70.0,60.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,61.0,65.0,1,4,14,48.0,43.0,70.0,60.0,43.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [39]:
features = df.drop(['overall_rating'], axis=1)

In [21]:
features.columns

Index(['potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

In [33]:
#Target
y = df['overall_rating'].values

In [40]:
#feature scaling
X = StandardScaler().fit_transform(features.values)

In [41]:
X.shape

(180354, 37)

In [45]:
# split the data into training and test datasets
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.33, random_state=324)

### We will use different models and try to evaluate for better performance 

In [50]:
#Model-1: Linear Regression 
regressor = SGDRegressor()
regressor.fit(X_train,y_train)

y_pred_SGD = regressor.predict(X_test)

#Calculating r2-score - show how well the statistical model fits the underlying data
print('r2-score for SGD regressor: {}'.format(r2_score(y_pred=y_pred_SGD,y_true=y_test)))



r2-score for SGD regressor: 0.8387101590625065


In [54]:
#Model-2: Decision Tree Regressor
treeRegressor = DecisionTreeRegressor(max_depth=20)
treeRegressor.fit(X_train, y_train)

y_pred_treeRegressor = treeRegressor.predict(X_test)

#Calculating r2-score - show how well the statistical model fits the underlying data
print('r2-score for Tree regressor: {}'.format(r2_score(y_pred=y_pred_treeRegressor,y_true=y_test)))

r2-score for Tree regressor: 0.9569872043401213


In [55]:
#Model-3: Decision Tree Classifier
treeClassifier = DecisionTreeClassifier()
treeClassifier.fit(X_train, y_train)

y_pred_treeClassifier = treeClassifier.predict(X_test)

#Calculating r2-score - show how well the statistical model fits the underlying data
print('r2-score for Tree regressor: {}'.format(r2_score(y_pred=y_pred_treeClassifier,y_true=y_test)))

r2-score for Tree regressor: 0.9452483211628474
