# 1. LINEAR REGRESSION 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split,GridSearchCV,KFold

from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv('updated_data')

In [4]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,0,67.0,71.0,0,1,1,49.0,44.0,71.0,61.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,1,67.0,71.0,0,1,1,49.0,44.0,71.0,61.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,2,62.0,66.0,0,1,1,49.0,44.0,71.0,61.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0


In [6]:
df.drop(columns='Unnamed: 0',inplace=True,axis = 1)

In [7]:
df.shape

(183978, 38)

In [9]:
df.head(2)

Unnamed: 0,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,67.0,71.0,0,1,1,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,67.0,71.0,0,1,1,49.0,44.0,71.0,61.0,44.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0


In [10]:
y = df['overall_rating']

X = df.drop(columns='overall_rating',axis=1)

In [12]:
X.shape

(183978, 37)

In [13]:
y

0         67.0
1         67.0
2         62.0
3         61.0
4         61.0
          ... 
183973    83.0
183974    78.0
183975    77.0
183976    78.0
183977    80.0
Name: overall_rating, Length: 183978, dtype: float64

In [14]:
y.shape

(183978,)

In [8]:
### Perfect

## Lets split the data into train and test

from sklearn.preprocessing import StandardScaler


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [16]:
scaler = StandardScaler()

In [17]:
X_scaled = scaler.fit_transform(X_train)

# X_SCALED IS TRAINING SCALED DATA FOR X_TRAIN

In [18]:
X_scaled_test = scaler.transform(X_test)

In [19]:
lr_model = LinearRegression()

In [20]:
lr_model.fit(X_scaled,y_train)

LinearRegression()

In [22]:
lr_model.score(X_scaled_test,y_test)

0.8436931815781487

In [25]:
## LETS GENERATE PREDICTIONS

test_pred = lr_model.predict(X_scaled_test)
test_pred

array([76.20755985, 68.70602883, 64.52157611, ..., 71.35117754,
       85.20701084, 56.81466409])

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [26]:
mean_squared_error(y_test, test_pred)

7.715869503710054

In [27]:
mean_absolute_error(y_test, test_pred)

2.117211354383321

In [28]:
np.sqrt(mean_squared_error(y_test, test_pred))

2.7777453993679937

In [29]:
# """If there is lot of difference between MAE and RMSE then it means your model is performing well
#on most of the points nearby but not far away points.

In [30]:
## CHECKING ADJ R2

# Let's create a function to create adjusted R-Squared
def adj_r2(x,y):
    r2 = lr_model.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [31]:
adj_r2(X_scaled_test,y_test)

0.8435883271238444

In [None]:
## 84 % ACCURACY HAS BEEN ACHIEVED FOR LINEAR REGRESSION

# LETS USE REGULARIZATION

In [33]:
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LinearRegression

In [34]:
# Lasso Regularization
# LassoCV will return best alpha and coefficients after performing 10 cross validations
lasscv = LassoCV(alphas = None,cv =10, max_iter = 100000, normalize = True)
lasscv.fit(X_scaled, y_train)

LassoCV(cv=10, max_iter=100000, normalize=True)

In [35]:
# best alpha parameter
alpha = lasscv.alpha_
alpha

1.5020673855754528e-05

In [36]:
#now that we have best parameter, let's use Lasso regression and see how well our data has fitted before

lasso_reg = Lasso(alpha)
lasso_reg.fit(X_scaled, y_train)

Lasso(alpha=1.5020673855754528e-05)

In [37]:
lasso_reg.score(X_scaled_test, y_test)

0.8436931967549095

In [38]:
### THERES NT MUCH DIFF HERE IN SCORE SO OUR MODEL DIDNT OVERFIT THE DATA

# 2. RANDOM FORREST

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
rand_model = RandomForestRegressor()

In [42]:
rand_model.fit(X_train,y_train)

RandomForestRegressor()

In [43]:
rand_model.score(X_test,y_test)

0.9818990340567026

# 3. XG BOOST

In [44]:
import xgboost as xgb

In [45]:
from xgboost import XGBRFRegressor

In [46]:
xgb_model = XGBRFRegressor()

In [47]:
xgb_model.fit(X_train,y_train)

XGBRFRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
               interaction_constraints='', max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=4, num_parallel_tree=100,
               objective='reg:squarederror', random_state=0, reg_alpha=0,
               scale_pos_weight=1, tree_method='exact', validate_parameters=1,
               verbosity=None)

In [51]:
xgb_model.score(X_test,y_test)

0.8565094185036485