<a href="https://colab.research.google.com/github/michaelalassaad/Basketball-Statistics-Prediction/blob/main/Basketball_PPG_Predction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# dataset CSV stored on Drive can be read by path in the next cell.
# NOTE: the google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the NBA player statistics CSV from the mounted Drive and keep only the
# columns used for modelling (the per-game counting stats plus Age and PTS).
url = '/content/gdrive/MyDrive/Projects/ML/NBA_Player_Stats.csv'

# Identifier columns, attempt counts and percentage columns removed up front.
dropped_columns = [
    'Player', 'Rk', 'Pos', 'Tm', 'Year', 'GS', 'G', 'MP', 'ORB', 'DRB',
    'FGA', '3PA', '2PA', 'FTA', 'eFG%', 'FT%', '2P%', '3P%', 'FG%',
]

# NOTE: a per-player de-duplication step (drop_duplicates on 'Player',
# keep='first') is intentionally left disabled. If it is ever re-enabled it
# must run before 'Player' is dropped, otherwise the column no longer exists.
dataset = (
    pd.read_csv(url)
    .drop(columns=dropped_columns)
    .reset_index(drop=True)
)
dataset.head()

Unnamed: 0,Age,FG,3P,2P,FT,TRB,AST,STL,BLK,TOV,PF,PTS
0,28,3.3,0.2,3.2,0.5,1.2,1.9,0.5,0.0,0.6,1.0,7.3
1,23,2.4,0.1,2.4,1.4,2.0,0.9,0.6,0.2,1.1,1.4,6.4
2,21,8.0,0.3,7.7,6.1,7.1,2.6,1.1,0.9,3.1,2.5,22.3
3,24,2.9,1.1,1.8,1.3,2.4,3.5,1.2,0.2,1.9,1.6,8.1
4,24,1.6,0.5,1.1,0.7,1.3,1.9,0.7,0.1,1.3,1.4,4.5


## Separating the features (X) and the target (y)

In [None]:
# Positional split: every column except the last one is a feature,
# and the last column (PTS) is the regression target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

# Work with plain NumPy arrays from here on.
X = feature_frame.values
y = target_series.values

In [None]:
# List the remaining column names; the order here is the feature order of X,
# with the target (PTS) last.
dataset.columns

Index(['Age', 'FG', '3P', '2P', 'FT', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS'],
      dtype='object')

In [None]:
# Disabled debugging aid: uncomment both lines below to print the whole
# DataFrame without pandas truncating rows or columns.
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(dataset)

In [None]:
# Sanity check of the feature matrix; NumPy abbreviates the large array with "...".
print(X)

[[28.   3.3  0.2 ...  0.   0.6  1. ]
 [23.   2.4  0.1 ...  0.2  1.1  1.4]
 [21.   8.   0.3 ...  0.9  3.1  2.5]
 ...
 [23.   2.3  0.  ...  0.4  0.7  1.5]
 [29.   1.9  0.  ...  0.2  0.7  2.1]
 [24.   4.1  0.  ...  1.   1.5  2.7]]


In [None]:
# Sanity check of the target vector (PTS); NumPy abbreviates it with "...".
print(y)

[ 7.3  6.4 22.3 ...  5.3  5.2 10.3]


## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing feature values (NaN) with the mean of their column.
# Every feature column is imputed, including Age in column 0, so that no NaN
# can reach the model regardless of which column it appears in.
# NOTE(review): the imputer is fitted on the full feature matrix before the
# train/test split performed later in the notebook, so test rows contribute
# to the imputed means. Fitting on the training split only would avoid that.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Written back in place so X keeps its identity, shape and dtype.
X[:, :] = imputer.fit_transform(X)

In [None]:
# Re-inspect the feature matrix after the imputation step.
print(X)

[[28.   3.3  0.2 ...  0.   0.6  1. ]
 [23.   2.4  0.1 ...  0.2  1.1  1.4]
 [21.   8.   0.3 ...  0.9  3.1  2.5]
 ...
 [23.   2.3  0.  ...  0.4  0.7  1.5]
 [29.   1.9  0.  ...  0.2  0.7  2.1]
 [24.   4.1  0.  ...  1.   1.5  2.7]]


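## Encoding categorical data

No encoding step is needed here: the identifier columns (`Player`, `Pos`, `Tm`) are dropped when the dataset is loaded, and every remaining column shown in `dataset.head()` is numeric.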

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Training-split feature matrix (abbreviated by NumPy).
print(X_train)

[[32.   2.5  1.4 ...  0.2  0.6  1.5]
 [31.   5.3  0.6 ...  0.5  1.2  1.2]
 [32.   3.   0.7 ...  0.4  1.4  2.7]
 ...
 [31.   5.3  0.  ...  2.   1.4  2.8]
 [34.   0.3  0.  ...  0.3  0.5  1.3]
 [26.   3.2  0.  ...  0.5  0.7  1.4]]


In [None]:
# Test-split feature matrix (abbreviated by NumPy).
print(X_test)

[[32.   2.   1.  ...  0.4  0.5  2.1]
 [20.   3.7  1.4 ...  0.5  1.7  2.3]
 [25.   1.2  0.4 ...  0.   1.   0.6]
 ...
 [34.   5.1  1.1 ...  0.   2.1  2.5]
 [28.   0.1  0.  ...  0.   0.1  0.2]
 [25.   6.5  2.2 ...  0.3  1.7  2.4]]


In [None]:
# Training-split target values (PTS).
print(y_train)

[ 6.8 12.7  7.8 ... 12.   0.5  7.8]


In [None]:
# Test-split target values (PTS).
print(y_test)

[ 5.5 10.5  3.  ... 13.9  0.3 17.8]


## Training and evaluating a Lasso regression model (no feature scaling applied)

In [None]:
from math import sqrt

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RepeatedKFold, cross_val_score

# Fit an L1-regularised linear regression on the training split.
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, y_train)

# Predictions for both splits, kept under their own names for later cells.
pred_train_lasso = model_lasso.predict(X_train)
pred_test_lasso = model_lasso.predict(X_test)

# Report RMSE followed by R^2, first for the training split, then for the test split.
# NOTE(review): in the dataset.head() preview PTS is approximately
# 2*2P + 3*3P + FT, and all three of those columns are features, so scores
# this close to perfect are expected rather than evidence of predictive skill.
for actual, predicted in ((y_train, pred_train_lasso), (y_test, pred_test_lasso)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

0.07550500642445848
0.9998313182827568
0.0757706241105004
0.9998375626010853


In [None]:
# Report the (rows, columns) shape of each feature split.
shape_reports = (
    ("The dimension of X_train is {}", X_train),
    ("The dimension of X_test is {}", X_test),
)
for message, split in shape_reports:
    print(message.format(split.shape))

The dimension of X_train is (11658, 11)
The dimension of X_test is (2915, 11)


In [None]:
# Predict points per game for one hand-written player row.
# Values must follow the training column order:
#   Age, FG, 3P, 2P, FT, TRB, AST, STL, BLK, TOV, PF
# NOTE(review): this row is not internally consistent with the training data
# (FG=4 while 3P + 2P = 11, and TOV=60 is far outside the values shown in the
# preview), so the prediction is an extrapolation. Confirm the values are intended.
sample_player = [20, 4, 1, 10., 4, 1.2, 1.9, 0.5, 3, 60, 1.0]
print(model_lasso.predict([sample_player]))

[12.95875952]
