## Scrap th data 

In [1]:
import csv
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Send a GET request to the website
url="https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
response = requests.get(url)

In [3]:
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# Find the table containing the player stats
table = soup.find('table', {'id': 'per_game_stats'})

In [5]:
# Find all the rows in the table
rows = table.find_all('tr')

In [6]:
# Extract the data from each row and store it in a list
data_rows = []
for row in rows[1:]:
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    data_rows.append(cols)


In [7]:
#convert list to dataframe and add columns names
column_names=['Player','Position','Age','Team','Games','Games Started','Minutes Played Per Game','Field Goals Per Game','Field Goal Attempts Per Game','Field Goal Percentage','3-Point Field Goals Per Game','3-Point Field Goal Attempts Per Game','3-Point Field Goal Percentage','2-Point Field Goals Per Game','2-Point Field Goal Attempts Per Game','2-Point Field Goal Percentage','Effective Field Goal Percentage','Free Throws Per Game','Free Throw Attempts Per Game','Free Throw Percentage','Offensive Rebounds Per Game','Defensive Rebounds Per Game','Total Rebounds Per Game','Assists Per Game','Steals Per Game','Blocks Per Game','Turnovers Per Game','Personal Fouls Per Game','Points Per Game']
df=pd.DataFrame(data_rows, columns=column_names)
df.head()

Unnamed: 0,Player,Position,Age,Team,Games,Games Started,Minutes Played Per Game,Field Goals Per Game,Field Goal Attempts Per Game,Field Goal Percentage,...,Free Throw Percentage,Offensive Rebounds Per Game,Defensive Rebounds Per Game,Total Rebounds Per Game,Assists Per Game,Steals Per Game,Blocks Per Game,Turnovers Per Game,Personal Fouls Per Game,Points Per Game
0,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,0.439,...,0.595,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
1,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,0.547,...,0.543,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9
2,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,0.557,...,0.753,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1
3,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,0.402,...,0.625,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1
4,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,0.55,...,0.873,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9


## Data Wrangling

In [8]:
#check for duplicated
df.duplicated().sum()

29

In [9]:
#check for duplicated Player
df.Player.duplicated().sum()

236

In [10]:
# remove duplicated player 
df.drop_duplicates(["Player"],inplace=True)
df.shape

(606, 29)

In [11]:
df.head()

Unnamed: 0,Player,Position,Age,Team,Games,Games Started,Minutes Played Per Game,Field Goals Per Game,Field Goal Attempts Per Game,Field Goal Percentage,...,Free Throw Percentage,Offensive Rebounds Per Game,Defensive Rebounds Per Game,Total Rebounds Per Game,Assists Per Game,Steals Per Game,Blocks Per Game,Turnovers Per Game,Personal Fouls Per Game,Points Per Game
0,Precious Achiuwa,C,22,TOR,73,28,23.6,3.6,8.3,0.439,...,0.595,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
1,Steven Adams,C,28,MEM,76,75,26.3,2.8,5.1,0.547,...,0.543,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9
2,Bam Adebayo,C,24,MIA,56,56,32.6,7.3,13.0,0.557,...,0.753,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1
3,Santi Aldama,PF,21,MEM,32,0,11.3,1.7,4.1,0.402,...,0.625,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1
4,LaMarcus Aldridge,C,36,BRK,47,12,22.3,5.4,9.7,0.55,...,0.873,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 606 entries, 0 to 841
Data columns (total 29 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Player                                605 non-null    object
 1   Position                              605 non-null    object
 2   Age                                   605 non-null    object
 3   Team                                  605 non-null    object
 4   Games                                 605 non-null    object
 5   Games Started                         605 non-null    object
 6   Minutes Played Per Game               605 non-null    object
 7   Field Goals Per Game                  605 non-null    object
 8   Field Goal Attempts Per Game          605 non-null    object
 9   Field Goal Percentage                 605 non-null    object
 10  3-Point Field Goals Per Game          605 non-null    object
 11  3-Point Field Goal Attempts Per 

In [13]:
# convert columns to float
df[['Minutes Played Per Game','Field Goals Per Game','Field Goal Attempts Per Game','Field Goal Percentage','3-Point Field Goals Per Game','3-Point Field Goal Attempts Per Game','3-Point Field Goal Percentage','2-Point Field Goals Per Game','2-Point Field Goal Attempts Per Game','2-Point Field Goal Percentage','Effective Field Goal Percentage','Free Throws Per Game','Free Throw Attempts Per Game','Free Throw Percentage','Offensive Rebounds Per Game','Defensive Rebounds Per Game','Total Rebounds Per Game','Assists Per Game','Steals Per Game','Blocks Per Game','Turnovers Per Game','Personal Fouls Per Game','Points Per Game']]= df[['Minutes Played Per Game','Field Goals Per Game','Field Goal Attempts Per Game','Field Goal Percentage','3-Point Field Goals Per Game','3-Point Field Goal Attempts Per Game','3-Point Field Goal Percentage','2-Point Field Goals Per Game','2-Point Field Goal Attempts Per Game','2-Point Field Goal Percentage','Effective Field Goal Percentage','Free Throws Per Game','Free Throw Attempts Per Game','Free Throw Percentage','Offensive Rebounds Per Game','Defensive Rebounds Per Game','Total Rebounds Per Game','Assists Per Game','Steals Per Game','Blocks Per Game','Turnovers Per Game','Personal Fouls Per Game','Points Per Game']].apply(pd.to_numeric)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 606 entries, 0 to 841
Data columns (total 29 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Player                                605 non-null    object 
 1   Position                              605 non-null    object 
 2   Age                                   605 non-null    object 
 3   Team                                  605 non-null    object 
 4   Games                                 605 non-null    object 
 5   Games Started                         605 non-null    object 
 6   Minutes Played Per Game               605 non-null    float64
 7   Field Goals Per Game                  605 non-null    float64
 8   Field Goal Attempts Per Game          605 non-null    float64
 9   Field Goal Percentage                 596 non-null    float64
 10  3-Point Field Goals Per Game          605 non-null    float64
 11  3-Point Field Goal 

In [20]:
#remove null row
df = df[df['Player'].notna()]

## Data Cleaning

In [21]:
#replace null values with zeros
df=df.replace(np.nan,0)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 605 entries, 0 to 841
Data columns (total 29 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Player                                605 non-null    object 
 1   Position                              605 non-null    object 
 2   Age                                   605 non-null    object 
 3   Team                                  605 non-null    object 
 4   Games                                 605 non-null    object 
 5   Games Started                         605 non-null    object 
 6   Minutes Played Per Game               605 non-null    float64
 7   Field Goals Per Game                  605 non-null    float64
 8   Field Goal Attempts Per Game          605 non-null    float64
 9   Field Goal Percentage                 605 non-null    float64
 10  3-Point Field Goals Per Game          605 non-null    float64
 11  3-Point Field Goal 

In [24]:
#save the work
df.to_csv('nba_clean.csv',index=False)