# ALL NBA TEAMS PROJECT

In this notebook I do the following:
 - Data cleaning and formatting
 - Save cleaned data in SQL and also as csv

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import re

import pymysql                        # for getting data from a SQL database
from sqlalchemy import create_engine  # for establishing the connection and authentication

from getpass import getpass  # To get the password without showing the input

## Read datasets

In [2]:
all_nba_teams = pd.read_excel('all_nba_teams.xlsx')
players = pd.read_excel('player_data.xlsx')

### Check datasets

In [3]:
pd.set_option('display.max_columns', None)

display(all_nba_teams)
players

Unnamed: 0,Season,First team,Second team,Third team
0,1989–90,Karl Malone* (3),Larry Bird* (10),James Worthy*
1,1989–90,Charles Barkley* (5),Tom Chambers (2),Chris Mullin* (2)
2,1989–90,Patrick Ewing* (3),Akeem Olajuwon*[e] (5),David Robinson*
3,1989–90,Magic Johnson* (9),John Stockton* (3),Clyde Drexler* (2)
4,1989–90,Michael Jordan* (5),Kevin Johnson (2),Joe Dumars*
...,...,...,...,...
165,2022–23,Giannis Antetokounmpo^ (7),Jimmy Butler^ (5),LeBron James^ (19)
166,2022–23,Jayson Tatum^ (3),Jaylen Brown^,Julius Randle^ (2)
167,2022–23,Joel Embiid^ (5),Nikola Jokić^ (5),Domantas Sabonis^
168,2022–23,Luka Dončić^ (4),Stephen Curry^ (9),De'Aaron Fox^


Unnamed: 0,Player,GS,AS,Season,Age,Team,G,MP,FG,FGA,2P,2PA,3P,3PA,FT,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,FG%,2P%,3P%,FT%,TS%,eFG%,WS,ORtg,DRtg,OWS,DWS,WS/48,OBPM,DBPM,BPM,VORP,PER,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Pos
0,Bam Adebayo,43,1,2023-24,26,MIA,43,34.5,7.6,15.0,7.6,14.8,0.0,0.3,5.0,6.4,2.2,8.3,10.6,4.0,1.0,1.0,2.4,2.5,20.3,0.505,0.513,0.091,0.780,0.566,0.506,3.9,113,110,1.4,2.5,0.126,0.4,1.4,1.8,1.4,19.5,7.1,27.9,17.3,19.4,1.5,2.8,11.9,26.0,C-F
1,Grayson Allen,47,0,2023-24,28,PHO,47,32.9,4.3,8.4,1.7,3.2,2.6,5.2,1.6,1.8,0.6,3.4,4.0,3.1,0.9,0.6,1.3,2.1,12.8,0.511,0.547,0.490,0.884,0.693,0.663,4.1,131,118,2.8,1.3,0.128,0.9,-0.1,0.9,1.1,13.5,2.4,11.1,7.0,12.5,1.3,1.7,12.7,13.9,G
2,Jarrett Allen,46,0,2023-24,25,CLE,46,30.5,6.4,9.9,6.4,9.8,0.0,0.1,2.8,3.7,3.6,7.1,10.6,2.8,0.8,1.2,1.8,2.3,15.5,0.644,0.650,0.000,0.747,0.673,0.644,6.6,132,108,3.9,2.8,0.228,1.9,1.7,3.6,2.0,22.1,13.2,25.0,19.3,13.4,1.3,3.4,13.4,18.9,C-F
3,Giannis Antetokounmpo,51,1,2023-24,29,MIL,51,35.0,11.5,18.9,11.0,17.1,0.5,1.8,7.3,11.1,2.5,8.7,11.2,6.3,1.4,1.1,3.5,3.1,30.7,0.609,0.646,0.250,0.657,0.646,0.620,8.6,125,113,6.1,2.5,0.232,6.1,2.1,8.2,4.6,29.2,8.2,26.4,17.6,30.5,1.9,2.5,12.9,33.1,F-G
4,OG Anunoby,41,0,2023-24,26,"NYK,TOR",41,34.1,5.8,11.7,3.7,6.1,2.1,5.5,1.6,2.0,1.0,3.2,4.2,2.3,1.3,0.7,1.5,2.3,15.3,0.498,0.606,0.379,0.774,0.609,0.588,2.5,116,117,1.3,1.3,0.087,-0.6,0.5,-0.1,0.7,13.5,3.1,10.6,6.8,9.4,1.9,1.8,10.7,17.6,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,Dominique Wilkins,79,1,1989-90,30,ATL,80,36.1,10.1,20.9,9.4,18.6,0.7,2.3,5.7,7.1,2.7,3.8,6.5,2.5,1.6,0.6,2.2,1.8,26.7,0.484,0.504,0.322,0.807,0.556,0.502,11.1,118,112,9.3,1.8,0.184,6.5,-1.1,5.4,5.4,24.6,8.4,12.4,10.4,11.8,2.2,1.0,8.3,30.4,F-G
5160,Gerald Wilkins,80,0,1989-90,26,NYK,82,31.8,5.8,12.6,5.3,11.1,0.5,1.5,2.5,3.2,1.6,2.9,4.5,4.0,1.2,0.3,2.4,2.3,14.5,0.457,0.477,0.312,0.803,0.520,0.476,4.4,106,110,2.3,2.1,0.082,0.3,-0.6,-0.3,1.1,14.3,5.8,9.8,7.9,18.5,1.8,0.5,14.5,21.4,G-F
5161,Buck Williams,82,0,1989-90,29,POR,82,34.2,5.0,9.2,5.0,9.2,0.0,0.0,3.5,5.0,3.0,6.7,9.8,1.4,0.8,0.5,2.0,3.5,13.6,0.548,0.548,0.000,0.706,0.597,0.548,8.7,116,104,4.3,4.3,0.149,0.1,0.6,0.7,1.9,14.3,9.6,21.6,15.5,5.5,1.2,0.8,15.3,15.5,F-C
5162,Kevin Willis,51,0,1989-90,27,ATL,81,28.1,5.2,9.9,5.1,9.9,0.0,0.1,2.1,3.0,3.1,4.8,8.0,0.7,0.8,0.6,1.8,3.2,12.4,0.519,0.521,0.286,0.683,0.551,0.520,4.7,111,111,3.0,1.7,0.098,0.0,-1.8,-1.7,0.1,15.7,12.5,20.4,16.4,3.7,1.4,1.3,13.6,19.5,F-C


## Convert column names to snake case

In [4]:
# define a function that converts all column names to lowercase and replace spaces by "_"

def lowerand_(df):
    df.columns = df.columns.str.lower()
    df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
    return df

In [5]:
# apply the function

lowerand_(all_nba_teams)
lowerand_(players)

Unnamed: 0,player,gs,as,season,age,team,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,pos
0,Bam Adebayo,43,1,2023-24,26,MIA,43,34.5,7.6,15.0,7.6,14.8,0.0,0.3,5.0,6.4,2.2,8.3,10.6,4.0,1.0,1.0,2.4,2.5,20.3,0.505,0.513,0.091,0.780,0.566,0.506,3.9,113,110,1.4,2.5,0.126,0.4,1.4,1.8,1.4,19.5,7.1,27.9,17.3,19.4,1.5,2.8,11.9,26.0,C-F
1,Grayson Allen,47,0,2023-24,28,PHO,47,32.9,4.3,8.4,1.7,3.2,2.6,5.2,1.6,1.8,0.6,3.4,4.0,3.1,0.9,0.6,1.3,2.1,12.8,0.511,0.547,0.490,0.884,0.693,0.663,4.1,131,118,2.8,1.3,0.128,0.9,-0.1,0.9,1.1,13.5,2.4,11.1,7.0,12.5,1.3,1.7,12.7,13.9,G
2,Jarrett Allen,46,0,2023-24,25,CLE,46,30.5,6.4,9.9,6.4,9.8,0.0,0.1,2.8,3.7,3.6,7.1,10.6,2.8,0.8,1.2,1.8,2.3,15.5,0.644,0.650,0.000,0.747,0.673,0.644,6.6,132,108,3.9,2.8,0.228,1.9,1.7,3.6,2.0,22.1,13.2,25.0,19.3,13.4,1.3,3.4,13.4,18.9,C-F
3,Giannis Antetokounmpo,51,1,2023-24,29,MIL,51,35.0,11.5,18.9,11.0,17.1,0.5,1.8,7.3,11.1,2.5,8.7,11.2,6.3,1.4,1.1,3.5,3.1,30.7,0.609,0.646,0.250,0.657,0.646,0.620,8.6,125,113,6.1,2.5,0.232,6.1,2.1,8.2,4.6,29.2,8.2,26.4,17.6,30.5,1.9,2.5,12.9,33.1,F-G
4,OG Anunoby,41,0,2023-24,26,"NYK,TOR",41,34.1,5.8,11.7,3.7,6.1,2.1,5.5,1.6,2.0,1.0,3.2,4.2,2.3,1.3,0.7,1.5,2.3,15.3,0.498,0.606,0.379,0.774,0.609,0.588,2.5,116,117,1.3,1.3,0.087,-0.6,0.5,-0.1,0.7,13.5,3.1,10.6,6.8,9.4,1.9,1.8,10.7,17.6,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,Dominique Wilkins,79,1,1989-90,30,ATL,80,36.1,10.1,20.9,9.4,18.6,0.7,2.3,5.7,7.1,2.7,3.8,6.5,2.5,1.6,0.6,2.2,1.8,26.7,0.484,0.504,0.322,0.807,0.556,0.502,11.1,118,112,9.3,1.8,0.184,6.5,-1.1,5.4,5.4,24.6,8.4,12.4,10.4,11.8,2.2,1.0,8.3,30.4,F-G
5160,Gerald Wilkins,80,0,1989-90,26,NYK,82,31.8,5.8,12.6,5.3,11.1,0.5,1.5,2.5,3.2,1.6,2.9,4.5,4.0,1.2,0.3,2.4,2.3,14.5,0.457,0.477,0.312,0.803,0.520,0.476,4.4,106,110,2.3,2.1,0.082,0.3,-0.6,-0.3,1.1,14.3,5.8,9.8,7.9,18.5,1.8,0.5,14.5,21.4,G-F
5161,Buck Williams,82,0,1989-90,29,POR,82,34.2,5.0,9.2,5.0,9.2,0.0,0.0,3.5,5.0,3.0,6.7,9.8,1.4,0.8,0.5,2.0,3.5,13.6,0.548,0.548,0.000,0.706,0.597,0.548,8.7,116,104,4.3,4.3,0.149,0.1,0.6,0.7,1.9,14.3,9.6,21.6,15.5,5.5,1.2,0.8,15.3,15.5,F-C
5162,Kevin Willis,51,0,1989-90,27,ATL,81,28.1,5.2,9.9,5.1,9.9,0.0,0.1,2.1,3.0,3.1,4.8,8.0,0.7,0.8,0.6,1.8,3.2,12.4,0.519,0.521,0.286,0.683,0.551,0.520,4.7,111,111,3.0,1.7,0.098,0.0,-1.8,-1.7,0.1,15.7,12.5,20.4,16.4,3.7,1.4,1.3,13.6,19.5,F-C


In [6]:
# check the column names

display(all_nba_teams.columns)
display(players.columns)

Index(['season', 'first_team', 'second_team', 'third_team'], dtype='object')

Index(['player', 'gs', 'as', 'season', 'age', 'team', 'g', 'mp', 'fg', 'fga',
       '2p', '2pa', '3p', '3pa', 'ft', 'fta', 'orb', 'drb', 'trb', 'ast',
       'stl', 'blk', 'tov', 'pf', 'pts', 'fg%', '2p%', '3p%', 'ft%', 'ts%',
       'efg%', 'ws', 'ortg', 'drtg', 'ows', 'dws', 'ws/48', 'obpm', 'dbpm',
       'bpm', 'vorp', 'per', 'orb%', 'drb%', 'trb%', 'ast%', 'stl%', 'blk%',
       'tov%', 'usg%', 'pos'],
      dtype='object')

## Check for null values

In [7]:
display(all_nba_teams.isnull().sum())
display(players.isnull().sum())

season         0
first_team     0
second_team    0
third_team     0
dtype: int64

player      0
gs          0
as          0
season      0
age         0
team        0
g           0
mp          0
fg          0
fga         0
2p          0
2pa         0
3p          0
3pa         0
ft          0
fta         0
orb         0
drb         0
trb         0
ast         0
stl         0
blk         0
tov         0
pf          0
pts         0
fg%         0
2p%         0
3p%       311
ft%         0
ts%         0
efg%        0
ws          0
ortg        0
drtg        0
ows         0
dws         0
ws/48       0
obpm        0
dbpm        0
bpm         0
vorp        0
per         0
orb%        0
drb%        0
trb%        0
ast%        0
stl%        0
blk%        0
tov%        0
usg%        0
pos         0
dtype: int64

In [8]:
players[players['3p%'].isnull()]

Unnamed: 0,player,gs,as,season,age,team,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,pos
6,Deandre Ayton,35,0,2023-24,25,POR,35,32.0,6.4,11.6,6.4,11.6,0.0,0.0,1.1,1.3,3.1,7.3,10.4,1.7,1.0,0.8,1.8,2.2,13.9,0.554,0.554,,0.809,0.572,0.554,2.0,112,115,0.7,1.3,0.086,-1.2,-0.2,-1.3,0.2,16.4,10.5,27.2,18.4,8.5,1.6,2.2,12.9,18.6,C
17,Bismack Biyombo,27,0,2023-24,31,MEM,30,23.9,2.2,4.0,2.2,4.0,0.0,0.0,0.7,1.5,1.9,4.5,6.4,1.7,0.3,1.1,1.3,2.7,5.2,0.563,0.563,,0.478,0.560,0.563,1.0,109,114,0.1,0.9,0.069,-4.9,1.6,-3.3,-0.2,9.9,8.4,21.5,14.7,10.0,0.7,4.4,21.4,10.6,C-F
31,Clint Capela,47,0,2023-24,29,ATL,47,25.7,4.8,8.4,4.8,8.4,0.0,0.0,1.8,2.8,4.8,5.8,10.6,1.0,0.6,1.5,0.9,2.4,11.5,0.575,0.575,,0.649,0.595,0.575,4.1,132,117,3.0,1.1,0.164,1.2,-1.2,0.0,0.6,20.8,19.4,25.1,22.2,5.3,1.1,5.2,8.1,16.7,C
53,Daniel Gafford,45,0,2023-24,25,"DAL,WAS",46,26.3,4.6,6.6,4.6,6.6,0.0,0.0,1.9,2.7,2.9,5.1,8.0,1.5,1.0,2.1,1.0,3.2,11.1,0.689,0.689,,0.712,0.707,0.689,4.5,138,115,3.1,1.4,0.178,-0.3,1.1,0.8,0.9,20.1,11.8,21.1,16.4,7.8,1.7,6.5,11.3,14.1,C-F
106,Jakob Poeltl,42,0,2023-24,28,TOR,42,26.8,4.8,7.0,4.8,7.0,0.0,0.0,1.2,2.2,2.7,5.8,8.5,2.4,0.7,1.5,1.7,3.0,10.8,0.688,0.688,,0.538,0.679,0.688,3.2,125,115,1.9,1.4,0.137,-0.8,1.1,0.3,0.7,18.1,11.1,24.3,17.5,12.6,1.2,5.0,17.3,15.4,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5073,Horace Grant,80,0,1989-90,24,CHI,80,34.4,5.6,10.7,5.6,10.7,0.0,0.0,2.2,3.2,3.0,4.9,7.9,2.8,1.2,1.1,1.4,2.9,13.4,0.523,0.523,,0.699,0.555,0.523,8.0,118,108,5.3,2.8,0.140,1.3,-0.2,1.1,2.1,16.6,10.1,16.9,13.5,11.3,1.7,1.9,10.2,16.7,F-C
5088,Charles Jones,81,0,1989-90,32,WSB,81,27.7,1.2,2.3,1.2,2.3,0.0,0.0,0.8,1.3,1.8,4.4,6.2,1.7,0.6,2.4,0.9,3.7,3.2,0.508,0.508,,0.648,0.554,0.508,3.8,111,106,1.0,2.8,0.082,-3.1,2.7,-0.5,0.9,9.2,6.9,17.4,12.1,7.2,1.1,5.0,24.7,5.6,F-C
5117,Robert Parish,78,1,1989-90,36,BOS,79,30.3,6.4,11.0,6.4,11.0,0.0,0.0,2.9,3.9,3.3,6.8,10.1,1.3,0.5,0.9,2.1,2.4,15.7,0.580,0.580,,0.747,0.616,0.580,7.8,116,106,4.8,3.0,0.157,1.9,-0.8,1.1,1.9,19.2,12.6,23.5,18.3,6.2,0.8,1.7,14.4,20.7,C
5150,Mychal Thompson,70,0,1989-90,35,LAL,70,26.9,4.0,8.0,4.0,8.0,0.0,0.0,2.1,2.9,2.5,4.3,6.8,0.6,0.5,1.0,1.1,3.0,10.1,0.500,0.500,,0.706,0.542,0.500,4.7,112,107,2.4,2.3,0.119,-1.1,-0.8,-2.0,0.0,13.4,10.9,17.9,14.5,3.2,0.9,2.3,10.8,16.6,C-F


In [9]:
# the column 3p% is null in these rows because the player did't attempt a single 3 point shot.
# the most reasonable way to deal with these NaN values is to fill them with 0.
players['3p%'] = players['3p%'].fillna(0)

In [10]:
players.isnull().sum().sum()

0

## Drop duplicates

In [11]:
display(all_nba_teams.shape)
display(players.shape)

(170, 4)

(5164, 51)

In [12]:
# I get rid of the duplicates, if any
all_nba_teams = all_nba_teams.drop_duplicates()
players = players.drop_duplicates()

In [13]:
display(all_nba_teams.shape)
display(players.shape)

# there weren't any duplicates

(170, 4)

(5164, 51)

## Clean all_nba_teams dataframe

In [14]:
all_nba_teams

Unnamed: 0,season,first_team,second_team,third_team
0,1989–90,Karl Malone* (3),Larry Bird* (10),James Worthy*
1,1989–90,Charles Barkley* (5),Tom Chambers (2),Chris Mullin* (2)
2,1989–90,Patrick Ewing* (3),Akeem Olajuwon*[e] (5),David Robinson*
3,1989–90,Magic Johnson* (9),John Stockton* (3),Clyde Drexler* (2)
4,1989–90,Michael Jordan* (5),Kevin Johnson (2),Joe Dumars*
...,...,...,...,...
165,2022–23,Giannis Antetokounmpo^ (7),Jimmy Butler^ (5),LeBron James^ (19)
166,2022–23,Jayson Tatum^ (3),Jaylen Brown^,Julius Randle^ (2)
167,2022–23,Joel Embiid^ (5),Nikola Jokić^ (5),Domantas Sabonis^
168,2022–23,Luka Dončić^ (4),Stephen Curry^ (9),De'Aaron Fox^


In [15]:
# clean 'season' column replacing a long hypen with a hypen
all_nba_teams['season'] = all_nba_teams['season'].str.replace('–', '-')

In [16]:
# check the consistency of the column 'season'
# define a pattern that matches 4 numbers, a hypen and 2 numbers

pattern_season = r"^\d{4}+[-]{1}+\d{2}$"

# use str.contains() to check which values in the column don't match the pattern
inconsistent_values_season_col = all_nba_teams[~all_nba_teams['season'].str.contains(pattern_season, na=False)]

display(inconsistent_values_season_col)

# column 'season' is cleaned and consistent

Unnamed: 0,season,first_team,second_team,third_team


In [17]:
# make a new dataframe just with the columns that still need cleaning
teams = all_nba_teams.drop(['season'], axis=1)

In [18]:
# define a function to clean the values that contain * after the player's name
def teamsclean1(x):
    return x.split('*')[0].strip()

In [19]:
teams

Unnamed: 0,first_team,second_team,third_team
0,Karl Malone* (3),Larry Bird* (10),James Worthy*
1,Charles Barkley* (5),Tom Chambers (2),Chris Mullin* (2)
2,Patrick Ewing* (3),Akeem Olajuwon*[e] (5),David Robinson*
3,Magic Johnson* (9),John Stockton* (3),Clyde Drexler* (2)
4,Michael Jordan* (5),Kevin Johnson (2),Joe Dumars*
...,...,...,...
165,Giannis Antetokounmpo^ (7),Jimmy Butler^ (5),LeBron James^ (19)
166,Jayson Tatum^ (3),Jaylen Brown^,Julius Randle^ (2)
167,Joel Embiid^ (5),Nikola Jokić^ (5),Domantas Sabonis^
168,Luka Dončić^ (4),Stephen Curry^ (9),De'Aaron Fox^


In [20]:
# perform the fuction on all columns
for col in teams.columns:
    teams[col] = teams[col].apply(teamsclean1)

In [21]:
# define a function to clean the values that contain ^ after the player's name
def teamsclean2(x):
    return x.split('^')[0].strip()

In [22]:
# perform the fuction on all columns
for col in teams.columns:
    teams[col] = teams[col].apply(teamsclean2)

In [23]:
# check the consistency of the columns using regex library

# define a pattern that matches a name and surname
pattern = r"^[A-Za-z'-]+\s[A-Za-z'-]+$"

# use str.contains() to check which values in the column don't match the pattern
inconsistent_values = teams[~teams['first_team'].str.contains(pattern, na=False)]

display(inconsistent_values)

Unnamed: 0,first_team,second_team,third_team
19,Mark Price (3),Joe Dumars,Dražen Petrović
34,Penny Hardaway (2),John Stockton,Reggie Miller
87,Amare Stoudemire[g] (2),Yao Ming,Dwight Howard†
92,Dwight Howard† (2),Amare Stoudemire[g] (3),Yao Ming
97,Dwight Howard† (3),Yao Ming,Shaquille O'Neal
102,Dwight Howard† (4),Amar'e Stoudemire (4),Andrew Bogut
107,Dwight Howard† (5),Amar'e Stoudemire (5),Al Horford
112,Dwight Howard† (6),Andrew Bynum,Tyson Chandler§
122,Joakim Noah§,Dwight Howard† (8),Al Jefferson
127,Marc Gasol† (2),DeMarcus Cousins†,DeAndre Jordan


In [24]:
# define a function to clean the values that contain † after the player's name
def teamsclean3(x):
    return x.split('†')[0].strip()

In [25]:
# perform the fuction on all columns
for col in teams.columns:
    teams[col] = teams[col].apply(teamsclean3)

In [26]:
# define a function to clean the values that contain § after the player's name
def teamsclean4(x):
    return x.split('§')[0].strip()

In [27]:
# perform the fuction on all columns
for col in teams.columns:
    teams[col] = teams[col].apply(teamsclean4)

In [28]:
teams

Unnamed: 0,first_team,second_team,third_team
0,Karl Malone,Larry Bird,James Worthy
1,Charles Barkley,Tom Chambers (2),Chris Mullin
2,Patrick Ewing,Akeem Olajuwon,David Robinson
3,Magic Johnson,John Stockton,Clyde Drexler
4,Michael Jordan,Kevin Johnson (2),Joe Dumars
...,...,...,...
165,Giannis Antetokounmpo,Jimmy Butler,LeBron James
166,Jayson Tatum,Jaylen Brown,Julius Randle
167,Joel Embiid,Nikola Jokić,Domantas Sabonis
168,Luka Dončić,Stephen Curry,De'Aaron Fox


In [29]:
# define a function to clean the values that contain a number between () after the player's name
def teamsclean5(x):
    return x.split('(')[0].strip()

In [30]:
# perform the fuction on all columns
for col in teams.columns:
    teams[col] = teams[col].apply(teamsclean5)

In [31]:
# I check again for inconsistent values in the same column
inconsistent_values_recheck = teams[~teams['first_team'].str.contains(pattern, na=False)]

display(inconsistent_values_recheck)

Unnamed: 0,first_team,second_team,third_team
87,Amare Stoudemire[g],Yao Ming,Dwight Howard
147,Nikola Jokić,Joel Embiid,Rudy Gobert
154,Luka Dončić,Chris Paul,Russell Westbrook
157,Nikola Jokić,Joel Embiid,Rudy Gobert
159,Luka Dončić,Chris Paul,Kyrie Irving
162,Nikola Jokić,Joel Embiid,Karl-Anthony Towns
164,Luka Dončić,Ja Morant,Trae Young
168,Luka Dončić,Stephen Curry,De'Aaron Fox


Even though Jokić and Dončić don't match the pattern, the only value that needs cleaning is Amare Stoudemire[g]

In [32]:
# define a function to clean the values that contain a number between () after the player's name
def teamsclean6(x):
    return x.split('[')[0].strip()

In [33]:
# perform the fuction on all columns
for col in teams.columns:
    teams[col] = teams[col].apply(teamsclean6)

In [34]:
# check again for inconsistent values in the same column
inconsistent_values_recheck = teams[~teams['first_team'].str.contains(pattern, na=False)]

display(inconsistent_values_recheck)

# now the column is consistent and clean

Unnamed: 0,first_team,second_team,third_team
147,Nikola Jokić,Joel Embiid,Rudy Gobert
154,Luka Dončić,Chris Paul,Russell Westbrook
157,Nikola Jokić,Joel Embiid,Rudy Gobert
159,Luka Dončić,Chris Paul,Kyrie Irving
162,Nikola Jokić,Joel Embiid,Karl-Anthony Towns
164,Luka Dončić,Ja Morant,Trae Young
168,Luka Dončić,Stephen Curry,De'Aaron Fox


In [35]:
# let's do the same for the 'second_team' column
inconsistent_values_recheck = teams[~teams['second_team'].str.contains(pattern, na=False)]

display(inconsistent_values_recheck)

# the column is consistent and clean

Unnamed: 0,first_team,second_team,third_team
71,Tim Duncan,Peja Stojaković,Ron Artest
152,Anthony Davis,Nikola Jokić,Rudy Gobert
167,Joel Embiid,Nikola Jokić,Domantas Sabonis


In [36]:
# let's do the same for the 'third_team' column
inconsistent_values_recheck = teams[~teams['third_team'].str.contains(pattern, na=False)]

display(inconsistent_values_recheck)

# the column is consistent and clean

Unnamed: 0,first_team,second_team,third_team
19,Mark Price,Joe Dumars,Dražen Petrović
94,Chris Paul,Deron Williams,Manu Ginóbili
108,Kobe Bryant,Dwyane Wade,Manu Ginóbili
123,James Harden,Stephen Curry,Goran Dragić


In [37]:
# make a new dataframe with just the season column from the all_nba_teams dataframe
season = all_nba_teams[['season']]
season

Unnamed: 0,season
0,1989-90
1,1989-90
2,1989-90
3,1989-90
4,1989-90
...,...
165,2022-23
166,2022-23
167,2022-23
168,2022-23


In [38]:
# concatenate season and teams dataframes
all_nba_teams_cleaned = pd.concat([season, teams], axis=1)
all_nba_teams_cleaned

Unnamed: 0,season,first_team,second_team,third_team
0,1989-90,Karl Malone,Larry Bird,James Worthy
1,1989-90,Charles Barkley,Tom Chambers,Chris Mullin
2,1989-90,Patrick Ewing,Akeem Olajuwon,David Robinson
3,1989-90,Magic Johnson,John Stockton,Clyde Drexler
4,1989-90,Michael Jordan,Kevin Johnson,Joe Dumars
...,...,...,...,...
165,2022-23,Giannis Antetokounmpo,Jimmy Butler,LeBron James
166,2022-23,Jayson Tatum,Jaylen Brown,Julius Randle
167,2022-23,Joel Embiid,Nikola Jokić,Domantas Sabonis
168,2022-23,Luka Dončić,Stephen Curry,De'Aaron Fox


## Check dtypes

In [39]:
all_nba_teams_cleaned.dtypes

season         object
first_team     object
second_team    object
third_team     object
dtype: object

`all_nba_teams_cleaned` dataframe is cleaned and consistent

## Clean players dataframe

In [40]:
players

Unnamed: 0,player,gs,as,season,age,team,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,pos
0,Bam Adebayo,43,1,2023-24,26,MIA,43,34.5,7.6,15.0,7.6,14.8,0.0,0.3,5.0,6.4,2.2,8.3,10.6,4.0,1.0,1.0,2.4,2.5,20.3,0.505,0.513,0.091,0.780,0.566,0.506,3.9,113,110,1.4,2.5,0.126,0.4,1.4,1.8,1.4,19.5,7.1,27.9,17.3,19.4,1.5,2.8,11.9,26.0,C-F
1,Grayson Allen,47,0,2023-24,28,PHO,47,32.9,4.3,8.4,1.7,3.2,2.6,5.2,1.6,1.8,0.6,3.4,4.0,3.1,0.9,0.6,1.3,2.1,12.8,0.511,0.547,0.490,0.884,0.693,0.663,4.1,131,118,2.8,1.3,0.128,0.9,-0.1,0.9,1.1,13.5,2.4,11.1,7.0,12.5,1.3,1.7,12.7,13.9,G
2,Jarrett Allen,46,0,2023-24,25,CLE,46,30.5,6.4,9.9,6.4,9.8,0.0,0.1,2.8,3.7,3.6,7.1,10.6,2.8,0.8,1.2,1.8,2.3,15.5,0.644,0.650,0.000,0.747,0.673,0.644,6.6,132,108,3.9,2.8,0.228,1.9,1.7,3.6,2.0,22.1,13.2,25.0,19.3,13.4,1.3,3.4,13.4,18.9,C-F
3,Giannis Antetokounmpo,51,1,2023-24,29,MIL,51,35.0,11.5,18.9,11.0,17.1,0.5,1.8,7.3,11.1,2.5,8.7,11.2,6.3,1.4,1.1,3.5,3.1,30.7,0.609,0.646,0.250,0.657,0.646,0.620,8.6,125,113,6.1,2.5,0.232,6.1,2.1,8.2,4.6,29.2,8.2,26.4,17.6,30.5,1.9,2.5,12.9,33.1,F-G
4,OG Anunoby,41,0,2023-24,26,"NYK,TOR",41,34.1,5.8,11.7,3.7,6.1,2.1,5.5,1.6,2.0,1.0,3.2,4.2,2.3,1.3,0.7,1.5,2.3,15.3,0.498,0.606,0.379,0.774,0.609,0.588,2.5,116,117,1.3,1.3,0.087,-0.6,0.5,-0.1,0.7,13.5,3.1,10.6,6.8,9.4,1.9,1.8,10.7,17.6,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,Dominique Wilkins,79,1,1989-90,30,ATL,80,36.1,10.1,20.9,9.4,18.6,0.7,2.3,5.7,7.1,2.7,3.8,6.5,2.5,1.6,0.6,2.2,1.8,26.7,0.484,0.504,0.322,0.807,0.556,0.502,11.1,118,112,9.3,1.8,0.184,6.5,-1.1,5.4,5.4,24.6,8.4,12.4,10.4,11.8,2.2,1.0,8.3,30.4,F-G
5160,Gerald Wilkins,80,0,1989-90,26,NYK,82,31.8,5.8,12.6,5.3,11.1,0.5,1.5,2.5,3.2,1.6,2.9,4.5,4.0,1.2,0.3,2.4,2.3,14.5,0.457,0.477,0.312,0.803,0.520,0.476,4.4,106,110,2.3,2.1,0.082,0.3,-0.6,-0.3,1.1,14.3,5.8,9.8,7.9,18.5,1.8,0.5,14.5,21.4,G-F
5161,Buck Williams,82,0,1989-90,29,POR,82,34.2,5.0,9.2,5.0,9.2,0.0,0.0,3.5,5.0,3.0,6.7,9.8,1.4,0.8,0.5,2.0,3.5,13.6,0.548,0.548,0.000,0.706,0.597,0.548,8.7,116,104,4.3,4.3,0.149,0.1,0.6,0.7,1.9,14.3,9.6,21.6,15.5,5.5,1.2,0.8,15.3,15.5,F-C
5162,Kevin Willis,51,0,1989-90,27,ATL,81,28.1,5.2,9.9,5.1,9.9,0.0,0.1,2.1,3.0,3.1,4.8,8.0,0.7,0.8,0.6,1.8,3.2,12.4,0.519,0.521,0.286,0.683,0.551,0.520,4.7,111,111,3.0,1.7,0.098,0.0,-1.8,-1.7,0.1,15.7,12.5,20.4,16.4,3.7,1.4,1.3,13.6,19.5,F-C


In [41]:
# split players dataframe into categorical and numerical data
players.dtypes

player     object
gs          int64
as          int64
season     object
age         int64
team       object
g           int64
mp        float64
fg        float64
fga       float64
2p        float64
2pa       float64
3p        float64
3pa       float64
ft        float64
fta       float64
orb       float64
drb       float64
trb       float64
ast       float64
stl       float64
blk       float64
tov       float64
pf        float64
pts       float64
fg%       float64
2p%       float64
3p%       float64
ft%       float64
ts%       float64
efg%      float64
ws        float64
ortg        int64
drtg        int64
ows       float64
dws       float64
ws/48     float64
obpm      float64
dbpm      float64
bpm       float64
vorp      float64
per       float64
orb%      float64
drb%      float64
trb%      float64
ast%      float64
stl%      float64
blk%      float64
tov%      float64
usg%      float64
pos        object
dtype: object

In [42]:
players_num = players.select_dtypes(exclude='object')
players_cat = players.select_dtypes('object')

display(players_num)
display(players_cat)

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
0,43,1,26,43,34.5,7.6,15.0,7.6,14.8,0.0,0.3,5.0,6.4,2.2,8.3,10.6,4.0,1.0,1.0,2.4,2.5,20.3,0.505,0.513,0.091,0.780,0.566,0.506,3.9,113,110,1.4,2.5,0.126,0.4,1.4,1.8,1.4,19.5,7.1,27.9,17.3,19.4,1.5,2.8,11.9,26.0
1,47,0,28,47,32.9,4.3,8.4,1.7,3.2,2.6,5.2,1.6,1.8,0.6,3.4,4.0,3.1,0.9,0.6,1.3,2.1,12.8,0.511,0.547,0.490,0.884,0.693,0.663,4.1,131,118,2.8,1.3,0.128,0.9,-0.1,0.9,1.1,13.5,2.4,11.1,7.0,12.5,1.3,1.7,12.7,13.9
2,46,0,25,46,30.5,6.4,9.9,6.4,9.8,0.0,0.1,2.8,3.7,3.6,7.1,10.6,2.8,0.8,1.2,1.8,2.3,15.5,0.644,0.650,0.000,0.747,0.673,0.644,6.6,132,108,3.9,2.8,0.228,1.9,1.7,3.6,2.0,22.1,13.2,25.0,19.3,13.4,1.3,3.4,13.4,18.9
3,51,1,29,51,35.0,11.5,18.9,11.0,17.1,0.5,1.8,7.3,11.1,2.5,8.7,11.2,6.3,1.4,1.1,3.5,3.1,30.7,0.609,0.646,0.250,0.657,0.646,0.620,8.6,125,113,6.1,2.5,0.232,6.1,2.1,8.2,4.6,29.2,8.2,26.4,17.6,30.5,1.9,2.5,12.9,33.1
4,41,0,26,41,34.1,5.8,11.7,3.7,6.1,2.1,5.5,1.6,2.0,1.0,3.2,4.2,2.3,1.3,0.7,1.5,2.3,15.3,0.498,0.606,0.379,0.774,0.609,0.588,2.5,116,117,1.3,1.3,0.087,-0.6,0.5,-0.1,0.7,13.5,3.1,10.6,6.8,9.4,1.9,1.8,10.7,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,79,1,30,80,36.1,10.1,20.9,9.4,18.6,0.7,2.3,5.7,7.1,2.7,3.8,6.5,2.5,1.6,0.6,2.2,1.8,26.7,0.484,0.504,0.322,0.807,0.556,0.502,11.1,118,112,9.3,1.8,0.184,6.5,-1.1,5.4,5.4,24.6,8.4,12.4,10.4,11.8,2.2,1.0,8.3,30.4
5160,80,0,26,82,31.8,5.8,12.6,5.3,11.1,0.5,1.5,2.5,3.2,1.6,2.9,4.5,4.0,1.2,0.3,2.4,2.3,14.5,0.457,0.477,0.312,0.803,0.520,0.476,4.4,106,110,2.3,2.1,0.082,0.3,-0.6,-0.3,1.1,14.3,5.8,9.8,7.9,18.5,1.8,0.5,14.5,21.4
5161,82,0,29,82,34.2,5.0,9.2,5.0,9.2,0.0,0.0,3.5,5.0,3.0,6.7,9.8,1.4,0.8,0.5,2.0,3.5,13.6,0.548,0.548,0.000,0.706,0.597,0.548,8.7,116,104,4.3,4.3,0.149,0.1,0.6,0.7,1.9,14.3,9.6,21.6,15.5,5.5,1.2,0.8,15.3,15.5
5162,51,0,27,81,28.1,5.2,9.9,5.1,9.9,0.0,0.1,2.1,3.0,3.1,4.8,8.0,0.7,0.8,0.6,1.8,3.2,12.4,0.519,0.521,0.286,0.683,0.551,0.520,4.7,111,111,3.0,1.7,0.098,0.0,-1.8,-1.7,0.1,15.7,12.5,20.4,16.4,3.7,1.4,1.3,13.6,19.5


Unnamed: 0,player,season,team,pos
0,Bam Adebayo,2023-24,MIA,C-F
1,Grayson Allen,2023-24,PHO,G
2,Jarrett Allen,2023-24,CLE,C-F
3,Giannis Antetokounmpo,2023-24,MIL,F-G
4,OG Anunoby,2023-24,"NYK,TOR",F
...,...,...,...,...
5159,Dominique Wilkins,1989-90,ATL,F-G
5160,Gerald Wilkins,1989-90,NYK,G-F
5161,Buck Williams,1989-90,POR,F-C
5162,Kevin Willis,1989-90,ATL,F-C


### Clean numerical columns

In [43]:
# check the consistency of the numerical variables
players_num.describe()

# I check if maximum and minimum values of the variables are reasonable

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
count,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0,5164.0
mean,63.618125,0.167312,26.953718,70.663826,31.2616,5.363555,11.498974,4.463187,9.005984,0.900426,2.492622,2.717157,3.54361,1.501433,4.18445,5.684586,3.234431,1.02651,0.669655,1.926859,2.5,14.345178,0.468782,0.495504,3.756639,0.952009,0.547222,0.506911,5.432494,109.431448,107.25,3.084314,2.348799,0.114113,0.712548,0.09969,0.811871,1.710244,16.245159,5.590453,15.250852,10.426143,16.234624,1.654473,1.678079,13.145023,20.755403
std,14.610237,0.37329,4.009816,11.378717,5.260753,2.034352,4.280671,1.977264,3.871232,0.883101,2.301389,1.633738,2.033145,0.986888,1.866628,2.661658,2.244887,0.456782,0.63228,0.781485,0.600903,5.733363,0.052533,0.052041,58.926442,13.905537,0.044697,0.04746,3.202642,7.788119,5.074007,2.502943,1.271404,0.054116,2.382868,1.188275,2.771453,1.799998,4.269401,3.716025,6.238129,4.678882,10.278613,0.636465,1.563729,3.447005,5.375671
min,25.0,0.0,19.0,25.0,12.5,0.7,1.6,0.3,0.6,0.0,0.0,0.1,0.2,0.1,0.7,1.0,0.1,0.1,0.0,0.3,0.8,1.9,0.319,0.323,0.0,0.262,0.37,0.323,-2.1,80.0,87.0,-3.3,-1.0,-0.06,-6.5,-3.9,-7.4,-2.6,3.0,0.4,3.2,2.6,1.0,0.3,0.0,4.9,5.3
25%,51.0,0.0,24.0,65.0,27.9,3.9,8.3,3.0,6.1,0.0,0.2,1.5,2.1,0.7,2.8,3.6,1.5,0.7,0.2,1.3,2.1,10.1,0.433,0.461,0.2,0.71,0.518,0.476,3.2,104.0,104.0,1.3,1.4,0.078,-0.9,-0.7,-1.0,0.5,13.3,2.5,10.1,6.5,8.5,1.2,0.6,10.7,17.0
50%,66.0,0.0,27.0,74.0,31.8,5.2,11.2,4.2,8.7,0.8,2.3,2.4,3.1,1.2,3.8,5.1,2.6,0.95,0.5,1.8,2.5,13.7,0.462,0.49,0.333,0.775,0.545,0.503,4.9,109.0,107.0,2.7,2.2,0.11,0.5,0.0,0.5,1.4,15.8,4.5,14.0,9.3,13.2,1.6,1.1,12.8,20.4
75%,77.0,0.0,30.0,80.0,35.2,6.7,14.5,5.7,11.5,1.5,4.2,3.6,4.6,2.2,5.2,7.3,4.4,1.3,0.9,2.4,2.9,18.1,0.497,0.523,0.375,0.826,0.575,0.534,7.2,114.0,111.0,4.3,3.1,0.147,2.1,0.9,2.3,2.5,18.7,8.3,19.7,14.0,22.1,2.0,2.2,15.2,24.3
max,83.0,1.0,40.0,85.0,43.7,12.7,27.8,12.1,23.4,5.3,13.2,10.5,13.1,6.8,12.3,18.7,14.5,3.0,4.6,5.7,4.6,36.1,0.763,0.766,1000.0,1000.0,0.745,0.763,20.4,148.0,123.0,14.9,9.1,0.322,10.3,4.6,14.0,11.8,34.2,20.5,38.0,26.7,57.5,4.7,9.8,31.9,41.7


In [44]:
players[players['g']==85]

# the number of games played are 85 because these players were traded during the season to a team that had played less
# games than the origin team by the trade day, so they were able to appear in 85 games. In the seasons for which I have data,
# the maximum number of regular season games is 82.

# both players were traded from ATL to POR during the 2003-04 season

Unnamed: 0,player,gs,as,season,age,team,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%,pos
3031,Shareef Abdur-Rahim,56,0,2003-04,27,"ATL,POR",85,31.6,5.9,12.4,5.8,12.0,0.1,0.4,4.4,5.0,2.2,5.3,7.5,2.0,0.8,0.4,2.2,2.6,16.3,0.475,0.482,0.265,0.869,0.557,0.48,8.1,110,105,6.1,2.0,0.145,2.3,-0.7,1.6,2.4,19.9,8.2,19.0,13.6,12.3,1.4,1.0,12.9,24.4,F
3144,Theo Ratliff,83,0,2003-04,30,"ATL,POR",85,31.3,3.1,6.5,3.1,6.5,0.0,0.0,1.6,2.6,2.3,4.9,7.2,0.8,0.6,3.6,1.4,3.5,7.9,0.485,0.485,0.0,0.645,0.521,0.485,5.2,102,101,1.8,3.4,0.094,-2.0,2.3,0.3,1.5,14.4,8.6,17.9,13.3,4.3,1.1,8.6,15.7,13.2,C-F


In [45]:
# I saw a strange maximum value of 1000 in the 3p% column
players_num[players_num['3p%']>1]

# none of these players attempted a 3 pointer in that season. The value 1000 it's clearly an error.

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
426,67,0,26,68,29.0,6.0,9.8,6.0,9.7,0.0,0.0,1.4,2.8,3.9,5.5,9.3,2.8,0.7,1.7,1.6,3.1,13.5,0.618,0.618,1000.0,0.495,0.613,0.619,6.9,123,110,4.4,2.5,0.168,0.9,0.7,1.6,1.7,20.8,13.9,20.0,16.9,14.0,1.1,5.0,12.7,18.3
1055,54,0,28,54,25.3,5.8,10.7,5.7,10.7,0.0,0.0,2.4,3.4,3.2,8.2,11.4,1.0,0.7,1.7,1.7,2.4,14.0,0.54,0.538,1000.0,0.703,0.573,0.542,5.3,111,99,2.3,3.0,0.187,1.5,1.1,2.6,1.6,24.1,14.3,36.6,25.4,6.9,1.4,5.9,12.3,24.7
1222,66,0,31,70,20.7,2.5,4.0,2.5,4.0,0.0,0.0,0.3,0.7,1.7,5.3,7.0,2.3,0.5,1.6,1.2,3.2,5.4,0.627,0.626,1000.0,0.48,0.623,0.629,5.4,120,99,2.6,2.9,0.18,-0.9,3.2,2.4,1.6,15.9,9.5,26.0,18.3,14.8,1.1,5.8,21.6,11.4
1510,45,0,22,62,24.0,2.8,6.0,2.8,6.0,0.0,0.0,2.0,2.6,1.6,4.3,5.8,1.6,0.5,0.8,1.0,2.5,7.6,0.461,0.46,1000.0,0.774,0.53,0.462,3.8,108,102,1.6,2.2,0.123,-2.0,1.4,-0.6,0.5,14.1,7.0,20.1,13.4,10.9,1.2,2.6,12.2,15.6
1567,80,0,29,81,32.8,5.6,10.4,5.6,10.4,0.0,0.0,1.9,2.8,2.5,7.0,9.5,1.7,0.5,1.5,1.6,2.5,13.2,0.542,0.541,1000.0,0.686,0.568,0.542,8.1,112,103,4.1,4.0,0.146,0.2,0.6,0.8,1.8,17.6,8.7,25.1,16.7,8.4,0.8,3.8,11.8,18.2
2084,37,0,38,52,22.7,1.8,3.6,1.8,3.6,0.0,0.0,0.4,0.6,1.4,4.3,5.8,1.2,0.6,0.8,0.8,3.2,4.1,0.511,0.508,1000.0,0.625,0.527,0.513,3.2,108,98,0.7,2.6,0.131,-3.4,2.8,-0.6,0.4,10.0,7.7,21.9,15.0,7.4,1.4,2.7,17.3,9.4
2178,81,1,23,81,35.1,5.8,10.5,5.8,10.4,0.0,0.0,2.6,3.3,2.9,7.0,9.9,2.3,0.7,1.1,1.5,2.8,14.2,0.551,0.551,1000.0,0.789,0.594,0.552,10.9,121,105,6.9,3.9,0.183,1.8,0.5,2.3,3.1,19.4,9.6,23.3,16.4,10.4,1.1,2.4,11.2,17.6
2360,77,1,28,77,33.6,7.4,13.4,7.3,13.4,0.0,0.0,4.9,5.7,2.6,7.2,9.9,1.8,0.4,1.9,3.0,3.3,19.7,0.548,0.548,1000.0,0.866,0.618,0.549,10.6,114,101,5.6,5.0,0.196,1.6,1.6,3.1,3.3,22.7,9.6,24.1,17.1,10.0,0.6,4.2,16.0,26.2
2455,61,0,27,81,23.2,1.9,4.3,1.9,4.3,0.0,0.0,1.5,3.2,2.8,4.8,7.5,0.8,1.1,0.1,1.3,2.6,5.2,0.439,0.438,1000.0,0.467,0.463,0.441,2.7,97,103,-0.3,3.0,0.068,-3.0,0.0,-3.0,-0.5,11.0,14.0,24.7,19.3,5.0,2.4,0.3,19.0,13.6
2584,80,0,27,80,38.5,8.1,15.1,8.1,15.1,0.0,0.0,4.4,5.8,3.4,6.0,9.3,2.9,1.0,2.2,2.5,3.0,20.5,0.533,0.532,1000.0,0.761,0.581,0.533,11.5,114,103,7.2,4.3,0.18,3.4,0.8,4.2,4.8,23.1,10.5,18.3,14.5,14.7,1.3,4.5,12.5,24.3


In [46]:
# replace value 1000 with 0 in the column '3p%'
players_num['3p%'].replace(1000, 0, inplace=True)

In [47]:
# check if it's done correctly
players_num[players_num['3p%']>1]

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%


In [48]:
# I saw an unreasonable maximum value of 1000 in the ft% column
players_num[players_num['ft%']>1]

# this player made all the free throws he attempted so his ft% value should be 1.0

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
731,57,0,28,59,27.8,2.9,6.5,1.2,2.2,1.7,4.3,0.5,0.5,0.2,1.7,1.9,2.2,0.5,0.3,0.5,1.7,8.0,0.445,0.531,0.402,1000.0,0.598,0.579,2.6,121,117,2.1,0.5,0.076,-1.2,-0.6,-1.8,0.1,9.7,0.6,7.1,3.8,10.9,1.0,0.8,7.3,11.4


In [49]:
# replace value 1000 with 0 in the column 'ft%'
players_num['ft%'].replace(1000, 1, inplace=True)

In [50]:
# check if it's done correctly
players_num[players_num['ft%']>=1]

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
731,57,0,28,59,27.8,2.9,6.5,1.2,2.2,1.7,4.3,0.5,0.5,0.2,1.7,1.9,2.2,0.5,0.3,0.5,1.7,8.0,0.445,0.531,0.402,1.0,0.598,0.579,2.6,121,117,2.1,0.5,0.076,-1.2,-0.6,-1.8,0.1,9.7,0.6,7.1,3.8,10.9,1.0,0.8,7.3,11.4


In [51]:
players_num.columns

Index(['gs', 'as', 'age', 'g', 'mp', 'fg', 'fga', '2p', '2pa', '3p', '3pa',
       'ft', 'fta', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'fg%', '2p%', '3p%', 'ft%', 'ts%', 'efg%', 'ws', 'ortg', 'drtg',
       'ows', 'dws', 'ws/48', 'obpm', 'dbpm', 'bpm', 'vorp', 'per', 'orb%',
       'drb%', 'trb%', 'ast%', 'stl%', 'blk%', 'tov%', 'usg%'],
      dtype='object')

In [52]:
# transform 'fg%', '2p%', '3p%', 'ft%', 'ts%' and 'efg%' multiplying each column by 100.
list_percent = ['fg%', '2p%', '3p%', 'ft%', 'ts%','efg%']

for col in list_percent:
    players_num[col] = players_num[col]*100

In [53]:
players_num

Unnamed: 0,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
0,43,1,26,43,34.5,7.6,15.0,7.6,14.8,0.0,0.3,5.0,6.4,2.2,8.3,10.6,4.0,1.0,1.0,2.4,2.5,20.3,50.5,51.3,9.1,78.0,56.6,50.6,3.9,113,110,1.4,2.5,0.126,0.4,1.4,1.8,1.4,19.5,7.1,27.9,17.3,19.4,1.5,2.8,11.9,26.0
1,47,0,28,47,32.9,4.3,8.4,1.7,3.2,2.6,5.2,1.6,1.8,0.6,3.4,4.0,3.1,0.9,0.6,1.3,2.1,12.8,51.1,54.7,49.0,88.4,69.3,66.3,4.1,131,118,2.8,1.3,0.128,0.9,-0.1,0.9,1.1,13.5,2.4,11.1,7.0,12.5,1.3,1.7,12.7,13.9
2,46,0,25,46,30.5,6.4,9.9,6.4,9.8,0.0,0.1,2.8,3.7,3.6,7.1,10.6,2.8,0.8,1.2,1.8,2.3,15.5,64.4,65.0,0.0,74.7,67.3,64.4,6.6,132,108,3.9,2.8,0.228,1.9,1.7,3.6,2.0,22.1,13.2,25.0,19.3,13.4,1.3,3.4,13.4,18.9
3,51,1,29,51,35.0,11.5,18.9,11.0,17.1,0.5,1.8,7.3,11.1,2.5,8.7,11.2,6.3,1.4,1.1,3.5,3.1,30.7,60.9,64.6,25.0,65.7,64.6,62.0,8.6,125,113,6.1,2.5,0.232,6.1,2.1,8.2,4.6,29.2,8.2,26.4,17.6,30.5,1.9,2.5,12.9,33.1
4,41,0,26,41,34.1,5.8,11.7,3.7,6.1,2.1,5.5,1.6,2.0,1.0,3.2,4.2,2.3,1.3,0.7,1.5,2.3,15.3,49.8,60.6,37.9,77.4,60.9,58.8,2.5,116,117,1.3,1.3,0.087,-0.6,0.5,-0.1,0.7,13.5,3.1,10.6,6.8,9.4,1.9,1.8,10.7,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,79,1,30,80,36.1,10.1,20.9,9.4,18.6,0.7,2.3,5.7,7.1,2.7,3.8,6.5,2.5,1.6,0.6,2.2,1.8,26.7,48.4,50.4,32.2,80.7,55.6,50.2,11.1,118,112,9.3,1.8,0.184,6.5,-1.1,5.4,5.4,24.6,8.4,12.4,10.4,11.8,2.2,1.0,8.3,30.4
5160,80,0,26,82,31.8,5.8,12.6,5.3,11.1,0.5,1.5,2.5,3.2,1.6,2.9,4.5,4.0,1.2,0.3,2.4,2.3,14.5,45.7,47.7,31.2,80.3,52.0,47.6,4.4,106,110,2.3,2.1,0.082,0.3,-0.6,-0.3,1.1,14.3,5.8,9.8,7.9,18.5,1.8,0.5,14.5,21.4
5161,82,0,29,82,34.2,5.0,9.2,5.0,9.2,0.0,0.0,3.5,5.0,3.0,6.7,9.8,1.4,0.8,0.5,2.0,3.5,13.6,54.8,54.8,0.0,70.6,59.7,54.8,8.7,116,104,4.3,4.3,0.149,0.1,0.6,0.7,1.9,14.3,9.6,21.6,15.5,5.5,1.2,0.8,15.3,15.5
5162,51,0,27,81,28.1,5.2,9.9,5.1,9.9,0.0,0.1,2.1,3.0,3.1,4.8,8.0,0.7,0.8,0.6,1.8,3.2,12.4,51.9,52.1,28.6,68.3,55.1,52.0,4.7,111,111,3.0,1.7,0.098,0.0,-1.8,-1.7,0.1,15.7,12.5,20.4,16.4,3.7,1.4,1.3,13.6,19.5


players_num is consistent and clean

### Clean categorical columns

In [54]:
players_cat

Unnamed: 0,player,season,team,pos
0,Bam Adebayo,2023-24,MIA,C-F
1,Grayson Allen,2023-24,PHO,G
2,Jarrett Allen,2023-24,CLE,C-F
3,Giannis Antetokounmpo,2023-24,MIL,F-G
4,OG Anunoby,2023-24,"NYK,TOR",F
...,...,...,...,...
5159,Dominique Wilkins,1989-90,ATL,F-G
5160,Gerald Wilkins,1989-90,NYK,G-F
5161,Buck Williams,1989-90,POR,F-C
5162,Kevin Willis,1989-90,ATL,F-C


In [55]:
# check the consistency of the column using regex library.

# use str.contains() to check which values in the column don't match the pattern for player names
inconsistent_values_playername = players_cat[~players_cat['player'].str.contains(pattern, na=False)]

display(inconsistent_values_playername)

Unnamed: 0,player,season,team,pos
19,Bojan Bogdanović,2023-24,"DET,NYK",G-F
43,Luka Dončić,2023-24,DAL,G-F
75,Jaren Jackson Jr.,2023-24,MEM,F
80,Nikola Jokić,2023-24,DEN,C-F
81,Derrick Jones Jr.,2023-24,DAL,F
...,...,...,...,...
4838,Dražen Petrović,1991-92,NJN,G
4978,J.R. Reid,1990-91,CHH,F
5041,Joe Barry Carroll,1989-90,"DEN,NJN",C-F
5075,A.C. Green,1989-90,LAL,F-C


In [56]:
# make a dataframe with just the unique values of the 'player' column of the inconsistent_values_playername dataframe
inconsistent = inconsistent_values_playername['player'].value_counts().to_frame().reset_index(drop=False)
inconsistent = inconsistent.drop(['count'], axis=1)
inconsistent

Unnamed: 0,player
0,P.J. Brown
1,Metta World Peace
2,Nikola Vučević
3,Jonas Valančiūnas
4,Rasho Nesterović
...,...
86,Donatas Motiejūnas
87,Greivis Vásquez
88,Donté Greene
89,Francisco García


In [57]:
# make a dataframe for each all nba team from the all_nba_teams_cleaned dataframe 
first = all_nba_teams_cleaned['first_team'].to_frame()
second = all_nba_teams_cleaned['second_team'].to_frame()
third = all_nba_teams_cleaned['third_team'].to_frame()

# rename the unique column of each dataframe in order to perform a vertical merge
first.rename(columns={'first_team': 'all_nba_teams_player'}, inplace=True)
second.rename(columns={'second_team': 'all_nba_teams_player'}, inplace=True)
third.rename(columns={'third_team': 'all_nba_teams_player'}, inplace=True)

display(first.head())
display(second.head())
display(third.head())

Unnamed: 0,all_nba_teams_player
0,Karl Malone
1,Charles Barkley
2,Patrick Ewing
3,Magic Johnson
4,Michael Jordan


Unnamed: 0,all_nba_teams_player
0,Larry Bird
1,Tom Chambers
2,Akeem Olajuwon
3,John Stockton
4,Kevin Johnson


Unnamed: 0,all_nba_teams_player
0,James Worthy
1,Chris Mullin
2,David Robinson
3,Clyde Drexler
4,Joe Dumars


In [58]:
# merge dataframes vertically
concat_teams = pd.concat([first, second, third], axis=0, ignore_index=True).reset_index(drop=True)
concat_teams

Unnamed: 0,all_nba_teams_player
0,Karl Malone
1,Charles Barkley
2,Patrick Ewing
3,Magic Johnson
4,Michael Jordan
...,...
505,LeBron James
506,Julius Randle
507,Domantas Sabonis
508,De'Aaron Fox


In [59]:
# make a dataframe with just the unique values of the 'all_nba_teams_player' column 
concat_all_nba_player = concat_teams['all_nba_teams_player'].value_counts().to_frame().reset_index(drop=False)

# drop the 'count' column
concat_all_nba_player = concat_all_nba_player.drop(['count'], axis=1)

concat_all_nba_player

Unnamed: 0,all_nba_teams_player
0,LeBron James
1,Tim Duncan
2,Kobe Bryant
3,Shaquille O'Neal
4,Karl Malone
...,...
136,Detlef Schrempf
137,Juwan Howard
138,Anthony Mason
139,Antonio McDyess


In [60]:
# check the matches of player names between inconsistent and concat_all_nba_player dataframes
matches_inconsistent = pd.merge(left = concat_all_nba_player,
                                 right = inconsistent,
                                 how = 'inner', 
                                 left_on = "all_nba_teams_player", 
                                 right_on= "player")
matches_inconsistent

Unnamed: 0,all_nba_teams_player,player
0,Nikola Jokić,Nikola Jokić
1,Luka Dončić,Luka Dončić
2,Manu Ginóbili,Manu Ginóbili
3,Goran Dragić,Goran Dragić
4,Peja Stojaković,Peja Stojaković
5,Dražen Petrović,Dražen Petrović


They are all consistent, it's just that their names contain characters that were not take into account in the regex pattern

In [61]:
# check the matches of player names between players_cat and concat_all_nba_player dataframes
matches_general = pd.merge(left = concat_all_nba_player,
                                 right = players_cat,
                                 how = 'inner', 
                                 left_on = "all_nba_teams_player", 
                                 right_on= "player")
matches_general

Unnamed: 0,all_nba_teams_player,player,season,team,pos
0,LeBron James,LeBron James,2023-24,LAL,F-G
1,LeBron James,LeBron James,2022-23,LAL,F-G
2,LeBron James,LeBron James,2021-22,LAL,F-G
3,LeBron James,LeBron James,2020-21,LAL,F-G
4,LeBron James,LeBron James,2019-20,LAL,F-G
...,...,...,...,...,...
1398,De'Aaron Fox,De'Aaron Fox,2021-22,SAC,G
1399,De'Aaron Fox,De'Aaron Fox,2020-21,SAC,G
1400,De'Aaron Fox,De'Aaron Fox,2019-20,SAC,G
1401,De'Aaron Fox,De'Aaron Fox,2018-19,SAC,G


In [62]:
# all player names that match between concat_all_nba_player and player_cat dataframes

matches_general_player = matches_general['player'].value_counts().to_frame().reset_index(drop=False)
matches_general_player = matches_general_player.drop(['count'], axis=1)
matches_general_player.sort_values(by='player')

Unnamed: 0,player
34,Al Horford
75,Al Jefferson
38,Allen Iverson
62,Alonzo Mourning
64,Amar'e Stoudemire
...,...
118,Victor Oladipo
102,Vin Baker
25,Vince Carter
110,Yao Ming


In [63]:
pd.set_option('display.max_rows', None)

concat_all_nba_player.sort_values(by='all_nba_teams_player')

Unnamed: 0,all_nba_teams_player
124,Akeem Olajuwon
96,Al Horford
100,Al Jefferson
18,Allen Iverson
85,Alonzo Mourning
69,Amar'e Stoudemire
65,Amare Stoudemire
103,Andre Drummond
93,Andrew Bogut
117,Andrew Bynum


In [64]:
# every_player = players_cat['player'].value_counts().to_frame().reset_index(drop=False)

In [65]:
# every_player = every_player.drop(['count'], axis=1).sort_values(by='player')
# every_player

I noticed that in one dataframe 'Hakeem Olajuwon' is written wrongly as 'Akeem Olajuwon' and "Amar'e Stoudemire" is written incorrectly as "Amare Stoudemire"

In [66]:
# replace in all_nba_teams_cleaned 'Akeem Olajuwon' with 'Hakeem Olajuwon'
for col in all_nba_teams_cleaned.columns:
    all_nba_teams_cleaned[col].replace('Akeem Olajuwon', 'Hakeem Olajuwon', inplace=True)
    
# also replace in the vertically concatenated datadrame to check consistency again
for col in concat_all_nba_player.columns:
    concat_all_nba_player[col].replace('Akeem Olajuwon', 'Hakeem Olajuwon', inplace=True)

In [67]:
# replace in all_nba_teams_cleaned "Amare Stoudemire" with "Amar'e Stoudemire"
for col in all_nba_teams_cleaned.columns:
    all_nba_teams_cleaned[col].replace("Amare Stoudemire", "Amar'e Stoudemire", inplace=True)
    
# also replace in the vertically concatenated datadrame to check consistency again
for col in concat_all_nba_player.columns:
    concat_all_nba_player[col].replace("Amare Stoudemire", "Amar'e Stoudemire", inplace=True)

In [68]:
all_nba_teams_cleaned

Unnamed: 0,season,first_team,second_team,third_team
0,1989-90,Karl Malone,Larry Bird,James Worthy
1,1989-90,Charles Barkley,Tom Chambers,Chris Mullin
2,1989-90,Patrick Ewing,Hakeem Olajuwon,David Robinson
3,1989-90,Magic Johnson,John Stockton,Clyde Drexler
4,1989-90,Michael Jordan,Kevin Johnson,Joe Dumars
5,1990-91,Karl Malone,Dominique Wilkins,James Worthy
6,1990-91,Charles Barkley,Chris Mullin,Bernard King
7,1990-91,David Robinson,Patrick Ewing,Hakeem Olajuwon
8,1990-91,Michael Jordan,Kevin Johnson,John Stockton
9,1990-91,Magic Johnson,Clyde Drexler,Joe Dumars


In [69]:
players_cat

Unnamed: 0,player,season,team,pos
0,Bam Adebayo,2023-24,MIA,C-F
1,Grayson Allen,2023-24,PHO,G
2,Jarrett Allen,2023-24,CLE,C-F
3,Giannis Antetokounmpo,2023-24,MIL,F-G
4,OG Anunoby,2023-24,"NYK,TOR",F
5,Deni Avdija,2023-24,WAS,G-F
6,Deandre Ayton,2023-24,POR,C
7,Paolo Banchero,2023-24,ORL,F
8,Desmond Bane,2023-24,MEM,G
9,Harrison Barnes,2023-24,SAC,F


Now there's consistency between the columns that have player names in the dataframes all_nba_teams_cleaned and players_cat

In [70]:
# check consistency between the season columns in my dataframes all_nba_teams_cleaned and players_cat
season_check_players = players_cat['season'].value_counts().to_frame().reset_index(drop=False)
season_check_players = season_check_players.drop(['count'], axis=1)
season_check_players = list(season_check_players.sort_values(by='season'))

season_check_all_nba = all_nba_teams_cleaned['season'].value_counts().to_frame().reset_index(drop=False)
season_check_all_nba = season_check_all_nba.drop(['count'], axis=1)
season_check_all_nba = list(season_check_all_nba.sort_values(by='season'))

season_check_players == season_check_all_nba

# they are consistent

True

In [71]:
# check consistency of 'team' column
players_cat['team'].value_counts()

# It is consistent, it contains up to 3 teams. They are the teams the player has played for that season.

team
LAL            174
SAS            173
UTA            173
IND            171
BOS            170
LAC            168
CHI            168
MIA            168
DET            167
PHO            167
POR            167
SAC            166
MIN            166
ORL            166
HOU            166
DEN            165
MIL            164
PHI            163
NYK            160
ATL            158
CLE            157
GSW            155
DAL            151
TOR            132
WAS            126
NJN            105
MEM            103
SEA             91
OKC             76
CHH             61
NOP             51
BRK             50
CHO             49
CHA             42
NOH             42
WSB             38
VAN             26
NOK             11
DAL,WAS          7
NYK,TOR          6
DEN,NYK          6
DAL,NJN          6
IND,SAC          5
ORL,PHO          5
GSW,PHI          5
CHI,CLE          5
DAL,PHO          4
LAC,POR          4
DEN,ORL          4
CHI,IND          4
DAL,NYK          4
CHI,TOR          4
GSW,IND

In [72]:
# check consistency of 'pos' column
players_cat['pos'].value_counts()

# it is consistent

pos
G      1678
F      1073
F-C     687
C       615
G-F     492
F-G     335
C-F     284
Name: count, dtype: int64

In [73]:
# merge players_cat and players_num, as they are cleaned
players_cleaned = pd.concat([players_cat, players_num], axis=1)
players_cleaned.head()

Unnamed: 0,player,season,team,pos,gs,as,age,g,mp,fg,fga,2p,2pa,3p,3pa,ft,fta,orb,drb,trb,ast,stl,blk,tov,pf,pts,fg%,2p%,3p%,ft%,ts%,efg%,ws,ortg,drtg,ows,dws,ws/48,obpm,dbpm,bpm,vorp,per,orb%,drb%,trb%,ast%,stl%,blk%,tov%,usg%
0,Bam Adebayo,2023-24,MIA,C-F,43,1,26,43,34.5,7.6,15.0,7.6,14.8,0.0,0.3,5.0,6.4,2.2,8.3,10.6,4.0,1.0,1.0,2.4,2.5,20.3,50.5,51.3,9.1,78.0,56.6,50.6,3.9,113,110,1.4,2.5,0.126,0.4,1.4,1.8,1.4,19.5,7.1,27.9,17.3,19.4,1.5,2.8,11.9,26.0
1,Grayson Allen,2023-24,PHO,G,47,0,28,47,32.9,4.3,8.4,1.7,3.2,2.6,5.2,1.6,1.8,0.6,3.4,4.0,3.1,0.9,0.6,1.3,2.1,12.8,51.1,54.7,49.0,88.4,69.3,66.3,4.1,131,118,2.8,1.3,0.128,0.9,-0.1,0.9,1.1,13.5,2.4,11.1,7.0,12.5,1.3,1.7,12.7,13.9
2,Jarrett Allen,2023-24,CLE,C-F,46,0,25,46,30.5,6.4,9.9,6.4,9.8,0.0,0.1,2.8,3.7,3.6,7.1,10.6,2.8,0.8,1.2,1.8,2.3,15.5,64.4,65.0,0.0,74.7,67.3,64.4,6.6,132,108,3.9,2.8,0.228,1.9,1.7,3.6,2.0,22.1,13.2,25.0,19.3,13.4,1.3,3.4,13.4,18.9
3,Giannis Antetokounmpo,2023-24,MIL,F-G,51,1,29,51,35.0,11.5,18.9,11.0,17.1,0.5,1.8,7.3,11.1,2.5,8.7,11.2,6.3,1.4,1.1,3.5,3.1,30.7,60.9,64.6,25.0,65.7,64.6,62.0,8.6,125,113,6.1,2.5,0.232,6.1,2.1,8.2,4.6,29.2,8.2,26.4,17.6,30.5,1.9,2.5,12.9,33.1
4,OG Anunoby,2023-24,"NYK,TOR",F,41,0,26,41,34.1,5.8,11.7,3.7,6.1,2.1,5.5,1.6,2.0,1.0,3.2,4.2,2.3,1.3,0.7,1.5,2.3,15.3,49.8,60.6,37.9,77.4,60.9,58.8,2.5,116,117,1.3,1.3,0.087,-0.6,0.5,-0.1,0.7,13.5,3.1,10.6,6.8,9.4,1.9,1.8,10.7,17.6


## Saving Cleaned Data

In [74]:
# save cleaned dataframes to csv
players_cleaned.to_csv('players_cleaned.csv', sep=',', index=False)
all_nba_teams_cleaned.to_csv('all_nba_teams_cleaned.csv', sep=',', index=False)

In [75]:
# I created a new schema in SQL and I'll save the cleaned dataframes there
password = getpass()

········


In [76]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/nba'
engine = create_engine(connection_string)

In [77]:
# push the cleaned dataframes to SQL

players_cleaned.to_sql('player_stats', con=engine)
all_nba_teams_cleaned.to_sql('all_nba_teams', con=engine)

170