# Data Merging and Cleaning 

Let's merge the previous CSV files so that we have a single DataFrame containing all of the data on a player for a given year. We will also delete any unnecessary statistics.

In [4]:
import pandas as pd
import os

In [5]:
# Every season year covered by the input data: 1976 through 2019 inclusive
# (the upper bound of range() is exclusive).
YEAR = list(range(1976, 2020))

We need a dictionary mapping team abbreviations to full team names. This is tedious, but we will need it later when we try to match each player to their team's standings.

In [58]:
# Lookup table: team abbreviation (the values found in the players' `Tm`
# column) -> full team name (the values matched against the standings'
# `Team` column).
team_dict = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "BUF": "Buffalo Braves",
    "CHA": "Charlotte Bobcats",
    "CHH": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CHO": "Charlotte Hornets",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NJN": "New Jersey Nets",
    "NOH": "New Orleans Hornets",
    "NOJ": "New Orleans Jazz",
    "NOK": "New Orleans/Oklahoma City Hornets",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "NYN": "New York Nets",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "SDC": "San Diego Clippers",
    "SEA": "Seattle SuperSonics",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "VAN": "Vancouver Grizzlies",
    "WAS": "Washington Wizards",
    "WSB": "Washington Bullets",
    "KCK": "Kansas City Kings",
}

In [64]:
# for i in tqdm(range(len(YEAR))):
# NOTE(review): the loop above is disabled, so a single hard-coded season is
# processed. `i` is substituted into the input file names, so if the loop is
# re-enabled it must yield season years (iterate over YEAR itself), not
# positions from range(len(YEAR)).

# Season to process. Nothing below rebinds `i`, so it still holds the season
# year when the cell finishes.
i = 2011

# Directory holding the per-season input CSVs, relative to the working
# directory. Built with os.path.join so the same layout resolves on every OS
# instead of only where "\\" is the path separator.
input_dir = os.path.join('..', '..', 'input')

#================= per_game ===================#
df_per_game = pd.read_csv(
    os.path.join(input_dir, 'per_game', '{}-per_game.csv'.format(i)))

#================= advanced ===================#
df_advanced = pd.read_csv(
    os.path.join(input_dir, 'advanced', '{}-advanced.csv'.format(i)))

#================= standings ===================#
df_standings = pd.read_csv(
    os.path.join(input_dir, 'standings', '{}-standings.csv'.format(i)))

# combine per game data with advanced stats data
# NOTE(review): axis=1 concatenation aligns the two tables on their row index,
# so this assumes both CSVs list the same players in the same order — confirm.
df_combined = pd.concat([df_per_game, df_advanced], axis=1)

# drop duplicate column values (columns present in both tables; the first
# occurrence of each column name is kept)
df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]

# if player has been traded during the season keep statistics for the different teams
# delete total statistics
df_combined = df_combined[df_combined.Tm != 'TOT']

# delete any unnecessary statistics
# shots/shots attempted isn't needed as we already have percentages
# mvp's will always start their games
# offensive/defensive rebounds are covered in total rebounds
unneeded_stats = [
    'G', 'GS',
    'FG', 'FGA',
    '3P', '3PA',
    '2P', '2PA',
    'FT', 'FTA',
    'ORB', 'DRB',
]
df_combined = df_combined.drop(unneeded_stats, axis=1)

# delete empty columns and rows
# NOTE(review): the two columns are selected purely by position (30, then 34
# once the first one is gone). Presumably these are the blank spacer columns
# of the advanced table; re-check the positions if the input schema or the
# list of dropped statistics above ever changes.
df_combined = df_combined.drop(df_combined.columns[30], axis=1)
df_combined = df_combined.drop(df_combined.columns[34], axis=1)
df_combined = df_combined.dropna(axis=0)

# reset index (drop=True discards the old index instead of adding it back as
# an "index" column that would then have to be deleted)
df_combined = df_combined.reset_index(drop=True)

# next add a column on whether or not the player's team made the playoffs or had a losing season
# also add the team's record

# change the corresponding team abbreviation to the team's full name using team_dict
# (an abbreviation missing from team_dict raises KeyError)
team_name = [team_dict[abbreviation] for abbreviation in df_combined["Tm"]]

# implement changes to combined dataframe
df_combined['Tm'] = team_name

# populate team wins and team playoffs using data from df_standings
# The lookup is built once instead of filtering df_standings twice per player
# row. drop_duplicates keeps the first row for each team, matching a
# "first matching row" lookup. A team absent from the standings raises
# KeyError naming that team.
standings_by_team = df_standings.drop_duplicates(subset='Team').set_index('Team')
wins_by_team = standings_by_team['W'].to_dict()
playoffs_by_team = standings_by_team['Playoffs'].to_dict()

team_wins = [wins_by_team[team] for team in df_combined["Tm"]]
team_playoffs = [playoffs_by_team[team] for team in df_combined["Tm"]]

# create columns record and playoffs
df_combined['Record'] = team_wins
df_combined['Playoffs'] = team_playoffs

# write the merged season one directory above the working directory
df_combined.to_csv(os.path.join('..', 'full_data_4.csv'))