## TASK - Rank the teams from the Pacific and Southeast NBA divisions based on team totals for statistics (points, rebounds, dunks, fouls, etc.)


### Loading libraries and importing data

In [7]:
# import libraries
import pandas as pd
import altair as alt
from altair import datum
import matplotlib.pyplot as plt
import numpy as np


In [8]:
# Load the dataset: the first CSV column is used as the index, and the 'Year' strings are parsed into datetime objects
path = "data/FinalDataset.csv"
nba_data = pd.read_csv(
    path,
    index_col=[0],
    parse_dates=['Year'],
)

### Dataframe cleaning and new column creation

In [9]:
# Reduce the parsed 'Year' datetimes to the calendar year only (month and day are dropped)
nba_data = nba_data.assign(Year=nba_data['Year'].dt.year)

In [10]:
# players can be traded midway through a season and have multiple teams they played for but because we are doing totals should only count them once, will count them based on first team they played for that season

# function that takes a player's first team in the string of teams and adds it to a new column, if only one team played for that season, just return that team
def get_first_team(df, column, new_column_name):
    def get_first_item_list(value):
        if isinstance(value, str) and '[' in value and ']' in value:
            return value.split("'")[1]
        return value

    new_column = new_column_name
    df[new_column] = df[column].apply(get_first_item_list)
    
    return df

nba_data = get_first_team(df=nba_data, column="team", new_column_name="start_team")
nba_data['start_team']

2912       DALLAS MAVERICKS
2913    CLEVELAND CAVALIERS
2914       DALLAS MAVERICKS
2916       DALLAS MAVERICKS
2918    CLEVELAND CAVALIERS
               ...         
9749         DENVER NUGGETS
9751             MIAMI HEAT
9752          ORLANDO MAGIC
9753         BOSTON CELTICS
9754             MIAMI HEAT
Name: start_team, Length: 4766, dtype: object

In [11]:
# collect every distinct team name that appears in the start_team column
all_teams = pd.unique(nba_data['start_team'])
all_teams

array(['DALLAS MAVERICKS', 'CLEVELAND CAVALIERS', 'MIAMI HEAT',
       'CHARLOTTE BOBCATS', 'SEATTLE SUPERSONICS', 'NEW JERSEY NETS',
       'BOSTON CELTICS', 'UTAH JAZZ', 'LOS ANGELES LAKERS',
       'SAN ANTONIO SPURS', 'SACRAMENTO KINGS', 'MEMPHIS GRIZZLIES',
       'LOS ANGELES CLIPPERS', 'INDIANA PACERS', 'TORONTO RAPTORS',
       'NEW ORLEANS HORNETS', 'WASHINGTON WIZARDS', 'NEW YORK KNICKS',
       'ORLANDO MAGIC', 'PORTLAND TRAIL BLAZERS', 'DETROIT PISTONS',
       'HOUSTON ROCKETS', 'MINNESOTA TIMBERWOLVES', 'DENVER NUGGETS',
       'ATLANTA HAWKS', 'GOLDEN STATE WARRIORS', 'CHICAGO BULLS',
       'PHILADELPHIA 76ERS', 'PHOENIX SUNS', 'MILWAUKEE BUCKS',
       'OKLAHOMA CITY THUNDER', 'BROOKLYN NETS', 'NEW ORLEANS PELICANS',
       'CHARLOTTE HORNETS'], dtype=object)

In [12]:
# make function that adds a column indicating the team's division
# midwest division no longer exists so putting Vancouver Grizzlies in current Memphis division
# New Orleans/Oklahoma City Hornets combo was the year of hurricane Katrina so team relocated, will use New Orleans Pelicans division

def add_team_division(df, column):
    """Add a ``<column>_division`` column giving the division of each team.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    column : str
        Name of the column holding upper-case team names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new ``<column>_division`` column.
        Team names that are not in the lookup table get a missing value.
    """
    # Teams listed under the division they are assigned to.
    teams_by_division = {
        'Atlantic': (
            'BOSTON CELTICS', 'BROOKLYN NETS', 'NEW JERSEY NETS',
            'NEW YORK KNICKS', 'PHILADELPHIA 76ERS', 'TORONTO RAPTORS',
        ),
        'Central': (
            'CHICAGO BULLS', 'CLEVELAND CAVALIERS', 'DETROIT PISTONS',
            'INDIANA PACERS', 'MILWAUKEE BUCKS',
        ),
        'Southeast': (
            'ATLANTA HAWKS', 'CHARLOTTE BOBCATS', 'CHARLOTTE HORNETS',
            'MIAMI HEAT', 'ORLANDO MAGIC', 'WASHINGTON WIZARDS',
        ),
        'Northwest': (
            'DENVER NUGGETS', 'MINNESOTA TIMBERWOLVES', 'OKLAHOMA CITY THUNDER',
            'PORTLAND TRAIL BLAZERS', 'SEATTLE SUPERSONICS', 'UTAH JAZZ',
        ),
        'Pacific': (
            'GOLDEN STATE WARRIORS', 'LOS ANGELES CLIPPERS', 'LOS ANGELES LAKERS',
            'PHOENIX SUNS', 'SACRAMENTO KINGS',
        ),
        'Southwest': (
            'DALLAS MAVERICKS', 'HOUSTON ROCKETS', 'MEMPHIS GRIZZLIES',
            'NEW ORLEANS HORNETS', 'NEW ORLEANS/OKLAHOMA CITY HORNETS',
            'NEW ORLEANS PELICANS', 'SAN ANTONIO SPURS', 'VANCOUVER GRIZZLIES',
        ),
    }

    # Invert to the team -> division lookup that Series.map needs.
    division_of_team = {
        team: division
        for division, teams in teams_by_division.items()
        for team in teams
    }

    df[f"{column}_division"] = df[column].map(division_of_team)
    return df

nba_data = add_team_division(df=nba_data, column='start_team')




In [13]:
# keep only the rows whose starting team belongs to the Southeast or Pacific division
se_pacific_div = ['Southeast', 'Pacific']
se_pacific_data = nba_data[nba_data['start_team_division'].isin(se_pacific_div)]

### Streamgraph Creation

In [64]:
# Streamgraph of total defensive rebounds per team and year (code adapted from the Altair website)

# palette chosen to match team colors
colors = (
    "red", "peru", "darkcyan", "goldenrod", "blue", "blueviolet",
    "firebrick", "dodgerblue", "chocolate", "slategray", "midnightblue",
)

# selection on start_team bound to the legend, so legend entries drive the highlight
team_select = alt.selection_multi(fields=['start_team'], bind='legend')

# the chart itself: centre-stacked areas, one per team, dimmed when not selected
nba_streamgraph = (
    alt.Chart(
        se_pacific_data,
        title='Total Defensive Rebounds in the Southeast and Pacific NBA Divisions from 2008-2022',
    )
    .mark_area()
    .encode(
        x=alt.X('Year:O', axis=alt.Axis(domain=False, tickSize=0)),
        y=alt.Y('sum(defensive_rebounds):Q', stack='center', title="Total Defensive Rebounds"),
        color=alt.Color(
            'start_team:N',
            scale=alt.Scale(range=colors),
            legend=alt.Legend(title="Team"),
        ),
        opacity=alt.condition(team_select, alt.value(1), alt.value(0.2)),
        tooltip=[
            alt.Tooltip('start_team:N', title="Team"),
            alt.Tooltip('start_team_division:N', title="Division"),
            alt.Tooltip('sum(defensive_rebounds):Q', title="Total"),
            alt.Tooltip('Year'),
        ],
    )
    .properties(height=400, width=500)
    .add_selection(team_select)
)

nba_streamgraph

### Parallel Coordinates Graph Creation

#### Further data processing for parallel coordinates

In [65]:
# create parallel coordinates plot for team totals ('points', 'attempted_field_goals', 'attempted_three_point_field_goals', 'attempted_free_throws', 'offensive_rebounds',
# 'defensive_rebounds', 'assists', 'steals', 'blocks', 'turnovers') for teams in the Southeast and Pacific divisions for a specified year, code adapted from here - https://stackoverflow.com/questions/70341974/parallel-coordinates-in-altair

# statistics to total, followed by the two grouping keys ('start_team', 'Year')
stats = [
    'points',
    'attempted_field_goals',
    'attempted_three_point_field_goals',
    'attempted_free_throws',
    '# of Dunks',
    'turnovers',
    'assists',
    'offensive_rebounds',
    'defensive_rebounds',
    'steals',
    'blocks',
    'start_team',
    'Year',
]

# narrow the division-filtered frame down to just those columns
stat_filter_df = se_pacific_data.loc[:, stats]

def group_col_total(df, list_cols):
    """Group ``df`` by ``list_cols`` and sum every remaining column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the grouping columns plus the columns to total.
    list_cols : list
        Column names to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping columns restored as ordinary
        columns followed by the per-group sums.
    """
    # every column that is not a grouping key gets summed
    to_sum = []
    for name in df.columns:
        if name not in list_cols:
            to_sum.append(name)

    totals = df.groupby(list_cols)[to_sum].sum()
    return totals.reset_index()

# total every statistic per (start_team, Year) pair
team_year = ['start_team', 'Year']
grouped_total_stat_df = group_col_total(stat_filter_df, team_year)

# display-friendly column names for the parallel coordinates graph
# (rename returns a new frame, so grouped_total_stat_df is left untouched)
total_stat_col_adj = grouped_total_stat_df.rename(
    columns={
        'points': 'Points',
        'attempted_field_goals': 'Attempted FGs',
        'attempted_three_point_field_goals': 'Attempted 3s',
        'attempted_free_throws': 'Attempted FTs',
        'turnovers': 'Turnovers',
        'assists': "Assists",
        'offensive_rebounds': 'Offensive rebounds',
        'defensive_rebounds': 'Defensive rebounds',
        'steals': 'Steals',
        'blocks': 'Blocks',
    }
)

# the columns that are actually drawn as axes
stats_viz = [
    'Points',
    'Attempted FGs',
    'Attempted 3s',
    'Attempted FTs',
    'Offensive rebounds',
    'Defensive rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnovers',
]

### Parallel coordinates graph 

In [66]:
# make parallel coordinates graph function, can change what is visualized based on year

def make_nba_pcoord(year, height=300):
    """Build the parallel-coordinates chart of team totals for one year.

    Reads the notebook-level names ``total_stat_col_adj`` (data),
    ``stats_viz`` (columns folded into axes), ``team_select`` (legend
    selection) and ``colors`` (colour range).

    Parameters
    ----------
    year : int
        Value the ``Year`` column is filtered on; also shown in the title.
    height : int, optional
        Chart height in pixels. The bottom ("min") tick labels are placed at
        this pixel offset, so the two must stay in sync. Defaults to 300,
        the pixel offset the bottom ticks were previously hard-coded to.

    Returns
    -------
    alt.LayerChart
        Layered chart: one line per team, a vertical rule per statistic and
        max/min labels at the top and bottom of every rule.
    """
    nba_pcoord = alt.Chart(
        total_stat_col_adj,
        title=f"Southeast and Pacific Division Team Totals for {year}",
    ).transform_filter(
        (datum.Year == year)
    ).transform_fold(
        stats_viz
    ).transform_joinaggregate(
        min="min(value)",
        max="max(value)",
        groupby=["key"],
    ).transform_calculate(
        # Min-max normalise each statistic. When every team has the same
        # value the range is zero, so place the point mid-axis instead of
        # producing NaN from a division by zero.
        norm_val=(
            "datum.max == datum.min ? 0.5 : "
            "(datum.value - datum.min) / (datum.max - datum.min)"
        ),
    ).properties(width=500, height=height)

    lines = nba_pcoord.mark_line(opacity=1).add_selection(team_select).encode(
        x='key:N',
        y=alt.Y('norm_val:Q', axis=None),
        color=alt.Color('start_team:N', scale=alt.Scale(range=colors), title='Team'),
        # One line per team. The data has no 'index' field, so the detail
        # channel uses the team column, which does identify each line.
        detail=alt.Detail('start_team:N'),
        opacity=alt.condition(team_select, alt.value(1), alt.value(0.2)),
    )

    rules = nba_pcoord.mark_rule(
        color="#ccc", tooltip=None
    ).encode(
        x="key:N",
        detail="count():Q",
    )

    def ytick(yvalue, field, y_adj):
        # Label plus tick mark at a fixed pixel offset on every axis; `field`
        # is constant within a key, so min() simply picks that value.
        scale = nba_pcoord.encode(x='key:N', y=alt.value(yvalue), text=f"min({field}):Q")
        return alt.layer(
            scale.mark_text(baseline="middle", align="right", dx=10, dy=y_adj, tooltip=None),
            scale.mark_tick(size=8, color="#ccc", orient="horizontal", tooltip=None),
        )

    # Pixel 0 is the top of the view (max), pixel `height` is the bottom (min).
    nba_pcoord_fin = alt.layer(
        lines, rules, ytick(0, "max", -10), ytick(height, "min", 10)
    )

    return nba_pcoord_fin

make_nba_pcoord(year=2008)

  for col_name, dtype in df.dtypes.iteritems():


### Multiview Creation

In [70]:
# Multiview: streamgraph and the 2015 parallel-coordinates chart side by side, with shared styling
multiview = (
    (nba_streamgraph | make_nba_pcoord(2015))
    .configure_axisX(domain=False, labelPadding=15, tickColor="#ccc", title=None)
    .configure_view(stroke=None)
    .configure_legend(titleFontSize=16, labelFontSize=12)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=16, titleFontSize=16)
)
multiview
