In [None]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sn

# Exploratory Data Analysis
### DSCI 320 Project - Matthew Gillies, Jordon Chen, Lucas Moynier

In [None]:
## Load the dataset from its published Google Sheets CSV export
DATA_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ-1fqyvskEHodIPX8O-4_QvUHmXUXCAvNETbYZs8e0ZRO_trGzV64tDjskf_oe_t83JjpzOS_D5kfZ/pub?gid=1833496207&single=true&output=csv"
data = pd.read_csv(DATA_URL)

In [None]:
# Preview the first rows of the freshly loaded frame as a sanity check.
data.head()

In [None]:
# List every column label available in the raw data.
data.columns

In [None]:
# (rows, columns) of the raw data before any cleaning.
data.shape

The dataframe has 9756 rows with 61 features

In [None]:
# Count how many columns use each dtype. Calling value_counts() directly on the
# dtypes Series matches the form used later in this notebook and avoids the
# unnecessary DataFrame wrapper.
data.dtypes.value_counts()

Note: Not all rows are printed due to the large output size.
It appears that numeric features are stored as `float` or `int`, while categorical features are stored as `object`.

In [None]:
## Drop the leftover 'Unnamed: 0' column along with the duplicate columns
redundant_cols = ["Unnamed: 0", "pts", "reb", "ast"]
data = data.drop(redundant_cols, axis = "columns")

In [None]:
## Count missing values per column and show the 20 columns with the most NAs
na_counts = data.isna().sum().to_frame()
na_counts.sort_values(by = 0, ascending = False).head(20)

It is likely that the NA values for the first six rows in the table above are due to the player not shooting from that distance (e.g., centers who do not shoot outside of the paint). Therefore, we would replace the NAs in these six features with zeroes and then remove all rows with other NA values, which would also prevent the removal of an extremely large amount of our data.
##### This replacement will not be done, because Altair requires at most 5000 rows of data for visualizations by default; all rows containing NA values are dropped instead.

In [None]:
## Drop every row that contains at least one NA value, then confirm none remain
data = data.dropna(axis = 0, how = "any")
data.isna().to_numpy().any()

In [None]:
# (rows, columns) remaining after the rows with NA values were dropped.
data.shape

In [None]:
## Removing the 2001-2007 seasons
# `~` is the boolean-negation operator for a mask; unary `-` on booleans is not
# the supported way to invert it. The explicit .copy() makes the result an
# independent frame, so later column assignments do not trigger
# SettingWithCopyWarning.
excluded_years = [2001, 2002, 2003, 2004, 2005, 2006, 2007]
data = data[~data['Year'].isin(excluded_years)].copy()

In [None]:
# (rows, columns) remaining after the 2001-2007 seasons were filtered out.
data.shape

In [None]:
## Converting columns that should be integers
int_cols = ['points', 'assists', 'made_field_goals', 'attempted_field_goals','made_three_point_field_goals',
            'attempted_three_point_field_goals', 'made_free_throws','attempted_free_throws', 'offensive_rebounds',
            'defensive_rebounds', 'steals', 'blocks', 'turnovers', 'personal_fouls', '# of Dunks', 'Attempted Heaves', 
            '# of Heaves']
# Round before casting so the float -> int conversion does not truncate values.
# Using round()/astype() with per-column mappings builds a new frame in one pass
# instead of assigning column-by-column into a previously filtered frame (which
# can raise SettingWithCopyWarning), and keeps the cell safe to re-run.
data = data.round(dict.fromkeys(int_cols, 0)).astype(dict.fromkeys(int_cols, int))

In [None]:
# Count columns per dtype to check the result of the integer conversion.
data.dtypes.value_counts()

In [None]:
# Summary statistics for the numeric columns (describe() already returns a DataFrame).
data.describe()

In [None]:
## Renaming columns: slug -> abbr (abbreviation), the two assisted-FG% columns, and "Dist." -> "Dist"
column_renames = {
    "slug": "abbr",
    "% of FG Ast'd 2P": "2P FG AST%",
    "% of FG Ast'd 3P": "3P FG AST%",
    "Dist.": "Dist",
}
data = data.rename(columns = column_renames)

In [None]:
## Splitting dataset into numeric/categorical features:
## start by listing the columns stored with the object dtype
column_dtypes = data.dtypes
categorical = column_dtypes[column_dtypes.eq("object")]
categorical

In [None]:
# Columns treated as categorical for the rest of the analysis.
cat_features = [
    "name", "college", "country",
    "draft_year", "draft_round", "draft_number",
    "abbr", "positions", "team", "Year",
]
# Keep only the columns of `data` whose label appears in cat_features.
cat_data = data.loc[:, data.columns.isin(cat_features)]
cat_data.head()

In [None]:
# Everything that is not a categorical feature is treated as numeric.
numeric_data = data.drop(cat_features, axis = "columns")
numeric_data.columns

In [None]:
## Visualizing Bar Charts for categorical features (not visualizing player name/abbreviation due to extremely large output as
## each name/abbreviation is unique anyways)
col1 = ["college", "country", "draft_year", "draft_round", "draft_number", "positions", "team", "Year"]
for col in col1:
    # Force a nominal encoding: integer-coded categories such as Year would otherwise be
    # inferred as quantitative and drawn on a continuous axis instead of one bar per category.
    chart = alt.Chart(data).mark_bar().encode(
        alt.X("count()"),
        alt.Y(col, type = "nominal"))
    chart.display()

In [None]:
# Draw one kernel-density area chart per numeric feature.
numeric_cols = list(numeric_data.columns)
for feature in numeric_cols:
    chart = (
        alt.Chart(data)
        .transform_density(density = feature, as_ = [feature, "density"])
        .mark_area()
        .encode(alt.X(feature), alt.Y("density:Q"))
    )
    chart.display()

In [None]:
## Cross-tabulating # of Dunks (rows) against draft pick number (columns)
pd.crosstab(data['# of Dunks'], data['draft_number'])

In [None]:
## Visualizing the correlations of shot data
# NOTE: the shot-distance column is listed as 'Dist' because an earlier cell renames
# 'Dist.' to 'Dist'; with the old label the isin() filter below silently left it out.
percent_cols = ['FG%', 'Dist', '% of FGA by Distance 2P',
       '% of FGA by Distance 0-3', '% of FGA by Distance 3-10',
       '% of FGA by Distance 10-16', '% of FGA by Distance 16-3P',
       '% of FGA by Distance 3P', 'FG% by Distance 2P', 'FG% by Distance 0-3',
       'FG% by Distance 3-10', 'FG% by Distance 10-16',
       'FG% by Distance 16-3P', 'FG% by Distance 3P', '2P FG AST%',
       '3P FG AST%', 'Dunks %FGA', '# of Dunks', 'Corner 3s %3PA',
       'Corner 3s 3P%']
perc_data = data[data.columns[data.columns.isin(percent_cols)]]
corr_matrix = perc_data.corr()
# Same font-to-figure proportions as before (100pt on 200x200in) at one tenth of the
# size, which keeps the rendered image manageable. rc_context scopes the font size to
# this figure instead of changing it globally for every later matplotlib plot.
with plt.rc_context({'font.size': 10}):
    fig, ax = plt.subplots(figsize = (20, 20))
    sn.heatmap(corr_matrix, annot = True, ax = ax)
    # Render inside the context so the scoped font size is what gets drawn.
    plt.show()

In [None]:
## Viewing average shot distance for each year (bars ordered by descending average)
alt.Chart(data).mark_bar().encode(
    x = alt.X('Year:N', sort = '-y'),
    y = alt.Y('mean(Dist)', axis = alt.Axis(title = "Average Shot Distance")),
).properties(title = "Average shot distance by year", width = 800, height = 400)

In [None]:
## Overlaying the density curves of 3 point % and % of 3 pointers assisted.
folded_cols = ['FG% by Distance 3P', "3P FG AST%"]
(
    alt.Chart(data)
    # Fold the two columns into long form: "Percent" names the source column, "value" holds its number.
    .transform_fold(folded_cols, as_ = ["Percent", "value"])
    # One density estimate per source column.
    .transform_density(density = 'value', groupby = ['Percent'], bandwidth = 0.3, counts = True)
    .mark_area()
    .encode(
        x = alt.X('value:Q'),
        # stack = None overlays the two areas rather than stacking them.
        y = alt.Y('density:Q', stack = None),
        color = alt.Color("Percent:N"),
    )
    .properties(width = 400, height = 400)
)