In [2]:
# imports
import pandas as pd
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
from time import sleep
import numpy as np

# Matplotlib 3.6 renamed the bundled seaborn style sheets to
# 'seaborn-v0_8-*' and later releases dropped the old names, so use
# whichever darkgrid name this installation provides (falling back to the
# default style rather than failing the whole notebook).
_darkgrid_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next(
    (name for name in _darkgrid_candidates if name in plt.style.available),
    'default'))

%matplotlib inline

In [None]:
# Request one basketball-reference jersey-number page so that we can read
# the <select name="number"> dropdown and collect the list of number options.
link = 'http://www.basketball-reference.com/friv/numbers.cgi?number=00'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(link, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
b = BeautifulSoup(r.text, 'lxml')

num_options = b.find('select',{'class':'no_chosen','name':'number'})
if num_options is None:
    raise RuntimeError('jersey-number <select> not found in ' + link)
num_options = num_options.find_all('option')
# Each option label is used without its final character
# (presumably trailing whitespace -- TODO confirm against the page).
numbers = [num.text[:-1] for num in num_options]
# NOTE(review): the third option is dropped by position; it is not visible
# from this notebook which entry that is -- confirm it is still the right
# one if the page's dropdown changes.
del numbers[2]
numbers

['00',
 '0',
 '1',
 '2',
 '03',
 '3',
 '4',
 '5',
 '6',
 '07',
 '7',
 '8',
 '09',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '60',
 '61',
 '62',
 '63',
 '65',
 '66',
 '67',
 '68',
 '70',
 '71',
 '72',
 '73',
 '76',
 '77',
 '83',
 '84',
 '85',
 '86',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '98',
 '99']

In [None]:
# Loop through the jersey numbers, request each number's page, and expand
# every (number, player, team, year) combination into one record; the
# DataFrame is built once from the full list of records at the end.
player_list = []

# Every page URL is `link` with its trailing two-character number replaced,
# so the prefix only needs to be computed once.
link_start = link[:-2]

for number in numbers:
    sleep(1)  # pause between requests
    # Timeout + status check: never hang forever, never parse an error page.
    r = requests.get(link_start + number, timeout=30)
    r.raise_for_status()
    b = BeautifulSoup(r.text, 'lxml')
    table = b.find('table', {'id':'numbers'})
    table_rows = table.find_all('tr')[1:]  # skip the first (header) row

    for row in table_rows:
        player_name = row.find('th',{'data-stat':'player'}).text.replace('*','')
        team_text = row.find('td',{'data-stat':'teams'}).text
        # Raw string: '\(' in a plain literal is an invalid escape sequence.
        # Splitting on parentheses yields team, years, team, years, ...
        # followed by whatever trails the last ')', which is discarded.
        segments = re.split(r'\(|\)', team_text)[:-1]

        for team_part, years_part in zip(segments[0::2], segments[1::2]):
            # Also drop a separating comma in case consecutive team entries
            # are comma-separated (a no-op when they are not).
            team = team_part.strip().strip(',').strip()
            for year in years_part.split(', '):
                player_list.append({'num': number,
                                    'player': player_name,
                                    'team': team,
                                    'year': year})

cols = ['num','player','team','year']
num_df = pd.DataFrame(player_list, columns = cols)

num_df

In [None]:
# list the style sheet names this matplotlib installation provides
available_styles = plt.style.available
print(available_styles)

In [None]:

# let's look at how all numbers are distributed
jersey_counts = num_df.groupby('num').size().reset_index(name='count')
# 'num' holds strings, so sorting on it directly is lexicographic
# ('10' would come before '2'). Sort on the integer value instead; the raw
# string breaks ties so that e.g. '0' and '00' keep a deterministic order.
jersey_counts['num_value'] = jersey_counts['num'].astype(int)
jersey_counts = jersey_counts.sort_values(['num_value', 'num'])

fig, ax = plt.subplots()
jersey_counts.plot(kind='bar',
                   x='num',
                   y='count',
                   figsize=(20,10),
                   ax=ax,
                   fontsize=13)
ax.set_xlabel('jersey number')
ax.set_ylabel('count')
fig.suptitle('Count of jerseys numbers in the NBA (1949-2017)', 
             fontsize=25)

In [None]:
# let's look at the same view but by buckets of 10
# The bucket is the tens digit of the jersey number, so every number below
# 10 (including '00', '03', ...) lands in bucket 0.
num_df['bucket'] = num_df['num'].map(lambda x: int(x) // 10)

fig, ax = plt.subplots()
# groupby().size() is a Series: its bar plot takes the categories from the
# index and the heights from the values, so no x=/y= column selectors apply.
num_df.groupby('bucket').size().plot(kind='bar', ax=ax)
ax.set_xlabel('bucket (tens digit of jersey number)')
ax.set_ylabel('count')
fig.suptitle('Count of jerseys by starting digit in the NBA', 
             fontsize=14)

In [None]:
# let's look at how this changed over time
def _plot_bucket_counts(season_df, title):
    """Draw a bar chart of the number of rows per 'bucket' value.

    season_df: rows of num_df to count (must carry the 'bucket' column).
    title: text used as the figure's suptitle.
    """
    fig, ax = plt.subplots()
    season_df.groupby('bucket').size().plot(kind='bar', ax=ax)
    ax.set_ylabel('count')
    fig.suptitle(title, fontsize=14)


# 1950 ('year' holds two-character year strings, hence '50')
df_1950 = num_df[num_df['year']=='50']
# Former (misleading) name for the same 1950 rows, kept in case another
# cell still refers to it.
df_1970 = df_1950
_plot_bucket_counts(
    df_1950, 'Count of jerseys by starting digit in the NBA (1950)')

# 2017
df_2017 = num_df[num_df['year']=='17']
_plot_bucket_counts(
    df_2017, 'Count of jerseys by starting digit in the NBA (2017)')

In [None]:
# add a full_year column to the dataframe
def _to_full_year(short_year):
    """Prefix a short year string with its century.

    Strings that compare below '30' get '20' prepended; all others get '19'.
    """
    if short_year < '30':
        return '20' + short_year
    return '19' + short_year


num_df['full_year'] = num_df['year'].map(_to_full_year)
num_df.head()

In [None]:
# let's look at the most recent numbers to be "inaugurated" in the NBA
# One row per jersey number with the earliest full_year it appears in,
# most recently introduced numbers first.
min_year = num_df.groupby(['num'])['full_year'].min().reset_index(
    name='full_year').sort_values(
    'full_year', ascending=False).reset_index(drop=True)

# Group num_df once by (num, full_year) instead of re-filtering the whole
# frame several times for every row of min_year. Each group keeps its rows
# in num_df order, so positions 0 and 1 match a filtered .iloc[0] / .iloc[1].
players_by_num_year = {
    key: group['player'].tolist()
    for key, group in num_df.groupby(['num', 'full_year'])
}
debut_players = [players_by_num_year[key]
                 for key in zip(min_year['num'], min_year['full_year'])]

# first listed player wearing the number in its first year
min_year['first_player'] = [names[0] for names in debut_players]

# second listed player in that same year, NaN when there was only one
min_year['first_player2'] = [names[1] if len(names) > 1 else np.nan
                             for names in debut_players]

# how many rows (player/team entries) share that first year
min_year['num_first'] = [len(names) for names in debut_players]

min_year

In [None]:
# let's look at the most popular number for each year
# For every full_year, value_counts().idxmax() returns the jersey-number
# LABEL with the highest count. (Indexing the sorted counts with [0] is a
# positional lookup on a string-indexed Series and returns the count itself.)
pop_df = (num_df.groupby('full_year')['num']
          .agg(lambda nums: nums.value_counts().idxmax())
          .reset_index(name='most_popular_num')
          .sort_values('full_year', ascending=True))

# Jersey numbers are stored as strings (so '0' and '00' stay distinct in
# pop_df); a line plot needs numeric y values, so convert only for plotting.
pop_df.assign(
    most_popular_num=pop_df['most_popular_num'].astype(int)
).plot(kind='line', x='full_year', y='most_popular_num',
       title='Most popular jersey number in NBA by year')

In [None]:
# now we'll see which numbers have occurred the most overall
# value_counts() is already sorted by descending count, so the first ten
# entries are the ten most common numbers. Plotting the Series directly
# avoids depending on the column names reset_index() produces, which
# changed in pandas 2.0 ('index'/'num' became 'num'/'count').
top_numbers = num_df['num'].value_counts().head(10)
ax = top_numbers.plot(kind='bar', legend=False,
                      title='Most popular jersey numbers in NBA')
ax.set_xlabel('jersey number')
ax.set_ylabel('count')

In [None]:
# print every number from 1 to 99 that never appears in the 'num' column
worn_numbers = set(num_df['num'].unique())
for candidate in range(1, 100):
    if str(candidate) in worn_numbers:
        continue
    print(candidate)