In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.width',None)
pd.set_option('display.max_colwidth',None)

Create dataframe for player stats by season and web scrape mvp data

In [2]:
nba_data = pd.read_csv('nba_player_season_stats.csv')
nba_data.drop(columns=['Unnamed: 0'],inplace=True)
#nba_data.head()

In [3]:
request = requests.get('https://en.wikipedia.org/wiki/NBA_Most_Valuable_Player_Award')
soup = BeautifulSoup(request.content,'html.parser')
#soup.prettify()

Clean and format data

In [4]:
wiki_table = soup.find('table',{'class':'wikitable plainrowheaders sortable','summary':'Season (sortable), Player (sortable), Position (sortable), Nationality (sortable) and Team (sortable)'})

table_rows = wiki_table.find_all('tr')



mvp_lists = [row.text for row in table_rows]
season = [row.split('\n')[1] for row in mvp_lists]
player = [row.split('\n')[3] for row in mvp_lists]
position = [row.split('\n')[5] for row in mvp_lists]

mvp_df = pd.DataFrame()
mvp_df['Season'] = season
mvp_df['Player'] = player
mvp_df['Position'] = position

mvp_df = mvp_df.drop(0).reset_index(drop=True)


mvp_df['Season'] = mvp_df.Season.str.strip()

test = mvp_df.Season[0]
delimiter = test[-3]

def fix_years(season):
    season = season.split(delimiter)[1]
    if int(season) > 50:
        return (1900 + float(season))
    else:
        return (2000 + float(season))

mvp_df['Season'] = mvp_df.Season.apply(fix_years)

mvp_df['Player'] = mvp_df.Player.str.strip('*(2345678)^[h]gi ')


mvp_df.Player.iloc[25] = 'Julius Erving'
mvp_df.Player.iloc[[49,50]] = 'Steve Nash'
mvp_df.Player.iloc[15] = 'Kareem Abdul-Jabbar'
mvp_df.Player.iloc[51] = 'Dirk Nowitzki'

positions_dict = {'Center':'C','Power forward':'PF','Point guard':'PG','Shooting guard':'SG','Small forward':'SF'}

mvp_df['Position'] = mvp_df.Position.map(positions_dict)
mvp_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Season,Player,Position
0,1956.0,Bob Pettit,PF
1,1957.0,Bob Cousy,PG
2,1958.0,Bill Russell,C
3,1959.0,Bob Pettit,PF
4,1960.0,Wilt Chamberlain,C
5,1961.0,Bill Russell,C
6,1962.0,Bill Russell,C
7,1963.0,Bill Russell,C
8,1964.0,Oscar Robertson,PG
9,1965.0,Bill Russell,C


In [5]:
mvp_reference = mvp_df.iloc[:62,:]
nba_players = nba_data.iloc[:,[0,1,2]]
nba_players = nba_players[nba_players['Year']>=1956.0].reset_index(drop=True)

nba_players = nba_players.drop_duplicates()


#nba_players.info()
#nba_players.Year = nba_players.Year.fillna(1)
#nba_players.Year.value_counts()
#nba_players.Year = nba_players.Year.apply(lambda x: int(x))
nba_players.head()

Unnamed: 0,Year,Player,Pos
0,1956.0,Paul Arizin*,SF
1,1956.0,Jesse Arnelle,PF
2,1956.0,Dick Atha,SG
3,1956.0,Jim Baechtold,SF
4,1956.0,Ernie Barrett,SG


In [6]:
def fix_pos(pos):
    a = pos.split('-')
    if len(a) > 1:
        return a[0]
    elif pos == 'F':
        return 'SF'
    elif pos == 'G':
        return 'PG'
    else:
        return pos

In [7]:
nba_players['Pos'] = nba_players.Pos.apply(fix_pos)
nba_players['Pos'] = nba_players.Pos.map({'F':'SF','G':'PG','C':'C','SF':'SF','PG':'PG','PF':'PF','SG':'SG'})
nba_players.Pos.value_counts()

PF    4098
SG    4055
SF    4040
C     3998
PG    3774
Name: Pos, dtype: int64

In [8]:
nba_players['Player'] = nba_players.Player.str.strip(' *')
nba_players.reset_index(drop=True,inplace=True)

Identify mvp winners for each season by combining datasets 

In [9]:
indices = []

mvp_list = list(zip(mvp_reference.Player,mvp_reference.Season))

for name,year in mvp_list:
    indices.append(nba_players.loc[(nba_players.Year == year) & (nba_players.Player == name)].index)
    
    
mvp_ix = []

for index in indices:
    mvp_ix.append(index[0])

#mvp_ix

In [10]:
len(mvp_ix)

62

In [11]:
nba_players = nba_players.reset_index()
nba_players.head()

nba_players['MVP'] = nba_players['index'].apply(lambda x: 1 if x in mvp_ix else 0)
nba_players = nba_players.drop(columns=['index'])
mvp_winners = nba_players[nba_players['MVP'] == 1].head()
mvp_winners

Unnamed: 0,Year,Player,Pos,MVP
65,1956.0,Bob Pettit,C,1
109,1957.0,Bob Cousy,PG,1
272,1958.0,Bill Russell,C,1
365,1959.0,Bob Pettit,PF,1
404,1960.0,Wilt Chamberlain,C,1


Cross-tabulate data

In [12]:
nba_xtab = pd.crosstab(index=nba_players.MVP,columns=nba_players.Pos,margins=True)
nba_xtab

Pos,C,PF,PG,SF,SG,All
MVP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3971,4088,3763,4033,4048,19903
1,27,10,11,7,7,62
All,3998,4098,3774,4040,4055,19965


Save output to image

In [13]:
!pip install dataframe-image
import dataframe_image as dfi


nba_xtab_styled = nba_xtab.style.background_gradient()

dfi.export(nba_xtab_styled,'mvpXtab.png')

You should consider upgrading via the '/Users/christine/hueco_mundo/hueco_mundo/bin/python3 -m pip install --upgrade pip' command.[0m


[0213/104004.126132:INFO:headless_shell.cc(659)] Written to file /var/folders/jp/swf8p4nn0r1791yzwymqrplw0000gn/T/tmp26bbxir0/temp.png.
