In [1]:
from functions import PlayerStats, OpeningDayRoster
import pandas as pd
import numpy as np
import time
import requests
from bs4 import BeautifulSoup

### Create csv

In [2]:
# Create a dataframe with all player stats from 1979-80 to 2022-23
years = np.arange(1979, 2024)
dfs = []
for year in years :

    print(f'Fetching player stats for season {year-1}-{year} ...', end = '\r')
    dfs.append(PlayerStats(year))
    time.sleep(5)

data = pd.concat(dfs)
data.to_csv('PlayersStats_1979-2023.csv', index = None)

Fetching player stats for season 1978-1979 ...

KeyboardInterrupt: 

In [5]:
years = np.arange(1979, 2024)

rosters = []
for year in years :

    print(f'Fetching player stats for season {year-1}-{year} ...', end = '\r')
    # Create team name and code mapping
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_ratings.html'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    table = soup.find('table')
    table.find('tr', class_ = 'over_header').decompose()
    mapping = pd.read_html(str(table))[0][['Team']]
    mapping['Tm'] = [x['href'].split('/')[2] for x in table.find_all('a', href = True)]
    tm_map = dict(zip(mapping['Team'], mapping['Tm']))

    rosterDict = OpeningDayRoster(year)
    # Loop over the roster dictionary to create roster dataframe
    lplayers, lteams = [], []
    for team, players in rosterDict.items():

        for player in players :
            lplayers.append(player)
            lteams.append(tm_map.get(team))

    roster = pd.DataFrame(zip(lplayers, len(lplayers) * [year], lteams), columns = ['Player', 'Year', 'OpeningDayTm'])    
    rosters.append(roster)
    time.sleep(1)

Fetching player stats for season 2022-2023 ...

In [8]:
data = pd.concat(rosters).reset_index(drop = True)
data.to_csv('OpeningDayRosters_1979-2023.csv', index = None)

# Player features

In [2]:
data = pd.read_csv('data/PlayersStats_1979-2023.csv')
data.head()

Unnamed: 0,Player,href,Year,Pos,Age,Tm,G,GS,MP,FG,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Kareem Abdul-Jabbar,abdulka01,1979,C,31,LAL,80,,39.5,9.7,...,15.3,23.3,8.8,5.6,14.4,0.219,4.6,3.0,7.6,7.7
1,Tom Abernethy,abernto01,1979,PF,24,GSW,70,,17.4,2.5,...,7.7,13.8,2.3,1.3,3.7,0.144,0.2,0.7,0.9,0.9
2,Alvan Adams,adamsal01,1979,C,24,PHO,77,,30.7,7.4,...,18.9,24.1,3.9,3.7,7.6,0.154,2.3,1.2,3.4,3.2
3,Lucius Allen,allenlu01,1979,PG,31,KCK,31,,13.3,2.2,...,13.7,20.3,-0.4,0.5,0.1,0.007,-3.7,0.4,-3.3,-0.1
4,Kim Anderson,anderki01,1979,SF,23,POR,21,,10.7,1.1,...,19.8,19.6,-0.6,0.2,-0.4,-0.078,-6.1,-1.3,-7.5,-0.4


In [4]:
odr = OpeningDayRoster(2023)

KeyError: 'Player'

In [42]:
lplayers, lteams = [], []
for team, players in odr.items():

    for player in players :
        lplayers.append(player)
        lteams.append(tm_map.get(team))

In [50]:
rosters = pd.DataFrame(zip(lplayers, lteams), columns = ['Player', 'Tm'])

In [47]:
stats = data[data['Year'] == 2023]

In [55]:
[p for p in rosters.merge(stats, on = 'Player', how = 'outer').sort_values('Player').Player]

['A.J. Green',
 'A.J. Griffin, Jr.',
 'A.J. Lawson',
 'AJ Griffin',
 'Aaron Gordon',
 'Aaron Holiday',
 'Aaron Nesmith',
 'Aaron Wiggins',
 'Admiral Schofield',
 'Al Horford',
 'Alec Burks',
 'Aleksej Pokusevski',
 'Alex Caruso',
 'Alex Len',
 'Alize Johnson',
 'Alondes Williams',
 'Alperen Sengun',
 'Alperen Şengün',
 'Amir Coffey',
 'Andre Drummond',
 'Andre Iguodala',
 'Andrew Nembhard',
 'Andrew Wiggins',
 'Anfernee Simons',
 'Anthony Davis',
 'Anthony Edwards',
 'Anthony Gill',
 'Anthony Lamb',
 'Austin Reaves',
 'Austin Rivers',
 'Ayo Dosunmu',
 'B.J. Boston, Jr.',
 'Bam Adebayo',
 'Ben Simmons',
 'Bennedict Mathurin',
 'Bismack Biyombo',
 'Blake Griffin',
 'Blake Wesley',
 'Boban Marjanovic',
 'Boban Marjanović',
 'Bobby Portis',
 'Bogdan Bogdanovic',
 'Bogdan Bogdanović',
 'Bojan Bogdanovic',
 'Bojan Bogdanović',
 'Bol Bol',
 'Bones Hyland',
 'Bradley Beal',
 'Brandon Boston Jr.',
 'Brandon Clarke',
 'Brandon Ingram',
 'Braxton Key',
 'Brook Lopez',
 'Bruce Brown',
 'Bruce Brow

In [213]:
Odr = pd.read_csv('data/OpeningDayRosters_1979-2023.csv').rename(columns = {'Player':'PlayerODR'})
Ps = pd.read_csv('data/PlayersStats_1979-2023.csv')

In [214]:
def cleanName(input) :
    x = input.replace(',','').replace('.', '').replace("'", '').replace('-', '')
    words = x.split(' ')
    words = [x for x in words if x not in ['Jr', 'Sr', 'III', 'IV', 'II']]
    x = ' '.join(words)
    x = unidecode.unidecode(x)
    x = x.upper()
    return x

In [215]:
Odr['clean_name'] = Odr['PlayerODR'].apply(cleanName)
Ps['clean_name'] = Ps['Player'].apply(cleanName)

In [168]:
df = Odr.merge(Ps, on = ['clean_name', 'Year'], how = 'left')
df.head()

Unnamed: 0,PlayerODR,Year,clean_name,Player
0,Tiny Archibald,1979,TINY ARCHIBALD,Tiny Archibald
1,Dennis Awtrey,1979,DENNIS AWTREY,Dennis Awtrey
2,Marvin Barnes,1979,MARVIN BARNES,Marvin Barnes
3,Don Chaney,1979,DON CHANEY,Don Chaney
4,Dave Cowens,1979,DAVE COWENS,Dave Cowens


In [169]:
df_needmatch = df[df['Player'].isna()]
df_needmatch

Unnamed: 0,PlayerODR,Year,clean_name,Player
8,Joe Pace,1979,JOE PACE,
41,Mo Cheeks,1979,MO CHEEKS,
87,Jim Bostic,1979,JIM BOSTIC,
93,Robert Hawkins,1979,ROBERT HAWKINS,
98,John Shumate,1979,JOHN SHUMATE,
...,...,...,...,...
17848,K.J. Martin,2023,KJ MARTIN,
17867,"Kenny Lofton, Jr.",2023,KENNY LOFTON,
17872,"Vincent Williams, Jr.",2023,VINCENT WILLIAMS,
17879,Herb Jones,2023,HERB JONES,


In [150]:
best_matches, similarities = [], []
for i, player in enumerate(df_needmatch['clean_name']) :
    print(i+1, '/', len(df_needmatch), end = '\r')
    best_match = max(list(Ps['clean_name']), key = lambda name2: fuzz.partial_ratio(player, name2))
    best_matches.append(best_match)

    similarity = fuzzy_match_names(player, best_match)
    similarities.append(similarity)
df_needmatch['best_match'] = best_matches
df_needmatch['similarity'] = similarities

372 / 372

In [170]:
df_needmatch['exists_in_player_stats'] = df_needmatch['clean_name'].apply(lambda x: x in list(Ps.clean_name.unique()))
df_needmatch.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_needmatch['exists_in_player_stats'] = df_needmatch['clean_name'].apply(lambda x: x in list(Ps.clean_name.unique()))


Unnamed: 0,PlayerODR,Year,clean_name,Player,exists_in_player_stats
8,Joe Pace,1979,JOE PACE,,False
41,Mo Cheeks,1979,MO CHEEKS,,False
87,Jim Bostic,1979,JIM BOSTIC,,False
93,Robert Hawkins,1979,ROBERT HAWKINS,,False
98,John Shumate,1979,JOHN SHUMATE,,True


In [173]:
df_needmatch = df_needmatch.groupby('exists_in_player_stats').get_group(False)
df_needmatch

Unnamed: 0,PlayerODR,Year,clean_name,Player,exists_in_player_stats
8,Joe Pace,1979,JOE PACE,,False
41,Mo Cheeks,1979,MO CHEEKS,,False
87,Jim Bostic,1979,JIM BOSTIC,,False
93,Robert Hawkins,1979,ROBERT HAWKINS,,False
122,Joe C. Meriweather,1979,JOE C MERIWEATHER,,False
...,...,...,...,...,...
17848,K.J. Martin,2023,KJ MARTIN,,False
17867,"Kenny Lofton, Jr.",2023,KENNY LOFTON,,False
17872,"Vincent Williams, Jr.",2023,VINCENT WILLIAMS,,False
17879,Herb Jones,2023,HERB JONES,,False


In [162]:
name_map = {'ALEK RADOJEVIC': 'ALEKSANDAR RADOJEVIC',
 'BJ BOSTON': 'BRANDON BOSTON',
 'BABBY ARAUJO': 'RAFAEL ARAUJO',
 'BATEER MENGKE': 'Mengke Bateer',
 'BOBBY HANSEN': 'BOB HANSEN',
 'CADILLAC ANDERSON': 'Greg Anderson',
 'CAM REYNOLDS': 'CAMERON REYNOLDS',
 'CHARLES A JONES': 'CHARLES JONES',
 'CHARLES ALEXANDER JONES': 'CHARLES JONES',
 'CHARLES CORNELIUS SMITH': 'CHARLES SMITH',
 'CHARLES DANIEL SMITH': 'CHARLES SMITH',
 'CHARLES EDWARD SMITH': 'CHARLES SMITH',
 'CHARLES R JONES': 'CHARLES JONES',
 'CHEICKH SAMB': 'CHEIKH SAMB',
 'CHET HOLMGREN': 'CHET HOLMGREN',
 'COLLIN GILLESPIE': 'COLLIN GILLESPIE',
 'CORRY CARR': 'CORY CARR',
 'DAVID GREENWOOD': 'DAVE GREENWOOD',
 'DELMER BESHORE': 'DEL BESHORE',
 'EJ LIDDELL': 'EJ LIDDELL',
 'FLIP MURRAY': 'Ronald Murray',
 'FRANKIE WILLIAMS': 'Frank Williams',
 'GEFF CROMPTON': 'GEOFF CROMPTON',
 'GEORGE MURESAN': 'Gheorghe Mureșan',
 'GEORGE PAPAGIANNIS': 'GEORGIOS PAPAGIANNIS',
 'HERB JONES': 'HERBERT JONES',
 'IBRAHIM KUTLUAY': 'IBO KUTLUAY',
 'IKE AUSTIN': 'ISAAC AUSTIN',
 'JR RIDER': 'Isaiah Rider',
 'JAKE WILEY': 'Jacob Wiley',
 'JAMES HUFF': 'JAY HUFF',
 'JAMES MCADOO': 'James Michael McAdoo',
 'JEFFERY TAYLOR': 'JEFF TAYLOR',
 'JIANLIAN YI': 'YI JIANLIAN',
 'JOE C MERIWEATHER': 'JOE MERIWEATHER',
 'JOSE RAFAEL ORTIZRIJOS': 'JOSE ORTIZ',
 'JOSHUA MAGETTE': 'JOSH MAGETTE',
 'KJ MARTIN': 'KENYON MARTIN',
 'KEENAN EVANS': 'EARL EVANS',
 'KENNY LOFTON': 'KENNETH LOFTON',
 'MARCELINHO HUERTAS': 'MARCELO HUERTAS',
 'MARCUS D WILLIAMS': 'Marcus Williams',
 'MARQUINHOS VINICIUS': 'MARCUS VINICIUS',
 'MATT DELLAVEDOVA': 'MATTHEW DELLAVEDOVA',
 'MAYBYNER NENE': 'NENE',
 'MEL TURPIN': 'MELVIN TURPIN',
 'MICHAEL PATRICK GBINIJE': 'Michael Gbinije',
 'MIKE HOLTON': 'Michael Holton',
 'MO CHEEKS': 'MAURICE CHEEKS',
 'MO TAYLOR': 'Fatty Taylor',
 'MOE WAGNER': 'Moritz Wagner',
 'MOHAMED BAMBA': 'MO BAMBA',
 'NATHANIEL HINTON': 'NATE HINTON',
 'NAZ LONG': 'Naz Mitrou-Long',
 'NICOLAS CLAXTON': 'Nic Claxton',
 'NORMAN RICHARDSON': 'Norm Richardson',
 'OGNEN KUZMIC': 'OGNJEN KUZMIC',
 'OLIVER MACK': 'OLLIE MACK',
 'PEJA DROBNJAK': 'PREDRAG DROBNJAK',
 'PENNY HARDAWAY': 'ANFERNEE HARDAWAY',
 'PIG MILLER': 'Anthony Miller',
 'PIPOKA VIANNA': 'JOAO VIANNA',
 'POOH JETER': 'Eugene Jeter',
 'RAULZINHO NETO': 'RAUL NETO',
 'RAYMOND SPALDING': 'RAY SPALDING',
 'RICKY CALLOWAY': 'RICK CALLOWAY',
 'ROBERT HAWKINS': 'Bubbles Hawkins',
 'RONNIE GRANDISON': 'RON GRANDISON',
 'SAER SENE': 'MOUHAMED SENE',
 'SASA DJORDJEVIC': 'ALEKSANDAR DJORDJEVIC',
 'SEAN CHRISTIAN SMITH': 'Chris Smith',
 'SERGUEI BAZAREVITCH': 'SERGEI BAZAREVICH',
 'SLAVA MEDVEDENKO': 'STANISLAV MEDVEDENKO',
 'SQUEAKY JOHNSON': 'Carldell Johnson',
 'VINCENT WILLIAMS': 'Vince Williams Jr.',
 'VITOR FAVERANI': 'Vítor Luiz Faverani',
 'WAYNE ENGLESTAD': 'WAYNE ENGELSTAD',
 'WESLEY IWUNDU': 'WES IWUNDU',
 'WILL CUNNINGHAM': 'WILLIAM CUNNINGHAM',
 'YANN ULRICH STEPHANE LASME': 'STEPHANE LASME',
 'YUE SUN': 'Sun Yue',
 'ZHIZHI WANG': 'WANG ZHIZHI'}

In [174]:
name_Map = {}
for key, val in name_map.items() :
    name_Map[key] = cleanName(val)

In [180]:
df_needmatch = df_needmatch[df_needmatch['clean_name'].isin(list(name_Map.keys()))]

In [184]:
df_needmatch['clean_name'] = df_needmatch['clean_name'].apply(lambda x: name_Map.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_needmatch['clean_name'] = df_needmatch['clean_name'].apply(lambda x: name_Map.get(x))


In [185]:
df_needmatch

Unnamed: 0,PlayerODR,Year,clean_name,Player,exists_in_player_stats
41,Mo Cheeks,1979,MAURICE CHEEKS,,False
93,Robert Hawkins,1979,BUBBLES HAWKINS,,False
122,Joe C. Meriweather,1979,JOE MERIWEATHER,,False
150,Geff Crompton,1979,GEOFF CROMPTON,,False
304,Joe C. Meriweather,1980,JOE MERIWEATHER,,False
...,...,...,...,...,...
17848,K.J. Martin,2023,KENYON MARTIN,,False
17867,"Kenny Lofton, Jr.",2023,KENNETH LOFTON,,False
17872,"Vincent Williams, Jr.",2023,VINCE WILLIAMS,,False
17879,Herb Jones,2023,HERBERT JONES,,False


In [186]:
df_nm = df_needmatch.merge(Ps, on = ['clean_name', 'Year'], how = 'left')

In [216]:
Odr['clean_name'] = Odr['clean_name'].apply(lambda x: name_Map.get(x) if x in list(name_Map.keys()) else x)

In [222]:
df = Odr.merge(Ps, on = ['clean_name', 'Year'], how = 'left').drop(columns = ['Tm'])
df.head()

Unnamed: 0,PlayerODR,Year,OpeningDayTm,clean_name,Player,href,Pos,Age,G,GS,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Tiny Archibald,1979,BOS,TINY ARCHIBALD,Tiny Archibald,architi01,PG,30.0,69.0,,...,21.8,21.3,1.0,0.6,1.6,0.046,-0.5,-2.1,-2.6,-0.3
1,Dennis Awtrey,1979,BOS,DENNIS AWTREY,Dennis Awtrey,awtrede01,C,30.0,63.0,,...,28.3,9.6,-0.1,0.9,0.7,0.046,-3.5,1.2,-2.3,-0.1
2,Marvin Barnes,1979,BOS,MARVIN BARNES,Marvin Barnes,barnema01,PF,26.0,38.0,,...,18.5,18.1,0.1,0.9,1.0,0.06,-2.7,1.2,-1.5,0.1
3,Don Chaney,1979,BOS,DON CHANEY,Don Chaney,chanedo01,SG,32.0,65.0,,...,13.1,18.1,-0.3,1.0,0.6,0.027,-3.5,0.8,-2.7,-0.2
4,Dave Cowens,1979,BOS,DAVE COWENS,Dave Cowens,cowenda01,C,30.0,68.0,,...,13.7,19.6,2.4,2.4,4.8,0.091,0.1,0.2,0.3,1.5


In [223]:
df.to_csv('PlayerStats_withODTm_1979-2023.csv', index = None)