Scraping Kenpom to get data on past and current tournament teams

#### Inspiration code source: 
    
    https://www.kaggle.com/walterhan/scrape-kenpom-data
    
    https://www.kaggle.com/code/matthewdenko/kenneth-pomeroy-data-scraper-no-leakage/edit

#### Update from prior year:
using archived values prior to tourament to get kempom rankings prior to tournament performance

# Imports

In [71]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
import os

In [None]:
# Set Path
PATH = ''
os.chdir(PATH)

# Scraper

In [72]:
base_urls=[
          'https://web.archive.org/web/20110311233233/http://www.kenpom.com/',
          'https://web.archive.org/web/20120311165019/http://kenpom.com/',
          'https://web.archive.org/web/20130318221134/http://kenpom.com/',
          'https://web.archive.org/web/20140318100454/http://kenpom.com/',
          'https://web.archive.org/web/20150316212936/http://kenpom.com/',
          'https://web.archive.org/web/20160314134726/http://kenpom.com/',
          'https://web.archive.org/web/20170312131016/http://kenpom.com/',
          'https://web.archive.org/web/20180311122559/https://kenpom.com/',
          'https://web.archive.org/web/20190317211809/https://kenpom.com/',
          # Tournament wasn't played on 2020
          'https://web.archive.org/web/20210314233855/http://kenpom.com/',
          'https://web.archive.org/web/20220313171046/https://kenpom.com/',
          'https://web.archive.org/web/20230215165311/https://kenpom.com/',
          'https://kenpom.com/'
         ]

years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2021, 2022, 2023]

def scrap_archive(url,year):
    """
    Imports raw data from a kenpom archive into a dataframe
    """
    
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    table_full = soup.find_all('table', {'id': 'ratings-table'})

    thead = table_full[0].find_all('thead')
    table = table_full[0]
    
    for weird in thead:
        table = str(table).replace(str(weird), '')

    df = pd.read_html(table)[0]
    df['year'] = year
    
    return df
    

In [73]:
def scraping(df,years):
    
    for url, year in zip(base_urls, years):
    
        print(f'Scrapping: {url}')
        archive=scrap_archive(url,year)
        
        df = pd.concat( (df, archive), axis=0) 
        year += 1
    df.columns = ['Rank', 'Team', 'Conference', 'W-L', 'Pyth', 
             'AdjustO', 'AdjustO Rank', 'AdjustD', 'AdjustD Rank',
             'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank', 
             'SOS Pyth', 'SOS Pyth Rank', 'SOS OppO', 'SOS OppO Rank',
             'SOS OppD', 'SOS OppD Rank', 'NCSOS Pyth', 'NCSOS Pyth Rank', 'Year']        
    return df

In [74]:
df=None
df=scraping(df,years)
df.head()

Scrapping: https://web.archive.org/web/20110311233233/http://www.kenpom.com/




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Scrapping: https://web.archive.org/web/20120311165019/http://kenpom.com/
Scrapping: https://web.archive.org/web/20130318221134/http://kenpom.com/
Scrapping: https://web.archive.org/web/20140318100454/http://kenpom.com/
Scrapping: https://web.archive.org/web/20150316212936/http://kenpom.com/
Scrapping: https://web.archive.org/web/20160314134726/http://kenpom.com/
Scrapping: https://web.archive.org/web/20170312131016/http://kenpom.com/
Scrapping: https://web.archive.org/web/20180311122559/https://kenpom.com/
Scrapping: https://web.archive.org/web/20190317211809/https://kenpom.com/
Scrapping: https://web.archive.org/web/20210314233855/http://kenpom.com/
Scrapping: https://web.archive.org/web/20220313171046/https://kenpom.com/
Scrapping: https://web.archive.org/web/20230215165311/https://kenpom.com/


Unnamed: 0,Rank,Team,Conference,W-L,Pyth,AdjustO,AdjustO Rank,AdjustD,AdjustD Rank,AdjustT,...,Luck Rank,SOS Pyth,SOS Pyth Rank,SOS OppO,SOS OppO Rank,SOS OppD,SOS OppD Rank,NCSOS Pyth,NCSOS Pyth Rank,Year
0,1,Ohio St.,B10,29-2,0.9824,125.4,2,88.4,10,66.0,...,72,0.734,26,107.4,17,98.3,46,0.4219,245,2011
1,2,Duke,ACC,27-4,0.972,118.8,6,87.2,6,70.1,...,171,0.7254,31,106.0,34,97.4,19,0.6092,71,2011
2,3,Kansas,B12,30-2,0.9711,119.8,4,88.3,9,69.6,...,30,0.6956,42,106.1,33,98.7,59,0.4944,178,2011
3,4,Texas,B12,26-6,0.9657,114.0,23,85.3,1,67.2,...,297,0.708,38,105.6,43,97.8,31,0.5844,85,2011
4,5,Purdue,B10,25-6,0.9641,116.1,12,87.2,5,67.1,...,191,0.7713,13,108.1,13,97.3,14,0.5076,166,2011


# Map Team ID to Kenpom Data

In [76]:
# Lambda that returns true if given string is a number and a valid seed number (1-16)
valid_seed = lambda x: True if str(x).replace(' ', '').isdigit() \
                and int(x) > 0 and int(x) <= 16 else False

# Use lambda to parse out seed/team
df['Seed'] = df['Team'].apply(lambda x: x[-2:].replace(' ', '') \
                              if valid_seed(x[-2:]) else np.nan )


df['Team'] = df['Team'].apply(lambda x: x[:-2] if valid_seed(x[-2:]) else x)

# Split W-L column into wins and losses
df['Wins'] = df['W-L'].apply(lambda x: int(re.sub('-.*', '', x)) )
df['Losses'] = df['W-L'].apply(lambda x: int(re.sub('.*-', '', x)) )
df.drop('W-L', inplace=True, axis=1)


# Reorder columns just cause I'm OCD
df=df[[ 'Year', 'Rank', 'Team', 'Conference', 'Wins', 'Losses', 'Seed','Pyth', 
             'AdjustO', 'AdjustO Rank', 'AdjustD', 'AdjustD Rank',
             'AdjustT', 'AdjustT Rank', 'Luck', 'Luck Rank', 
             'SOS Pyth', 'SOS Pyth Rank', 'SOS OppO', 'SOS OppO Rank',
             'SOS OppD', 'SOS OppD Rank', 'NCSOS Pyth', 'NCSOS Pyth Rank']]
             

In [77]:
df.Team=df.Team.apply(lambda x: x.replace('-',' '))
df.Team=df.Team.apply(lambda x: x.lower())
df.Team=df.Team.apply(lambda x: x.strip())
df.Team=df.Team.replace('mississippi valley st.','mississippi valley state')
#df.Team=df.Team.replace('texas a&m corpus chris','texas a&m corpus christi')
df.Team=df.Team.replace('dixie st.','dixie st')
df.Team=df.Team.replace('st. francis pa','st francis pa')
df.Team=df.Team.replace('ut rio grande valley','texas rio grande valley')
df.Team=df.Team.replace('southeast missouri st.','southeast missouri state')
df.Team=df.Team.replace('tarleton st.','tarleton st')
df.Team=df.Team.replace('liu','liu brooklyn')
df.Team=df.Team.replace('cal st. bakersfield','cal state bakersfield')

df.Team=df.Team.replace('virginia military inst','virginia military	')
df.Team=df.Team.replace('louisiana saint','louisiana state')
df.Team=df.Team.replace('nj inst of technology','njit')

df.Team=df.Team.replace('texas a&m corpus chris','texas a&m corpus')
df.Team=df.Team.replace('md baltimore county','maryland baltimore county')

df.Team=df.Team.replace('southwest missouri saint','southwest missouri state')
df.Team=df.Team.replace('southwest texas saint','southwest texas saint')
df.Team=df.Team.replace('winston salem saint','winston salem saint')

#-------------------------------------------------------
# merge with spelling file to get the TeamID
spelling=pd.read_csv("inputs/MTeamSpellings.csv",encoding='cp1252')
spelling.columns=['Team','TeamID']
spelling.Team=spelling.Team.apply(lambda x: x.replace('-',' '))
df.Team=df.Team.apply(lambda x: x.strip())


df=df.merge(spelling[['Team','TeamID']],on='Team',how='left')

df.Team=df.Team.apply(lambda x: x.replace('st.','saint'))
df.Team=df.Team.apply(lambda x: x.replace(';',''))
df.Team=df.Team.apply(lambda x: x.replace('\t',''))
df.Team=df.Team.replace('texas a&m corpus chris','texas a&m corpus')
df.Team=df.Team.replace('louisiana saint','louisiana state')
df.Team=df.Team.replace('southwest missouri saint','southwest missouri state')
df.Team=df.Team.replace('southwest texas saint','texas state')
df.Team=df.Team.replace('winston salem saint','winston salem state')

df=df.merge(spelling[['Team','TeamID']],on='Team',how='left')

df.TeamID_x.fillna(df.TeamID_y, inplace=True)


df=df.drop(['TeamID_y','Conference','Wins','Losses','Seed'],axis = 1)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,Year,Rank,Team,Pyth,AdjustO,AdjustO Rank,AdjustD,AdjustD Rank,AdjustT,AdjustT Rank,...,Luck Rank,SOS Pyth,SOS Pyth Rank,SOS OppO,SOS OppO Rank,SOS OppD,SOS OppD Rank,NCSOS Pyth,NCSOS Pyth Rank,TeamID_x
0,2011,1,ohio saint,0.9824,125.4,2,88.4,10,66.0,213,...,72,0.734,26,107.4,17,98.3,46,0.4219,245,
1,2011,2,duke,0.972,118.8,6,87.2,6,70.1,40,...,171,0.7254,31,106.0,34,97.4,19,0.6092,71,1181.0
2,2011,3,kansas,0.9711,119.8,4,88.3,9,69.6,46,...,30,0.6956,42,106.1,33,98.7,59,0.4944,178,1242.0
3,2011,4,texas,0.9657,114.0,23,85.3,1,67.2,143,...,297,0.708,38,105.6,43,97.8,31,0.5844,85,1400.0
4,2011,5,purdue,0.9641,116.1,12,87.2,5,67.1,149,...,191,0.7713,13,108.1,13,97.3,14,0.5076,166,1345.0


In [78]:
df.columns = map(str.lower, df.columns)
df.columns = df.columns.str.replace(' ', '_')
df.head(25)

Unnamed: 0,year,rank,team,pyth,adjusto,adjusto_rank,adjustd,adjustd_rank,adjustt,adjustt_rank,...,luck_rank,sos_pyth,sos_pyth_rank,sos_oppo,sos_oppo_rank,sos_oppd,sos_oppd_rank,ncsos_pyth,ncsos_pyth_rank,teamid_x
0,2011,1,ohio saint,0.9824,125.4,2,88.4,10,66.0,213,...,72,0.734,26,107.4,17,98.3,46,0.4219,245,
1,2011,2,duke,0.972,118.8,6,87.2,6,70.1,40,...,171,0.7254,31,106.0,34,97.4,19,0.6092,71,1181.0
2,2011,3,kansas,0.9711,119.8,4,88.3,9,69.6,46,...,30,0.6956,42,106.1,33,98.7,59,0.4944,178,1242.0
3,2011,4,texas,0.9657,114.0,23,85.3,1,67.2,143,...,297,0.708,38,105.6,43,97.8,31,0.5844,85,1400.0
4,2011,5,purdue,0.9641,116.1,12,87.2,5,67.1,149,...,191,0.7713,13,108.1,13,97.3,14,0.5076,166,1345.0
5,2011,6,pittsburgh,0.96,119.7,5,90.8,22,63.3,315,...,189,0.7279,28,106.4,29,97.6,23,0.427,239,1338.0
6,2011,7,wisconsin,0.9549,125.8,1,96.4,73,58.0,344,...,149,0.7695,14,108.8,9,98.0,36,0.4649,201,1458.0
7,2011,8,notre dame,0.9505,122.8,3,95.0,58,64.5,274,...,14,0.7433,19,107.1,22,97.7,29,0.4196,247,1323.0
8,2011,8,notre dame,0.9505,122.8,3,95.0,58,64.5,274,...,14,0.7433,19,107.1,22,97.7,29,0.4196,247,1323.0
9,2011,8,notre dame,0.9505,122.8,3,95.0,58,64.5,274,...,14,0.7433,19,107.1,22,97.7,29,0.4196,247,1323.0


# Export File

In [79]:
df.to_csv('inputs/kenpom.csv',index=False)