# Data Preparation and Cleaning

In this notebook, I clean the datasets and join each of them with the game scores, producing combined csv files that can be used later for feature generation.

In [1]:
# Import packages
import sys
# NOTE(review): absolute, machine-specific path -- presumably the project root that
# holds the local collegebasketball package (confirm). On another machine this has
# to be edited before the import below can succeed, unless the package is
# installed some other way.
sys.path.append('/Users/phil/Documents/Documents/College_Basketball')

import pandas as pd
import collegebasketball as cbb
# Display the package version so the cell output records which release was used
cbb.__version__

'0.2'

## Cleaning the Data

First, we need to edit the school names in the kenpom, basic stats and T-Rank datasets to ensure that they match up with the school names from the scores dataset.

In [2]:
# Root folder of the raw statistics files, relative to this notebook.
# This is the same '../Data/' root the later cells use for the scores and the
# combined output, so the notebook no longer depends on one machine's home folder.
# NOTE(review): assumes the notebook is run from its own folder, as the later
# cells already require -- confirm.
path = '../Data/'

# One cleaned dataframe per season for each source, stored in dicts keyed by year
kenpom_data = {}
TRank_data = {}
stats_data = {}

# We need to clean each statistics data set
for year in range(2002, 2020):

    # Load this year's data and clean up the school names to match up with scores data
    data_kenpom = pd.read_csv('{0}Kenpom/{1}_kenpom.csv'.format(path, year))
    kenpom_data[year] = cbb.update_kenpom(data_kenpom)

    # TRank data starts in 2008
    if year > 2007:
        data_TRank = pd.read_csv('{0}TRank/{1}_TRank.csv'.format(path, year))
        TRank_data[year] = cbb.update_TRank(data_TRank)

    # Basic stats data starts in 2010; its team name column is called 'School'
    # and is renamed to 'Team' before cleaning.
    # index=str also converts the row labels to strings; kept exactly as before
    # because the cleaning helper's expectations are not visible from here.
    if year > 2009:
        data_stats = pd.read_csv('{0}SportsReference/{1}_stats.csv'.format(path, year))
        data_stats = data_stats.rename(index=str, columns={'School': 'Team'})
        stats_data[year] = cbb.update_basic(data_stats)

In [3]:
# Peek at the first five rows of the most recent cleaned kenpom table
kenpom_data[2019].head(n=5)

Unnamed: 0,Rank,Team,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,AdjD Rank,...,Luck,Luck Rank,OppAdjEM,OppAdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank
0,1,Virginia,ACC,29,3,35.66,123.6,2,87.9,5,...,0.024,119,10.51,29,108.4,48,97.9,19,-2.82,248
1,2,Gonzaga,WCC,30,3,32.79,125.1,1,92.3,16,...,0.008,160,3.38,83,106.2,76,102.8,102,1.98,109
2,3,Duke,ACC,29,5,31.99,120.1,6,88.1,6,...,0.018,137,13.25,7,110.4,4,97.2,11,5.48,38
3,4,Michigan State,B10,28,6,31.36,121.7,4,90.4,8,...,-0.007,198,13.57,3,110.2,7,96.7,2,3.15,90
4,5,Michigan,B10,28,6,29.44,115.5,18,86.1,2,...,-0.014,227,11.65,17,109.3,29,97.6,16,-4.94,305


## Joining the Datasets

Now that the school names from each data set match up, we can join the kenpom and score data and save the result as csv files.

In [4]:
# Where the raw scores live and where the joined kenpom tables are written
scores_path = '../Data/Scores/'
save_path = '../Data/Combined_Data/Kenpom/'

# Joined tables for each season, keyed by year
regular = {}
march = {}

# Keyword arguments for the two joins: the first matches on the home team, the
# second matches on the away team and labels the two copies of the kenpom columns
home_join = dict(left_on='Home', right_on='Team', sort=False)
away_join = dict(left_on='Away', right_on='Team', suffixes=('_Home', '_Away'), sort=False)

# Join one season at a time so every game is paired with that season's kenpom data
for year in range(2002, 2020):

    ratings = kenpom_data[year]

    # Regular season: load the scores, attach both teams' kenpom data,
    # add the year as the first column and save the season to its own csv file
    regular_season = cbb.load_csv(scores_path + str(year) + '_regular_season.csv')
    regular[year] = pd.merge(pd.merge(regular_season, ratings, **home_join),
                             ratings, **away_join)
    regular[year].insert(0, 'Year', year)
    regular[year].to_csv('{0}{1}_regular_season.csv'.format(save_path, year), index=False)

    # Tournament games go through the same steps, but only for seasons before 2019
    if year < 2019:
        march_madness = cbb.load_csv(scores_path + str(year) + '_march.csv')
        march[year] = pd.merge(pd.merge(march_madness, ratings, **home_join),
                               ratings, **away_join)
        march[year].insert(0, 'Year', year)
        march[year].to_csv('{0}{1}_march.csv'.format(save_path, year), index=False)

# Let's take a look at one of the data sets
regular[2013].head()

Unnamed: 0,Year,Home,Away,Home_Score,Away_Score,Rank_Home,Team_Home,Conf_Home,Wins_Home,Losses_Home,...,Luck_Away,Luck Rank_Away,OppAdjEM_Away,OppAdjEM Rank_Away,OppO_Away,OppO Rank_Away,OppD_Away,OppD Rank_Away,NCSOS AdjEM_Away,NCSOS AdjEM Rank_Away
0,2013,Bryant,Indiana,54,97,176,Bryant,NEC,19,12,...,-0.021,239,8.86,17,106.7,23,97.8,14,-5.16,294
1,2013,Michigan State,Indiana,70,75,10,Michigan State,B10,27,9,...,-0.021,239,8.86,17,106.7,23,97.8,14,-5.16,294
2,2013,Coppin State,Indiana,51,87,313,Coppin State,MEAC,8,24,...,-0.021,239,8.86,17,106.7,23,97.8,14,-5.16,294
3,2013,Jacksonville,Indiana,59,93,288,Jacksonville,ASun,14,18,...,-0.021,239,8.86,17,106.7,23,97.8,14,-5.16,294
4,2013,Mount St. Mary's,Indiana,54,93,211,Mount St. Mary's,NEC,18,14,...,-0.021,239,8.86,17,106.7,23,97.8,14,-5.16,294


Now that we have joined the tables for each year, we can combine all of the data into two larger tables and then save the larger tables to csv files.

In [5]:
# Stack the per-year regular season tables into one table and save it
regular_table = pd.concat(regular)
regular_table.to_csv('../Data/Combined_Data/Kenpom/regular_season.csv', index=False)

# Do the same for the tournament tables
march_table = pd.concat(march)
march_table.to_csv('../Data/Combined_Data/Kenpom/march.csv', index=False)

# Report how many regular season games ended up in the combined table
print('Total Number of Games in the Kenpom Data: {}'.format(regular_table.shape[0]))

Total Number of Games in the Kenpom Data: 91210


Next, we will join the TRank data with the game scores to form a single csv file.

In [6]:
# Where the raw scores live and where the joined TRank tables are written
scores_path = '../Data/Scores/'
save_path = '../Data/Combined_Data/TRank/'

# Joined tables for each season, keyed by year
regular = {}
march = {}

# Keyword arguments for the two joins: the first matches on the home team, the
# second matches on the away team and labels the two copies of the TRank columns
home_join = dict(left_on='Home', right_on='Team', sort=False)
away_join = dict(left_on='Away', right_on='Team', suffixes=('_Home', '_Away'), sort=False)

# Join one season at a time so every game is paired with that season's TRank data
for year in range(2008, 2020):

    ratings = TRank_data[year]

    # Regular season: load the scores, attach both teams' TRank data,
    # add the year as the first column and save the season to its own csv file
    regular_season = cbb.load_csv(scores_path + str(year) + '_regular_season.csv')
    regular[year] = pd.merge(pd.merge(regular_season, ratings, **home_join),
                             ratings, **away_join)
    regular[year].insert(0, 'Year', year)
    regular[year].to_csv('{0}{1}_regular_season.csv'.format(save_path, year), index=False)

    # Tournament games go through the same steps, but only for seasons before 2019
    if year < 2019:
        march_madness = cbb.load_csv(scores_path + str(year) + '_march.csv')
        march[year] = pd.merge(pd.merge(march_madness, ratings, **home_join),
                               ratings, **away_join)
        march[year].insert(0, 'Year', year)
        march[year].to_csv('{0}{1}_march.csv'.format(save_path, year), index=False)

# Let's take a look at one of the data sets
regular[2013].head()

Unnamed: 0,Year,Home,Away,Home_Score,Away_Score,Rk_Home,Team_Home,Conf_Home,G_Home,Rec_Home,...,2P%D_Away,2P%D Rank_Away,3P%_Away,3P% Rank_Away,3P%D_Away,3P%D Rank_Away,Adj T._Away,Adj T. Rank_Away,WAB_Away,WAB Rank_Away
0,2013,Bryant,Indiana,54,97,169,Bryant,NEC,31,19-12,...,43.2,29,40.3,5,30.4,29,67.8,92,7.8,2
1,2013,Michigan State,Indiana,70,75,11,Michigan State,B10,35,26-9,...,43.2,29,40.3,5,30.4,29,67.8,92,7.8,2
2,2013,Coppin State,Indiana,51,87,309,Coppin State,MEAC,31,7-24,...,43.2,29,40.3,5,30.4,29,67.8,92,7.8,2
3,2013,Jacksonville,Indiana,59,93,295,Jacksonville,ASun,30,12-18,...,43.2,29,40.3,5,30.4,29,67.8,92,7.8,2
4,2013,Mount St. Mary's,Indiana,54,93,209,Mount St. Mary's,NEC,32,18-14,...,43.2,29,40.3,5,30.4,29,67.8,92,7.8,2


In [7]:
# Stack the per-year regular season tables into one table and save it
regular_table = pd.concat(regular)
regular_table.to_csv('../Data/Combined_Data/TRank/regular_season.csv', index=False)

# Do the same for the tournament tables
march_table = pd.concat(march)
march_table.to_csv('../Data/Combined_Data/TRank/march.csv', index=False)

# Report how many regular season games ended up in the combined table
print('Total Number of Games in the TRank Data: {}'.format(regular_table.shape[0]))

Total Number of Games in the TRank Data: 63246


Lastly, we will run the same process for the basic statistics.

In [8]:
# Where the raw scores live and where the joined basic stats tables are written
scores_path = '../Data/Scores/'
save_path = '../Data/Combined_Data/Basic/'

# Joined tables for each season, keyed by year
regular = {}
march = {}

# Keyword arguments for the two joins: the first matches on the home team, the
# second matches on the away team and labels the two copies of the stats columns
home_join = dict(left_on='Home', right_on='Team', sort=False)
away_join = dict(left_on='Away', right_on='Team', suffixes=('_Home', '_Away'), sort=False)

# Join one season at a time so every game is paired with that season's basic stats
for year in range(2010, 2020):

    team_stats = stats_data[year]

    # Regular season: load the scores, attach both teams' basic stats,
    # add the year as the first column and save the season to its own csv file
    regular_season = cbb.load_csv(scores_path + str(year) + '_regular_season.csv')
    regular[year] = pd.merge(pd.merge(regular_season, team_stats, **home_join),
                             team_stats, **away_join)
    regular[year].insert(0, 'Year', year)
    regular[year].to_csv('{0}{1}_regular_season.csv'.format(save_path, year), index=False)

    # Tournament games go through the same steps, but only for seasons before 2019
    if year < 2019:
        march_madness = cbb.load_csv(scores_path + str(year) + '_march.csv')
        march[year] = pd.merge(pd.merge(march_madness, team_stats, **home_join),
                               team_stats, **away_join)
        march[year].insert(0, 'Year', year)
        march[year].to_csv('{0}{1}_march.csv'.format(save_path, year), index=False)

# Let's take a look at one of the data sets
regular[2013].head()

Unnamed: 0,Year,Home,Away,Home_Score,Away_Score,Team_Home,G_Home,SRS_Home,SOS_Home,Tm._Home,...,FT_Away,FTA_Away,FT%_Away,ORB_Away,TRB_Away,AST_Away,STL_Away,BLK_Away,TOV_Away,PF_Away
0,2013,Bryant,Indiana,54,97,Bryant,31,-2.08,-4.72,2294,...,672,904,0.743,439,1270,514,256,130,469,598
1,2013,Michigan State,Indiana,70,75,Michigan State,36,18.64,10.55,2448,...,672,904,0.743,439,1270,514,256,130,469,598
2,2013,Coppin State,Indiana,51,87,Coppin State,32,-12.65,-4.2,2035,...,672,904,0.743,439,1270,514,256,130,469,598
3,2013,Jacksonville,Indiana,59,93,Jacksonville,32,-10.47,-4.74,2181,...,672,904,0.743,439,1270,514,256,130,469,598
4,2013,Mount St. Mary's,Indiana,54,93,Mount St. Mary's,32,-5.85,-4.1,2233,...,672,904,0.743,439,1270,514,256,130,469,598


In [9]:
# Stack the per-year regular season tables into one table and save it
regular_table = pd.concat(regular)
regular_table.to_csv('../Data/Combined_Data/Basic/regular_season.csv', index=False)

# Do the same for the tournament tables
march_table = pd.concat(march)
march_table.to_csv('../Data/Combined_Data/Basic/march.csv', index=False)

# Report how many regular season games ended up in the combined table
print('Total Number of Games in the Basic Stats Data: {}'.format(regular_table.shape[0]))

Total Number of Games in the Basic Stats Data: 53102
