# Data Preparation and Cleaning

In this notebook, I clean the datasets and combine them into two csv files (one for regular season games and one for March Madness games) that can be used later for feature generation.

In [1]:
# Import packages
import sys
# Extend the module search path before collegebasketball is imported below
# (presumably this directory is where the local package lives -- confirm)
sys.path.append('/Users/phil/Documents/Documents/College Basketball')

import pandas as pd
import collegebasketball as cbb
# Display the package version so the cell output records which one was used
cbb.__version__

'0.1'

## Cleaning the Data

First, we need to edit the school names in the kenpom datasets to ensure that they match up with the school names from the scores datasets.

In [2]:
# Directory that holds the yearly kenpom csv files
path = '/Users/phil/Documents/Documents/College Basketball/Data/Kenpom/'

# Map each year (2002 through 2017) to its kenpom dataframe. Every frame is
# passed through cbb.update_kenpom right after loading so that its school
# names match up with the scores data.
kenpom_data = {
    year: cbb.update_kenpom(cbb.load_csv(path + str(year) + '_kenpom.csv'))
    for year in range(2002, 2018)
}

In [3]:
# Preview the first rows of the cleaned 2013 kenpom data as a sanity check
kenpom_2013 = kenpom_data[2013]
kenpom_2013.head()

Unnamed: 0,Rank,Team,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,AdjD Rank,...,Luck,Luck Rank,OppAdjEM,OppAdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank
0,1,Louisville,BE,35,5,32.92,117.7,7,84.8,1,...,-0.016,230,10.23,7,107.4,11,97.2,6,2.15,100
1,2,Florida,SEC,29,8,31.18,117.2,9,86.0,3,...,-0.089,332,7.0,41,105.6,63,98.6,36,4.09,62
2,3,Indiana,B10,29,7,29.31,120.8,2,91.5,19,...,-0.021,239,8.86,17,106.7,23,97.8,14,-5.16,294
3,4,Michigan,B10,31,8,27.86,121.9,1,94.0,37,...,-0.007,197,11.13,3,108.2,5,97.0,3,-3.09,257
4,5,Gonzaga,WCC,32,3,27.81,120.5,3,92.7,30,...,0.026,116,4.37,87,105.4,75,101.0,102,5.03,39


## Joining the Datasets

Now that the school names in each data set match up, we can join the kenpom and scores data for each year, so that the joined tables can later be combined and saved as csv files.

In [4]:
# Directory that holds the yearly scores csv files
scores_path = '/Users/phil/Documents/Documents/College Basketball/Data/Scores/'


def attach_kenpom(games, ratings):
    """Return `games` with the kenpom columns of both teams attached.

    `ratings` is merged in twice: first by matching its 'Team' column
    against 'Home', then against 'Away'. The '_Home' / '_Away' suffixes are
    only passed to the second merge, which is where the kenpom columns added
    by the first merge collide with the same columns being added again.
    Both merges use the pandas default (inner) join.
    """
    with_home = pd.merge(games, ratings, left_on='Home', right_on='Team', sort=False)
    return pd.merge(with_home, ratings, left_on='Away', right_on='Team',
                    suffixes=('_Home', '_Away'), sort=False)


# Joined tables, keyed by year
regular = {}
march = {}

# Datasets are joined one year at a time
for year in range(2002, 2018):
    ratings = kenpom_data[year]
    season_files = scores_path + str(year)

    # Load the scores datasets for this year
    regular_season = cbb.load_csv(season_files + '_regular_season.csv')
    march_madness = cbb.load_csv(season_files + '_march.csv')

    for joined_by_year, games in ((regular, regular_season), (march, march_madness)):
        joined = attach_kenpom(games, ratings)
        # Put the year in the first column
        joined.insert(0, 'Year', year)
        joined_by_year[year] = joined

# Preview one of the joined data sets
regular[2013].head()

Unnamed: 0,Year,Home,Away,Home_Score,Away_Score,Rank_Home,Team_Home,Conf_Home,Wins_Home,Losses_Home,...,Luck_Away,Luck Rank_Away,OppAdjEM_Away,OppAdjEM Rank_Away,OppO_Away,OppO Rank_Away,OppD_Away,OppD Rank_Away,NCSOS AdjEM_Away,NCSOS AdjEM Rank_Away
0,2013,South Dakota State,Alabama,67,70,96,South Dakota State,Sum,25,10,...,0.001,175,4.52,83,104.8,94,100.3,83,1.07,127
1,2013,Charleston Southern,Alabama,46,59,179,Charleston Southern,BSth,19,13,...,0.001,175,4.52,83,104.8,94,100.3,83,1.07,127
2,2013,Georgia,Alabama,58,61,113,Georgia,SEC,15,17,...,0.001,175,4.52,83,104.8,94,100.3,83,1.07,127
3,2013,Kentucky,Alabama,55,59,55,Kentucky,SEC,21,12,...,0.001,175,4.52,83,104.8,94,100.3,83,1.07,127
4,2013,LSU,Alabama,57,60,102,LSU,SEC,19,12,...,0.001,175,4.52,83,104.8,94,100.3,83,1.07,127


Now that we have joined the tables for each year, we can combine all of the data into two larger tables and then save the larger tables to csv files.

In [5]:
# Concatenate the per-year tables into one regular season table and one march table
regular_table, march_table = (pd.concat(by_year) for by_year in (regular, march))

# Row count of the combined regular season table
regular_table.shape[0]

83707

In [6]:
# Write each combined table to its own csv file, leaving the index out
for table, destination in (
    (regular_table, '/Users/phil/Documents/Documents/College Basketball/Data/regular_season.csv'),
    (march_table, '/Users/phil/Documents/Documents/College Basketball/Data/march.csv'),
):
    table.to_csv(destination, index=False)