## Determining which ID dataset to use to connect yahoo with fangraphs 

Each dataset has a number of different IDs and connecting them is important in preserving the data. This script will help provide guidance on which baseball ID datasets to use

The current sources are from [Crunch Time Baseball](http://crunchtimebaseball.com/baseball_map.html) and [Smart Fantasy Baseball](https://www.smartfantasybaseball.com/tools/).

#### Import modules

In [1]:
#Import modules
import os
import math
import difflib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Set file paths

In [2]:
### Input paths

# Crunch Time Baseball
ct_URL = r'http://crunchtimebaseball.com/master.csv'

# Smart Fantasy Baseball
sfb_URL = r'https://www.smartfantasybaseball.com/PLAYERIDMAPCSV'

### Output paths
outputFolder = r'C:\Users\phili\OneDrive\Documents\DataProjects\data'
ct_File = 'crunchTime_IDs.csv'
sfb_File = 'smartFB_IDs.csv'

# Construct output file paths
ct_PATH = os.path.join(outputFolder, ct_File)
sfb_PATH = os.path.join(outputFolder, sfb_File)

### Fangraphs data
fg_datasets = ['SteamerBat_20200731.csv', 'SteamerPit_20200731.csv']

### Yahoo! Fantasy data
yh_datasets = ['totalAvailableList_20200730.csv', 'totalKeeperList_20200730.csv']

#### Set parameters

In [3]:
# Crunch Time columns to read-in
ct_cols = ['fg_id', 'fg_name', 'yahoo_id', 'yahoo_name']

# Smart Fantasy Baseball columns to read-in
sfb_cols =['IDFANGRAPHS', 'FANGRAPHSNAME', 'YAHOOID', 'YAHOONAME']

# Fangraphs columns to read-in
fg_cols = ['Name', 'playerid']

# Yahoo columns to read-in
yh_cols = ['Keeper', 'Player_ID']

# Keys to keep when quantifying nulls
keysToKeep = ['Name']

#### Read-in files

In [4]:
### Read-in files from the web
# 'ISO-8859-1'

# ID datasets
ct_df = pd.read_csv(ct_URL, encoding='ISO-8859-1', usecols = ct_cols)
sfb_df = pd.read_csv(sfb_URL, encoding='ISO-8859-1', usecols = sfb_cols)

# Fangraphs datasets: Read dfs into a list of dfs and concat. Select fg_cols
fg_list = []

for fg in fg_datasets:
    df = pd.read_csv(os.path.join(outputFolder, fg), index_col=None, header=0, usecols = fg_cols)
    fg_list.append(df)

fg_df = pd.concat(fg_list, axis=0, ignore_index=True)
    
# Yahoo! datasets: Read dfs into a list of dfs and concat. Select yh_cols
yh_list = []

for yh in yh_datasets:
    df = pd.read_csv(os.path.join(outputFolder, yh), index_col=None, header=0, usecols = yh_cols)
    yh_list.append(df)

yh_df = pd.concat(yh_list, axis=0, ignore_index=True)

#### Write out config files if set

In [5]:
# if writeOrigID = True:
#     ct_df.to_csv('')
ct_df.head()

Unnamed: 0,fg_id,fg_name,yahoo_id,yahoo_name
0,11387,A.J. Achter,9824.0,A.J. Achter
1,11467,A.J. Cole,9638.0,A.J. Cole
2,5677,A.J. Ellis,8373.0,A.J. Ellis
3,11132,A.J. Griffin,9220.0,A.J. Griffin
4,7077,A.J. Jimenez,,


#### Format columns as needed

In [5]:
# The Fangraphs data id comes in as an int but the Crunch Time data has the id as an object
# Convert the Fangraphs ID to an object
fg_df['playerid'] = fg_df['playerid'].astype(str)

### Create a column that is the closest match for the name of the Yahoo data in the ID data
# Consider creating a function

def findTopNameMatches(name1, name2):
    
    import sys
    
    # Expected values
    expectedValues = ['Yahoo!', 'CT', 'SFB']
    
    # Break if not in expected values
    if name1 not in expectedValues or name2 not in expectedValues:
        raise ValueError('Input String not in expected values]\nExpecting: ' + ' or '.join(expectedValues))
    
    
    if name1 == 'Yahoo!':
        df1 = yh_df
        field1 = 'Keeper'
    
    if name2 == 'CT':
        df2 = ct_df
        field2 = 'yahoo_name'
        
    if name2 == 'SFB':
        df2 = sfb_df
        field2 = 'YAHOONAME'
    
    # Extract lists of names in each dataset of the join
    df1Names = df1[field1].to_list()
    df2Names = df2[field2].to_list()
    
    # Clean nans out of the lists
    df2Names = ['' if str(x) == 'nan' else x for x in df2Names]
    
    # Store the best match of the yahoo name in the Crunch Time data
    print('Finding the best match of names...' + 'for ' + name2)
    
    ### Some steps are needed to convert the output of nested lists into a column to store
    
    # Store the best matches of the names, which is outputed as a nested list
    tempList = [difflib.get_close_matches(potMatch, df2Names, 3) for potMatch in df1Names]
    
    # # Break down the nested list
    tempList = [item if len(item) != 0 else [''] for item in tempList]
    # yh_df['ct_match_from_yh_name'] = [item for sublist in test for item in sublist]
    
    return tempList

yh_df['closest_yh_match_in_ct'] = findTopNameMatches('Yahoo!', 'CT')
yh_df['closest_yh_match_in_sfb'] = findTopNameMatches('Yahoo!', 'SFB')


Finding the best match of names...for CT
Finding the best match of names...for SFB


In [6]:
yh_df.head()

Unnamed: 0,Keeper,Player_ID,closest_yh_match_in_ct,closest_yh_match_in_sfb
0,Albert Pujols,6619,"[Albert Pujols, Alberto Callaspo]","[Albert Pujols, Albert Abreu, Alberto Callaspo]"
1,Miguel Cabrera,7163,"[Miguel Cabrera, Melky Cabrera, Miguel Castro]","[Miguel Cabrera, Melky Cabrera, Miguel Castro]"
2,Jeff Mathis,7296,"[Jeff Mathis, Jeff Manship, Jeff Bianchi]","[Jeff Mathis, Jeff Francis, Jeff Bianchi]"
3,René Rivera,7458,"[RenÃ© Rivera, Yadiel Rivera, Ben Revere]","[Rene Rivera, Juan Rivera, Yadiel Rivera]"
4,Robinson Canó,7497,"[Robinson CanÃ³, Robinson Chirinos, Drew Robin...","[Robinson Cano, Robinson Chirinos, Drew Robinson]"


#### Test the different joins using each ID dataset and measure the complete data

In [7]:
# Determine the number of people in the yahoo dataset
numOfyhPlyrs = len(yh_df)

# Workflow
# Yahoo <- ID <- FG
# Join to create a Yahoo <- Crunch Time <- Fangraphs dataset
yh_ct = pd.merge(left = yh_df, right = ct_df, how = 'left', left_on = 'Player_ID', right_on = 'yahoo_id')

# Examine where the merge failed
ctNulls = yh_ct.isnull()['yahoo_name']
numOfctNulls = yh_ct.isnull()['yahoo_name'].sum()

print(r'Presenting the first join on the Yahoo! data to the ID data...')
print('...Out of the ' + str(numOfyhPlyrs) + ' players that needed to be joined, ' + str(numOfctNulls) + ' failed')

# Display where they are
yh_ct[ctNulls].head(20)

Presenting the first join on the Yahoo! data to the ID data...
...Out of the 1011 players that needed to be joined, 154 failed


Unnamed: 0,Keeper,Player_ID,closest_yh_match_in_ct,closest_yh_match_in_sfb,fg_id,fg_name,yahoo_id,yahoo_name
140,Christian Arroyo,9890,"[Christian Walker, Christian Garcia, Christian...","[Christian Arroyo, Christian Walker, Christian...",,,,
153,Mike Yastrzemski,10119,"[Mike Zagurski, Mike Ekstrom]","[Mike Yastrzemski, Mike Adams]",,,,
174,Jorge Mateo,10239,"[Jorge Alfaro, Joe Paterson, Joe Martinez]","[Jorge Mateo, Joe Mather, Jorge Cantu]",,,,
191,Austin Riley,10420,"[Austin Romine, Justin Haley, Austin Brice]","[Austin Riley, Austin Romine, Austin Brice]",,,,
193,Zack Collins,10450,"[Tim Collins, Zach Phillips, Tyler Collins]","[Zack Collins, Zack Cox, Tim Collins]",,,,
194,Yu Chang,10459,[],[Yu Chang],,,,
199,Garrett Stubbs,10478,"[Drew Stubbs, Jarrett Grube, Garrett Jones]","[Garrett Stubbs, Drew Stubbs, Garrett Atkins]",,,,
200,Jonathan Araúz,10484,"[Jonathan Aro, Jonathan Diaz, Yonathan Daza]","[Yonathan Daza, Jonathan Sanchez, Jonathan Her...",,,,
203,Matt Thaiss,10497,"[Matt Tuiasosopo, Matt Tracy, Matt Harrison]","[Matt Thaiss, Matt Cain, Matt Harrison]",,,,
207,Trent Grisham,10522,[Tyler Graham],[Trent Grisham],,,,


In [8]:
# Join to create a Yahoo <- Smart Fantasy Baseball <- Fangraphs dataset
yh_sfb = pd.merge(left = yh_df, right = sfb_df, how = 'left', left_on = 'Player_ID', right_on = 'YAHOOID')

# Examine where the merge failed
sfbNulls = yh_sfb.isnull()['YAHOONAME']
numOfFirstSFBNulls = yh_sfb.isnull()['YAHOONAME'].sum()

print(r'Presenting the first join on the Yahoo! data to the ID data...')
print('...Out of the ' + str(numOfyhPlyrs) + ' players that needed to be joined, ' + str(numOfFirstSFBNulls) + ' failed')

# Display where they are
yh_sfb[sfbNulls].head(20)

Presenting the first join on the Yahoo! data to the ID data...
...Out of the 1011 players that needed to be joined, 96 failed


Unnamed: 0,Keeper,Player_ID,closest_yh_match_in_ct,closest_yh_match_in_sfb,IDFANGRAPHS,FANGRAPHSNAME,YAHOOID,YAHOONAME
142,Óscar Hernández,9899,"[Teoscar HernÃ¡ndez, CÃ©sar HernÃ¡ndez, Marco ...","[Cesar Hernandez, Teoscar HernÃ¡ndez, Marco He...",,,,
200,Jonathan Araúz,10484,"[Jonathan Aro, Jonathan Diaz, Yonathan Daza]","[Yonathan Daza, Jonathan Sanchez, Jonathan Her...",,,,
219,Edmundo Sosa,10594,[],[],,,,
223,José Marmolejos,10651,"[Jose Marmolejos, Jose Mijares, Carlos Marmol]","[Jose Mijares, Carlos Marmol, Josh James]",,,,
243,Phillip Evans,10827,"[Phillip Ervin, Phillippe Aumont, Zach Phillips]","[Phillip Ervin, Phillippe Aumont, Phil Maton]",,,,
272,Adolis García,11127,"[Adolis Garcia, Adonis Garcia, Aramís García]","[Adonis Garcia, Avisail Garcia, Robel Garcia]",,,,
278,Joe Hudson,11161,"[Daniel Hudson, Dakota Hudson, Joe Paterson]","[Tim Hudson, Joe Benson, Daniel Hudson]",,,,
290,Edward Olivares,11232,"[Edward Olivares, Eduardo Paredes, Edward Mujica]","[Jared Oliva, Edward Mujica]",,,,
352,Rafael Dolis,9088,"[Rafael Dolis, Rafael Devers, Rafael Ynoa]","[Rafael Devers, Rafael Soriano, Rafael Bautista]",,,,
368,Brooks Raley,9262,"[Brooks Raley, Brooks Conrad]",[],,,,


#### Determine where the missing values overlap and differ

In [14]:
# Determine where the values are valid in both
bothNulls = np.array(ctNulls) & np.array(sfbNulls)
bothValid = ~np.array(ctNulls) & ~np.array(sfbNulls)

print('There are {0} yahoo players who are not in either dataset and {1} in both'.format(bothNulls.sum(), bothValid.sum()))

There are 62 yahoo players who are not in either dataset and 823 in both


In [23]:
# Determine where one dataset is valid

# CT
onlyCT = ~np.array(ctNulls) & np.array(sfbNulls)
print('There are {0} players that are only in CT'.format(onlyCT.sum()))

# SFB
onlySFB = ~np.array(sfbNulls) & np.array(ctNulls)
print('There are {0} players that are only in SFB'.format(onlySFB.sum()))

There are 34 players that are only in CT
There are 92 players that are only in SFB


In [26]:
# Confirming the totals match
checkTotal= bothValid.sum() + bothNulls.sum() + onlyCT.sum() + onlySFB.sum()

if checkTotal == numOfyhPlyrs:
    print('Success. Player totals match\n...there are {0} total'.format(checkTotal))
    
else:
    print('WARNING: LOST PLAYERS!')

Success. Player totals match
...there are 1011 total


In [None]:
#### Determine which ID dataset matches best with FanGraphs

In [None]:
# firstNulls == firstSFBNulls
test1 = [True, True, False]
test2 = [True, False, False]

np.array(test1) | np.array(test2)

In [None]:
yh_ct_fg = pd.merge(left = yh_ct, right = fg_df, how = 'left', left_on = 'fg_id', right_on = 'playerid')
yh_sfb_fg = yh_sfb.merge(fg_df, how = 'left', left_on = 'IDFANGRAPHS', right_on = 'playerid')

In [None]:
yh_ct_fg.head()

In [None]:
yh_sfb_fg.head()

#### Format the data for visualizations

In [None]:
# Create vector with the counts of nulls or nas in each column
rawCtNulls = yh_ct_fg.isnull()
rawSFBNulls = yh_sfb_fg.isnull()

# Sum
ctNulls = rawCtNulls.sum()
sfbNulls = rawSFBNulls.sum()

# Filter only what we want to keep
ctNulls = ctNulls[keysToKeep]
sfbNulls = sfbNulls[keysToKeep]

# Convert the pd series into a data frame and transpose
missing_df = pd.concat([ctNulls.rename('CrunchTime'), sfbNulls.rename('SmartFantasy')], axis=1).T

# Rename the keysToKeep column into what we're interested in, Number of Missing
missing_df.rename(columns = {keysToKeep[0]: 'Number of Missing'}, inplace = True)

#### Visualize

In [None]:
missing_df.plot(y = 'Number of Missing', use_index = True, kind = 'bar', rot=0, color = 'orange')

#### See where the data is missing in the table

In [None]:
yh_ct_fg[rawCtNulls[keysToKeep[0]]].head(n=10)

In [None]:
yh_sfb_fg[rawSFBNulls[keysToKeep[0]]].head(n=10)