# Helper Notebook for Initial Investigation
Ensure that the naming conventions of the provided data and the supplemental data are consistent. This notebook was used for R&D; source code was eventually created to clean and process the data (see `bullpen.data_utils.load_data`).

In [1]:
import pandas as pd

In [2]:
# Provided dataset; per the lookup output further down its columns are
# MLBAMID, PlayerId, Name, Team, Age, Season, TBF and K%.
provided_data = pd.read_csv("../data/k.csv")

In [3]:
# Supplemental stats table; its Name column is compared against
# provided_data.Name later in this notebook.
suppl_data = pd.read_csv("../data/supplemental-stats.csv")

## Ensure naming conventions
There is probably a better way to do this with pattern matching or fuzzy searching, but only a few names differed, so they were fixed manually.

In [4]:
# Manual spelling fixes (accents, suffixes, middle initials, capitalisation) so
# that supplemental names line up with the provided data. The dict maps whole
# values, so only exact matches are rewritten and re-running the cell is a no-op.
suppl_data["Name"] = suppl_data["Name"].replace(
    {
        "Manny Banuelos": "Manny Bañuelos",
        "Ralph Garza": "Ralph Garza Jr.",
        "Luis Ortiz": "Luis L. Ortiz",
        "Jose Hernandez": "Jose E. Hernandez",
        "Hyeon-jong Yang": "Hyeon-Jong Yang",
        "Adrián Martinez": "Adrián Martínez",
    }
)


# Manual spelling fixes on the provided side (mostly missing accents, plus one
# "Jr." suffix). The dict maps whole values, so only exact matches are rewritten
# and re-running the cell is a no-op.
provided_data["Name"] = provided_data["Name"].replace(
    {
        "Eduardo Rodriguez": "Eduardo Rodríguez",
        "Jose Alvarez": "José Álvarez",
        "Sandy Alcantara": "Sandy Alcántara",
        "Carlos Martinez": "Carlos Martínez",
        "Phillips Valdez": "Phillips Valdéz",
        "Jovani Moran": "Jovani Morán",
        "Jose Cuas": "José Cuas",
        "Jorge Alcala": "Jorge Alcalá",
        "Jhoan Duran": "Jhoan Durán",
        "Jesus Tinoco": "Jesús Tinoco",
        "Brent Honeywell": "Brent Honeywell Jr.",
        "Adrian Morejon": "Adrián Morejón",
    }
)

In [5]:
def compare_datasets(provided_df, supplemental_df):
    """Return the names in the provided data that the supplemental data lacks.

    The comparison is deliberately one-sided: only names from the provided
    data matter, because those are the ones used in the prediction exercise.

    Args:
        provided_df: DataFrame with a ``Name`` column.
        supplemental_df: DataFrame with a ``Name`` column.

    Returns:
        set: Names found in ``provided_df`` but not in ``supplemental_df``.
    """
    provided_names = set(provided_df["Name"])
    supplemental_names = set(supplemental_df["Name"])
    return provided_names.difference(supplemental_names)


def lookup_player(dataframe, name, source="provided", show_data=False):
    """Return the distinct values of ``Name`` in ``dataframe`` that match ``name``.

    Args:
        dataframe: DataFrame with a ``Name`` column.
        name: Text to search for.
        source: ``"supplemental"`` switches to prefix matching (``Name`` starts
            with ``name``); any other value, including the default
            ``"provided"``, requires an exact match.
        show_data: When True, print the matching rows before returning.

    Returns:
        list: Unique matching names, in order of first appearance. Empty when
        nothing matches.
    """
    if source == "supplemental":
        # na=False treats rows with a missing Name as non-matches. Without it
        # the mask contains NaN and boolean indexing raises ValueError.
        mask = dataframe.Name.str.startswith(name, na=False)
    else:
        mask = dataframe.Name == name
    if show_data:
        print(dataframe[mask])
    return dataframe.loc[mask, "Name"].unique().tolist()

In [6]:
# Sanity check after the manual renames: an empty set means every provided
# name also exists in the supplemental data.
compare_datasets(provided_data, suppl_data)

set()

In [7]:
# The unaccented spelling should no longer be present in provided_data once
# the rename cell has run (it was mapped to "Jhoan Durán").
lookup_player(provided_data, "Jhoan Duran", source="provided", show_data=True)

Empty DataFrame
Columns: [MLBAMID, PlayerId, Name, Team, Age, Season, TBF, K%]
Index: []


[]

In [8]:
# Prefix search on the supplemental data: lists every name starting with "Adrián".
lookup_player(suppl_data, "Adrián", source="supplemental", show_data=True)

       Rk             Name  Age   Tm    IP   PA   Pit  Pit/PA  Str   Str%  \
707   708   Adrián Morejón   22  SDP   4.2   20    80    4.00   51  0.638   
1750  592  Adrián Martínez   25  OAK  57.2  260  1015    3.90  647  0.637   
1826  668   Adrián Morejón   23  SDP  34.0  141   540    3.83  358  0.663   
2836  597  Adrián Martínez   26  OAK  55.0  242   943    3.90  570  0.604   
2908  669   Adrián Morejón   24  SDP   9.0   44   161    3.66   98  0.609   
3993  690   Adrián Morejón   25  SDP  63.2  272  1064    3.91  714  0.671   

      ... 02h  L/SO  S/SO  L/SO%  3pK  4pW  PAu  Pitu  Stru  Season  
707   ...   0     0     3  0.000    0    1    0     0     0    2021  
1750  ...   4    14    39  0.264    9    2    0     0     0    2022  
1826  ...   2     3    23  0.107    5    2    0     0     0    2022  
2836  ...   2    15    32  0.319    6    2    0     0     0    2023  
2908  ...   0     5     3  0.625    0    0    0     0     0    2023  
3993  ...   5    18    53  0.254    9   

['Adrián Morejón', 'Adrián Martínez']

In [9]:
# One dict per distinct (MLBAMID, PlayerId, Name) combination in the provided data.
tmp = (
    provided_data[["MLBAMID", "PlayerId", "Name"]]
    .drop_duplicates()
    .to_dict(orient="records")
)
# tmp

In [10]:
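# One-time export of the id records built in the previous cell; left commented
# out so that re-running the notebook does not overwrite the file.
# import json

# with open("../data/player_ids.json", "w") as fp:
#     json.dump(tmp, fp)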

In [11]:
from bullpen import data_utils

In [12]:
# Build the player lookup; the log line it prints shows it reading
# data/player_ids.json.
lookup = data_utils.PlayerLookup()

# Peek at the underlying mapping table (MLBAMID, PlayerId, Name).
lookup.mapping.head()

loading player ids from /Users/logan/Desktop/repos/mlb-pitcher-xK/data/player_ids.json...


Unnamed: 0,MLBAMID,PlayerId,Name
0,695243,31757,Mason Miller
1,621242,14710,Edwin Díaz
2,518585,7048,Fernando Cruz
3,623352,14212,Josh Hader
4,663574,19926,Tony Santillan


In [13]:
# Name -> MLBAM id.
lookup.get_id("Adrián Morejón", source="mlb")

670970

In [14]:
# Name -> FanGraphs id (the PlayerId column of the mapping).
lookup.get_id("Adrián Morejón", source="fangraphs")

20039

In [15]:
# Id -> name without a source argument; 670970 is the id the "mlb" lookup
# returned for this player. NOTE(review): presumably the default source is
# MLBAM, to be confirmed against PlayerLookup.get_name.
lookup.get_name(670970)

'Adrián Morejón'

In [16]:
# FanGraphs id -> name.
lookup.get_name(20039, source="fangraphs")

'Adrián Morejón'

In [17]:
# Ambiguous name: two players share it, so the output lists both candidate
# PlayerIds rather than a single id.
lookup.get_id("Logan Allen", source="fangraphs")

Unnamed: 0,Name,PlayerId
0,Logan Allen,27589
1,Logan Allen,18555


In [18]:
# Same ambiguous name against the MLBAM ids: both candidates are listed.
lookup.get_id("Logan Allen", source="mlb")

Unnamed: 0,Name,MLBAMID
0,Logan Allen,671106
1,Logan Allen,663531


In [19]:
# Misspelled name: no match is found and the candidate table comes back empty.
lookup.get_id("Logan Alln", source="mlb")

Unnamed: 0,Name,MLBAMID


In [20]:
# Going from id to name is unambiguous even when the name is shared.
lookup.get_name(671106, source="mlb")

'Logan Allen'