In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils
import time
import re
from tqdm import tqdm
from fuzzywuzzy import process, fuzz
import jellyfish
import phonetics

# conn = utils.db_connect()



In [16]:
# find matches between name and school, then add the marval id for the matches
def fuzzy_match(map_df, map_to_df, map_name_col, map_to_name_col):
    result = pd.DataFrame()
    errors = []

    # remove punctuation from both names if the value is not missing
    map_df['clean_name'] = map_df[map_name_col].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    map_to_df['clean_name'] = map_to_df[map_to_name_col].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    
    # iterate through map_df
    for index, row in map_df.iterrows():

        try:
            ratio = process.extract(
                row['clean_name'].lower(),
                map_to_df['clean_name'].str.lower(),
                limit = 3
            )
            matches = pd.DataFrame(ratio, columns = ['map_name', 'score_name', 'index'])
            matches['map_index'] = index
            matches['map_name'] = row[map_name_col]
            matches = pd.merge(
                matches,
                map_to_df,
                left_on = 'index',
                right_index = True,
                how = 'left'
            )
            matches = matches.sort_values(by = ['score_name'], ascending = False)
            # concat to result
            result = pd.concat([result, matches])
        except Exception as e: 
            print(f"Error on {row[map_name_col]}")
            # add the error message to the row
            row['error'] = e
            errors.append(row)

    return result, errors

In [61]:
dg_players = pd.read_sql('select * from gold.players', conn)
# espn_players = pd.read_sql('select * from gold.player_bio', conn)
espn_player_names = pd.read_sql('select distinct espn_id, name from gold.player_stats', conn)

2023-07-22 22:13:39,255 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-07-22 22:13:39,256 INFO sqlalchemy.engine.Engine [cached since 3998s ago] {'name': 'select * from gold.players'}
2023-07-22 22:13:39,335 INFO sqlalchemy.engine.Engine select * from gold.players
2023-07-22 22:13:39,336 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-22 22:13:39,469 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-07-22 22:13:39,470 INFO sqlalchemy.engine.Engine [cached since 3998s ago] {'name': 'select distinct espn_id, name from gold.player_stats'}
2023-07-22 22:13:39,551 INFO sqlalchemy.engine.Engine select distinct espn_id, name from gold.player_stats
2023-07-22 22:13:39,552 INFO sqlalchemy.engine.Engine [raw sql] {}


In [62]:
# change names to first last instead of last, first
dg_players['name_fl'] = dg_players['name'].apply(lambda x: ' '.join(x.split(', ')[::-1]))
# drop null espn ids
espn_player_names = espn_player_names[espn_player_names['espn_id'].notnull()]

fuzzy_mappings, err = fuzzy_match(espn_player_names, dg_players, 'name', 'name_fl')

In [64]:
# get the highest score for each map_name
best = fuzzy_mappings.sort_values(by = ['map_name','score_name'], ascending = False).groupby('map_name').head(1)

# separate the df by score > 90
matches = best[best.score_name > 87]
checks = best[best.score_name <= 87]

# checks.sort_values(by = ['score_name'], ascending = False)

Unnamed: 0,map_name,score_name,index,map_index,dg_id,amateur,name,country,country_code,name_fl,clean_name
0,K.H. Lee,86,1209,391,25157,False,"Hodges, Lee",United States,USA,Lee Hodges,Lee Hodges
0,Kyung-tae Kim,86,1489,613,12188,False,"Kim, Bio",Korea - Republic of,KOR,Bio Kim,Bio Kim
0,Mike Lorenzo-Vera,86,1528,499,26090,False,"Kisia, Mike",Kenya,KEN,Mike Kisia,Mike Kisia
0,Josh Broadaway,86,1136,650,13544,False,"Hart, Josh",United States,USA,Josh Hart,Josh Hart
0,Joseph Dean,86,2841,561,8192,False,"Summerhays, Joseph",United States,USA,Joseph Summerhays,Joseph Summerhays
0,Joe Affrunti,86,628,344,16497,False,"Dean, Joe",England,ENG,Joe Dean,Joe Dean
0,Ho-Yu An,86,50,394,14459,False,"An, Byeong Hun",Korea - Republic of,KOR,Byeong Hun An,Byeong Hun An
0,Doug LaBelle II,86,957,86,24550,False,"Ghim, Doug",United States,USA,Doug Ghim,Doug Ghim
0,D.H. Lee,86,1209,339,25157,False,"Hodges, Lee",United States,USA,Lee Hodges,Lee Hodges
0,Stephen Ames,86,2778,187,25335,False,"Stallings Jr., Stephen",United States,USA,Stephen Stallings Jr.,Stephen Stallings Jr


In [65]:
# check these and find additional matches
# calculate metaphone code for each name
checks['map_name_code'] = checks['map_name'].apply(lambda x: phonetics.metaphone(x))
checks['name_fl_code'] = checks['name_fl'].apply(lambda x: phonetics.metaphone(x))

# get the fuzz ratio
checks['score_soundex'] = checks.apply(lambda x: fuzz.ratio(x['map_name_code'], x['name_fl_code']), axis = 1)

checks.sort_values(by = ['score_name'], ascending = False, inplace = True)

# for index, row in checks.iterrows():
#     print(row.score_name, row.map_name, row.name_fl, row.score_soundex)

# both scores >= 80
checks[(checks.score_name >= 80) & (checks.score_soundex >= 80)]

86 K.H. Lee Lee Hodges 40
86 Kyung-tae Kim Bio Kim 44
86 Mike Lorenzo-Vera Mike Kisia 50
86 Josh Broadaway Josh Hart 89
86 Joseph Dean Joseph Summerhays 50
86 Joe Affrunti Joe Dean 50
86 Ho-Yu An Byeong Hun An 29
86 Doug LaBelle II Doug Ghim 44
86 D.H. Lee Lee Hodges 40
86 Stephen Ames Stephen Stallings Jr. 56
86 Carl Yuan Carl Pettersson 67
86 Tano Goya Estanislao Goya 67
86 Tim Clark Tim Wiedemeyer 55
86 Tim Petrovic Tim Hart 55
86 Tom Gillis Tom Power Horan 36
86 Tom Watson Tom Power Horan 55
86 Cam Davis Davis Chatfield 46
86 Bill Lunde Bill Suguturaga 50
86 Ben Curtis Lawrence Curtis 71
86 Kris Ventura Kristoffer Ventura 82
85 Andy Svoboda Andrew Svoboda 93
83 Chris Riley Chris Paisley 80
82 Miguel Ángel Jiménez Miguel A Jimenez 82
82 Brian Gaffney Brian Gay 80
80 Kent Jones Kyle Jones 73
80 Jason Allred Jason Millard 92
78 Greg Owen Greg Snow 89
78 Bryce Molder Bryce Emory 83
76 Brad Elder Brad Miller 83
76 Nathan Holman Jonathan Thomson 71
76 Troy Kelly Jerry Kelly 75
75 Marco D

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checks['map_name_code'] = checks['map_name'].apply(lambda x: phonetics.metaphone(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checks['name_fl_code'] = checks['name_fl'].apply(lambda x: phonetics.metaphone(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checks['score_soundex'] = checks.appl

Unnamed: 0,map_name,score_name,index,map_index,dg_id,amateur,name,country,country_code,name_fl,clean_name,map_name_code,name_fl_code,score_soundex
0,Josh Broadaway,86,1136,650,13544,False,"Hart, Josh",United States,USA,Josh Hart,Josh Hart,JXPRT,JXRT,89
0,Kris Ventura,86,3020,541,15651,False,"Ventura, Kristoffer",Norway,NOR,Kristoffer Ventura,Kristoffer Ventura,KRSFNTR,KRSTFRFNTR,82
0,Andy Svoboda,85,2852,372,11274,False,"Svoboda, Andrew",United States,USA,Andrew Svoboda,Andrew Svoboda,ANTSFPT,ANTRSFPT,93
0,Chris Riley,83,2248,691,14844,False,"Paisley, Chris",England,ENG,Chris Paisley,Chris Paisley,KRSRL,KRSPL,80
0,Miguel Ángel Jiménez,82,1359,481,3596,False,"Jimenez, Miguel A",Spain,ESP,Miguel A Jimenez,Miguel A Jimenez,MKLNJLJMNS,MKLJMNS,82
0,Brian Gaffney,82,941,438,5951,False,"Gay, Brian",United States,USA,Brian Gay,Brian Gay,PRNKFN,PRNK,80
0,Jason Allred,80,2011,585,17567,False,"Millard, Jason",United States,USA,Jason Millard,Jason Millard,JSNLRT,JSNMLRT,92


In [66]:
matches_to_add = [
    'Kris Ventura',
    'Arie Irawan',
    'Andy Svoboda',
    'Miguel Ángel Jiménez'
]

checks[checks.map_name.isin(matches_to_add)]

Unnamed: 0,map_name,score_name,index,map_index,dg_id,amateur,name,country,country_code,name_fl,clean_name,map_name_code,name_fl_code,score_soundex
0,Kris Ventura,86,3020,541,15651,False,"Ventura, Kristoffer",Norway,NOR,Kristoffer Ventura,Kristoffer Ventura,KRSFNTR,KRSTFRFNTR,82
0,Andy Svoboda,85,2852,372,11274,False,"Svoboda, Andrew",United States,USA,Andrew Svoboda,Andrew Svoboda,ANTSFPT,ANTRSFPT,93
0,Miguel Ángel Jiménez,82,1359,481,3596,False,"Jimenez, Miguel A",Spain,ESP,Miguel A Jimenez,Miguel A Jimenez,MKLNJLJMNS,MKLJMNS,82


In [71]:
# bind exact matches with checked matches
final = pd.concat([
    best[['map_name', 'map_index', 'dg_id']],
    checks[checks.map_name.isin(matches_to_add)][['map_name', 'map_index', 'dg_id']]
])

# merge with espn players
final = pd.merge(
    final,
    espn_player_names,
    left_on = 'map_index',
    right_index = True,
    how = 'left'
)

# rename
final = final[['dg_id', 'espn_id', 'map_name']].rename(columns = {'map_name': 'espn_player_name'})

# merge with dg players
final = pd.merge(
    final,
    dg_players[['dg_id', 'name']],
    on = 'dg_id',
    how = 'left'
)

# rename
final = final[['dg_id', 'espn_id', 'espn_player_name', 'name']].rename(columns = {'name': 'dg_player_name'})

# add placeholder for PGA names
final['pga_player_name'] = None

final.head()

In [73]:
# write to db
drop_statement = """
DROP TABLE IF EXISTS gold.player_xref;
"""

create_statement = """
CREATE TABLE gold.player_xref (
    id serial PRIMARY KEY,
    dg_id integer,
    espn_id integer,
    espn_player_name varchar(255),
    dg_player_name varchar(255),
    pga_player_name varchar(255)
);
"""

conn.execute(drop_statement)
conn.execute(create_statement)

utils.write_to_db(final, 'player_xref', schema='gold', append=True)