In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils
import time
import re
from tqdm import tqdm
from fuzzywuzzy import process, fuzz

In [None]:
def fuzzy_match_events(map_df, map_to_df, map_event_col, map_to_event_col):
    """Fuzzy-match each event name in ``map_df`` to the best candidate in ``map_to_df``.

    Candidates are restricted to rows of ``map_to_df`` whose ``season`` equals the
    source row's ``season``. Names are compared after stripping punctuation and
    lower-casing, and only the single best-scoring candidate is kept per source row.

    Parameters
    ----------
    map_df : pandas.DataFrame
        Rows to map. Must contain ``map_event_col`` and a ``season`` column.
    map_to_df : pandas.DataFrame
        Candidate rows. Must contain ``map_to_event_col`` and a ``season`` column.
    map_event_col : str
        Name of the event-name column in ``map_df``.
    map_to_event_col : str
        Name of the event-name column in ``map_to_df``.

    Returns
    -------
    result : pandas.DataFrame
        One row per successfully matched source row: ``map_event`` (the original
        source name), ``score``, ``index`` (index label of the matched candidate),
        ``map_index`` (index label of the source row), followed by the matched
        candidate's columns, including its punctuation-free ``clean_event``.
        An empty DataFrame when nothing matched.
    errors : list of pandas.Series
        Copies of the source rows that could not be matched (missing name, no
        candidates for the season, or any other failure), each with an extra
        ``error`` entry holding the exception.

    Notes
    -----
    The input frames are not modified; cleaning is done on copies.
    """
    def _strip_punctuation(value):
        # Only real strings are cleaned; missing values pass through untouched so
        # they are handled per row below instead of crashing the whole run.
        return re.sub(r'[^\w\s]', '', value) if isinstance(value, str) else value

    # Work on copies so the caller's frames do not silently gain a 'clean_event' column.
    source = map_df.copy()
    candidates = map_to_df.copy()
    source['clean_event'] = source[map_event_col].apply(_strip_punctuation)
    candidates['clean_event'] = candidates[map_to_event_col].apply(_strip_punctuation)

    matched_frames = []
    errors = []

    for index, row in source.iterrows():
        try:
            # Only events from the same season are valid candidates.
            season_candidates = candidates[candidates['season'] == row['season']]
            named_candidates = season_candidates['clean_event'].dropna()
            if named_candidates.empty:
                raise LookupError(f"no candidate events for season {row['season']}")

            # Passing a Series makes process.extract return (choice, score, index label),
            # which is what lets us join back to the candidate row below.
            best = process.extract(
                row['clean_event'].lower(),  # raises for a missing name -> recorded in errors
                named_candidates.str.lower(),
                limit = 1
            )
            matches = pd.DataFrame(best, columns = ['map_event', 'score', 'index'])
            matches['map_index'] = index
            # Report the original (uncleaned) source name rather than the matched text.
            matches['map_event'] = row[map_event_col]
            matches = pd.merge(
                matches,
                season_candidates,
                left_on = 'index',
                right_index = True,
                how = 'left'
            )
            matched_frames.append(matches)
        except Exception as e:
            # Best effort: keep going and hand the failed rows back to the caller.
            print(f"Error on {row[map_event_col]}")
            failed_row = row.copy()
            failed_row['error'] = e
            errors.append(failed_row)

    # Concatenate once instead of growing the frame inside the loop.
    result = pd.concat(matched_frames) if matched_frames else pd.DataFrame()
    return result, errors

In [None]:
conn = utils.db_connect()

# Fuzzy scores strictly above this value are accepted as matches; everything at or
# below it is set aside in `checks` for manual review.
MATCH_SCORE_THRESHOLD = 87

# Distinct (event_name, season) pairs from the winnings table are the names to map.
event_money = pd.read_sql('select * from gold.winnings', conn)
event_money_events = event_money[['event_name', 'season']].drop_duplicates()
# Events table that the winnings names are mapped onto.
events = pd.read_sql('select distinct event_id, event_name, calendar_year, season, tour from gold.events', conn)

# `err` holds the source rows that could not be matched at all.
fuzzy_mappings, err = fuzzy_match_events(event_money_events, events, 'event_name', 'event_name')

# Sort into a new frame instead of sorting a filtered slice in place, which
# triggers pandas' SettingWithCopyWarning.
matches = fuzzy_mappings[fuzzy_mappings.score > MATCH_SCORE_THRESHOLD].sort_values(by = ['score'], ascending = True)
checks = fuzzy_mappings[fuzzy_mappings.score <= MATCH_SCORE_THRESHOLD]

# Lowest-scoring accepted matches are printed first so the riskiest ones are reviewed first.
for index, row in matches.iterrows():
    print(row.score, row.calendar_year, row.map_event, row.event_name)

In [None]:
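# Scratch lookup (kept commented out): find the U.S. Open rows in both frames.
# regex=False makes the '.' characters literal instead of regex wildcards.
# event_money_events[event_money_events['event_name'].str.contains('U.S. Open', regex=False)]
# events[events['event_name'].str.contains('U.S. Open', regex=False)]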

In [None]:
# Drop the pairing whose source name is 'U.S. Open (2021)' before exporting.
# NOTE(review): the reason for excluding this event is not recorded here — confirm.
matches = matches.loc[matches['map_event'].ne('U.S. Open (2021)')]

# Accepted matches and the below-threshold pairings go to separate CSV files.
matches.to_csv('matches.csv', index=False)
checks.to_csv('missed_events.csv', index=False)

In [None]:
# Load the final event mapping.
# NOTE(review): this notebook writes 'matches.csv' but reads 'matches_final.csv', which
# no visible cell creates — presumably a manually reviewed copy of matches.csv; confirm
# and document where this file comes from.
matches_final = pd.read_csv('matches_final.csv')

In [None]:
# rename
final = matches_final[['event_id',  'event_name', 'map_event', 'calendar_year', 'season']].rename(
    columns = {'event_id':'dg_event_id', 'event_name':'dg_event_name', 'map_event': 'pga_event_name', 'season': 'pga_season'}
    )

final['tour'] = 'pga'

final.head()

Check which events are missing from the cross-reference:

In [None]:
# Left-join the events onto the cross-reference; the `_merge` indicator column
# records whether each event found a cross-reference row.
merged_events = events.merge(final,
                             left_on=['event_id', 'calendar_year', 'tour'],
                             right_on=['dg_event_id', 'calendar_year', 'tour'],
                             how='left',
                             indicator=True)

# Keep only the events that have NO cross-reference row ('left_only'), and drop the
# indicator in the same expression rather than mutating the filtered slice in place
# (the in-place drop triggers pandas' SettingWithCopyWarning).
events_filtered = merged_events[merged_events['_merge'] == 'left_only'].drop(columns='_merge')

# Show the unmapped PGA events.
events_filtered[events_filtered['tour'] == 'pga'].sort_values(by = ['event_name', 'calendar_year', 'event_id'])

The Tour Championship was added manually. The Hero World event is not going to be included: it benefits Tiger's foundation, so there is no prize money. The Olympics are not included either, since there is no prize money. The Players was cancelled in 2020.

In [None]:
# Rebuild gold.event_xref from scratch and load the final mapping into it.

# Table definition: a serial surrogate key plus the columns held in `final`.
create_statement = """
CREATE TABLE gold.event_xref (
    id serial PRIMARY KEY,
    dg_event_id integer,
    dg_event_name varchar(255),
    pga_event_name varchar(255),
    calendar_year int,
    pga_season int,
    tour varchar(255)
);
"""

# Removes any previous version of the table so the CREATE above cannot collide with it.
drop_statement = """
DROP TABLE IF EXISTS gold.event_xref;
"""

# Order matters: drop first, then create, then load the rows.
conn.execute(drop_statement)
conn.execute(create_statement)

utils.write_to_db(final, 'event_xref', schema='gold', append=True)