In [1]:
# Functional -- Updated 2024.11.03
import pandas as pd
from difflib import SequenceMatcher


def filter_by_date(df, start_date, end_date):
    """Return the rows of *df* whose 'event_date' lies in [start_date, end_date].

    Both bounds are inclusive.
    """
    event_dates = df['event_date']
    not_too_early = event_dates >= start_date
    not_too_late = event_dates <= end_date
    return df[not_too_early & not_too_late]

def american_to_probability(odds):
    """Convert American odds to the implied probability.

    Negative odds map to ``|odds| / (|odds| + 100)``; zero or positive odds
    map to ``100 / (odds + 100)``.
    """
    if odds < 0:
        stake = -odds
        return stake / (stake + 100)
    return 100 / (odds + 100)

def probability_to_american(prob):
    """Convert an implied probability to American odds.

    Probabilities above 0.5 produce negative odds; 0.5 and below produce
    positive odds. A probability of exactly 0 or 1 raises ZeroDivisionError.
    """
    complement = 1 - prob
    if prob > 0.5:
        return -100 * (prob / complement)
    return 100 * (complement / prob)

def calculate_avg_odds(odds_list):
    """Average American odds by way of their implied probabilities.

    Each price is converted to a probability, the probabilities are averaged,
    and the mean is converted back to American odds and rounded to an integer.
    An empty input raises ZeroDivisionError.
    """
    implied = list(map(american_to_probability, odds_list))
    mean_probability = sum(implied) / len(implied)
    mean_american = probability_to_american(mean_probability)
    return round(mean_american)

# Memo of is_name_match results: (fighter_name, outcome_name) -> bool.
name_match_cache = {}

def similar(a, b):
    """Return the case-insensitive SequenceMatcher ratio of *a* and *b* (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()

def extract_first_last(name):
    """Reduce *name* to its first and last whitespace-separated tokens.

    Names with fewer than two tokens are returned unchanged (including any
    surrounding whitespace).
    """
    tokens = name.split()
    if len(tokens) < 2:
        return name
    first, last = tokens[0], tokens[-1]
    return f"{first} {last}"

def is_name_match(fighter_name, outcome_name):
    """Decide whether *outcome_name* refers to *fighter_name*.

    Both names are reduced to "first last" and compared with `similar`; a
    ratio above 0.8 counts as a match. If the direct comparison fails, the
    outcome name is retried with its two tokens swapped, to catch
    "last first" ordering. Results are memoised in `name_match_cache`
    under the raw (fighter_name, outcome_name) pair.
    """
    key = (fighter_name, outcome_name)
    try:
        return name_match_cache[key]
    except KeyError:
        pass

    fighter_short = extract_first_last(fighter_name)
    outcome_short = extract_first_last(outcome_name)

    matched = similar(fighter_short, outcome_short) > 0.8
    if not matched:
        # Second chance: the outcome may be written "Last First".
        flipped = " ".join(reversed(outcome_short.split()))
        matched = similar(fighter_short, flipped) > 0.8

    name_match_cache[key] = matched
    return matched

def relaxed_name_match(fighter_name, outcome_name):
    """Looser fallback match: first names OR last names are similar.

    Both names are reduced to "first last". If either does not yield exactly
    two tokens the result is False; otherwise it is True when the first
    names or the last names have a `similar` ratio above 0.8.
    """
    fighter_tokens = extract_first_last(fighter_name).split()
    outcome_tokens = extract_first_last(outcome_name).split()

    if len(fighter_tokens) != 2 or len(outcome_tokens) != 2:
        return False

    if similar(fighter_tokens[0], outcome_tokens[0]) > 0.8:
        return True
    return similar(fighter_tokens[1], outcome_tokens[1]) > 0.8

def _rows_matching(event_odds, fighter, matcher):
    """Return the rows of *event_odds* whose 'Outcome Name' satisfies matcher(fighter, name)."""
    mask = event_odds['Outcome Name'].apply(lambda outcome: matcher(fighter, outcome))
    # On an empty frame, apply() returns an empty object-dtype Series, which
    # DataFrame indexing would treat as a (empty) list of column labels and
    # drop every column. Casting to bool keeps it a row mask.
    return event_odds[mask.astype(bool)]


def append_odds_to_fights(masterlist, odds):
    """Attach bookmaker odds to every fight in *masterlist*.

    For each fight, the odds rows whose 'Timestamp' falls on the fight's
    'event_date' are selected, then matched to each fighter by name with
    `is_name_match`. When exactly one fighter has matches, the other is
    retried with the looser `relaxed_name_match`.

    Args:
        masterlist: DataFrame with 'event_date' (datetime), 'fighter_a_name'
            and 'fighter_b_name' columns.
        odds: DataFrame with 'Timestamp' (datetime), 'Outcome Name',
            'Bookmaker' and 'Odds Price' columns.

    Returns:
        A list of dicts, one per fight: the original fight fields plus
        'Fighter A/B Odds' (list of {'Bookmaker', 'Odds Price'} records),
        'Fighter A/B Avg Odds' and 'Fighter A/B Best Odds' (both None when
        no odds were matched).
    """
    # The calendar date of each odds row does not depend on the fight being
    # processed, so derive it once instead of once per fight.
    odds_dates = odds['Timestamp'].dt.date

    result_list = []

    for _, fight in masterlist.iterrows():
        fighter_a = fight['fighter_a_name']
        fighter_b = fight['fighter_b_name']

        event_odds = odds[odds_dates == fight['event_date'].date()]

        fighter_a_odds = _rows_matching(event_odds, fighter_a, is_name_match)
        fighter_b_odds = _rows_matching(event_odds, fighter_b, is_name_match)

        # Only relax the matching for the fighter still missing when the
        # opponent was found, i.e. the bout itself is known to be listed.
        if not fighter_a_odds.empty and fighter_b_odds.empty:
            fighter_b_odds = _rows_matching(event_odds, fighter_b, relaxed_name_match)
        elif not fighter_b_odds.empty and fighter_a_odds.empty:
            fighter_a_odds = _rows_matching(event_odds, fighter_a, relaxed_name_match)

        fighter_a_avg_odds, fighter_a_best_odds = calculate_best_avg_odds(
            fighter_a_odds, fighter_a_odds['Odds Price'].tolist()
        )
        fighter_b_avg_odds, fighter_b_best_odds = calculate_best_avg_odds(
            fighter_b_odds, fighter_b_odds['Odds Price'].tolist()
        )

        fight_data = fight.to_dict()
        fight_data.update({
            'Fighter A Odds': fighter_a_odds[['Bookmaker', 'Odds Price']].to_dict(orient='records'),
            'Fighter A Avg Odds': fighter_a_avg_odds,
            'Fighter A Best Odds': fighter_a_best_odds,
            'Fighter B Odds': fighter_b_odds[['Bookmaker', 'Odds Price']].to_dict(orient='records'),
            'Fighter B Avg Odds': fighter_b_avg_odds,
            'Fighter B Best Odds': fighter_b_best_odds,
        })

        result_list.append(fight_data)

    return result_list

def calculate_best_avg_odds(odds_df, odds_prices):
    """Return (average odds, best odds) for one fighter's matched odds rows.

    The average comes from `calculate_avg_odds(odds_prices)`. The best odds
    are taken from the row of *odds_df* with the largest 'Odds Price' and
    returned as {'Odds': ..., 'Bookmaker': ...}. When *odds_prices* is empty
    the result is (None, None).
    """
    if not odds_prices:
        return None, None

    average = calculate_avg_odds(odds_prices)
    top_label = odds_df['Odds Price'].idxmax()
    top_row = odds_df.loc[top_label]
    best = {'Odds': top_row['Odds Price'], 'Bookmaker': top_row['Bookmaker']}
    return average, best


def main():
    """Join the fight masterlist with bookmaker odds and write the results.

    Reads "preprocessed_event_masterlist.csv" and "mma_odds.csv" from the
    working directory, keeps fights dated 2020-06-06 through 2024-08-24,
    attaches odds via `append_odds_to_fights`, and writes two CSV files:

    * "fights_with_nan_odds.csv" - fights missing average or best odds for
      either fighter;
    * "event_masterlist_with_odds (<start> to <end>).csv" - the remaining
      fights, with the date range in the filename.
    """
    # File paths
    masterlist_file = "preprocessed_event_masterlist.csv"
    odds_file = "mma_odds.csv"

    # Load data
    masterlist_df = pd.read_csv(masterlist_file, encoding='utf-8', low_memory=False)
    odds_df = pd.read_csv(odds_file, encoding='ISO-8859-1')

    # Convert 'event_date' and 'Timestamp' to datetime
    masterlist_df['event_date'] = pd.to_datetime(masterlist_df['event_date'])
    odds_df['Timestamp'] = pd.to_datetime(odds_df['Timestamp'])

    # Define date range for filtering
    start_date = pd.to_datetime('2020-06-06')
    end_date = pd.to_datetime('2024-08-24')
    filtered_masterlist = filter_by_date(masterlist_df, start_date, end_date)

    # Apply odds appending
    fights_with_odds = append_odds_to_fights(filtered_masterlist, odds_df)

    # Convert to DataFrame
    output_df = pd.DataFrame(fights_with_odds)

    # Identify rows with NaN in specific columns
    columns_to_check = [
        'Fighter A Avg Odds', 'Fighter A Best Odds',
        'Fighter B Avg Odds', 'Fighter B Best Odds'
    ]
    nan_rows = output_df[output_df[columns_to_check].isnull().any(axis=1)]
    num_nan_rows = nan_rows.shape[0]
    print(f"Number of rows with NaN values: {num_nan_rows}")

    # Save rows with NaN values
    nan_output_file = "fights_with_nan_odds.csv"
    nan_rows.to_csv(nan_output_file, index=False, encoding='utf-8')
    print(f"Rows with NaN values have been saved to {nan_output_file}")

    # Clean dataset by dropping rows with NaN values in specified columns
    cleaned_df = output_df.dropna(subset=columns_to_check)
    num_dropped_rows = output_df.shape[0] - cleaned_df.shape[0]
    print(f"Number of fights dropped due to NaN values: {num_dropped_rows}")

    # Save the cleaned dataset
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')
    # Must be an f-string: without the prefix the braces are written literally
    # into the filename instead of the date range.
    final_output_file = f"event_masterlist_with_odds ({start_date_str} to {end_date_str}).csv"
    cleaned_df.to_csv(final_output_file, index=False, encoding='utf-8')
    print(f"Cleaned dataset has been saved to {final_output_file}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Number of rows with NaN values: 202
Rows with NaN values have been saved to C:\Users\EditZ\UFC Research\Github\Odds Scraper\fights_with_nan_odds.csv
Number of fights dropped due to NaN values: 202
Cleaned dataset has been saved to C:\Users\EditZ\UFC Research\Github\Odds Scraper\masterlist_with_odds (2020-06-06 to 2024-08-24).csv
