In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Union
import seaborn as sns
import datetime
import requests
from bs4 import BeautifulSoup

# 158739 Assignment 4 - Does having wealthy parents make you a better tennis player?

#### Student name: James Evans and Hayley Wikeepa
#### Student ID: 12211872 & 19034759


# Introduction

Two things made us wonder about tennis and wealthy parents. First, two of the top 100 women tennis players have billionaire parents. Second, one of us saw a tweet asking why so many Formula One drivers' fathers have hyperlinks on Wikipedia (hinting that you need a rich father to become a Formula One driver). Formula One is not tennis (race car driving is much more expensive than playing tennis, for example), but this tweet gave us the idea that you might be able to tell whether a person comes from a wealthy family from their Wikipedia page.

Does having a wealthy parent make you a better tennis player? We look at the current top-ranked tennis players and examine whether coming from a wealthy family makes them more likely to win professional matches. We also examine whether having wealthy parents is "priced into" the betting odds for tennis matches.

#### Datasets used:
- ATP (Men's professional tennis association) and WTA (Women's professional tennis association) tennis player lists
- Tennis results from the years 2019 - 2023
- Tennis betting odds from the years 2019 - 2023
- List of tennis players that have wiki pages, and a boolean indicating whether their parents have wiki pages
- Chat gpt answers to the question "did this tennis player have wealthy parents?"


#### Dataset sources: 

- https://rapidapi.com/sportcontentapi/api/ultimate-tennis1
- http://www.tennis-data.co.uk/alldata.php
- Wikipedia
- ChatGPT


### Research Questions

1. Is having wealthy parents a predictor of becoming a professional tennis player?
2. Is having wealthy parents a predictor of future success as a professional tennis player?
3. Is there money to be made by taking into account whether a tennis player has wealthy parents when betting on tennis matches?


### Executive Summary

There are more top tennis players that have wealthy parents than the average population. Also tennis players with wealthy parents are more likely to win, even when playing other top tennis players. The information that having wealthy parents makes you a better tennis player is not priced into the betting odds.

We used ChatGPT and Wikipedia to try to determine whether players had wealthy parents. These two methods did not agree with each other, so the conclusions above should be read with this in mind.




## Report outline

### Initialise functions
Do these first so they work when used further down.

### Data Acquisition
1. Import players data from API.
2. Import match results from the last 4 years from CSV files.

### Data Wrangling and EDA
3. Get top 200 players data into one dataframe
4. Query chat-GPT to see if players have wealthy parents
5. Query Wiki to see if players have parents that have hyperlinks (assuming this will mean they have wealthy parents).
6. Manipulate the match result data into a usable form.
7. Merge the match result data with the player data.

### Data Analysis and discussion
7. Compare the wealthy-parents results from ChatGPT with the results from Wiki, to see if they agree.
8. Check if tennis players are more likely to have wealthy parents than the general population.
9. Compare match win rates between players with wealthy parents and players without wealthy parents.
10. Simulate betting on matches and see if this info is "priced in" to the betting odds.



## Initialise functions

#Function returns True if ChatGPT thinks a tennis player had wealthy parents (top 1% wealth in their country).
#Function created by ChatGPT and modified by us.

#Note: each call to this function costs about $0.0002.

# OpenAI chat-completions endpoint and model used by determine_wealthy_parents().
API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
# SECURITY: the secret key is read from the OPENAI_API_KEY environment variable
# instead of being committed with the notebook. A key that has been stored in
# a shared file must be treated as leaked and revoked.
API_KEY = os.environ.get("OPENAI_API_KEY", "")
MODEL_NAME = "gpt-3.5-turbo"

def determine_wealthy_parents(tennis_player_name):
    """Ask the chat model whether a tennis player had wealthy parents.

    Sends one chat-completion request to ``API_ENDPOINT`` using ``MODEL_NAME``
    and ``API_KEY``. The system prompt tells the model to answer only yes or
    no, to answer no when it lacks information, and defines wealthy as the
    top 1% of people in the parents' country.

    Args:
        tennis_player_name: Player name interpolated into the question.

    Returns:
        True if the reply contains the word "yes", False if it contains the
        word "no" (and not "yes"), and None when the request fails or the
        reply cannot be interpreted.
    """
    import re  # local import: the notebook's import cell does not load re

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }
    data = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant who only ever responds with yes or no answers. If you do not have enough information, answer no. Wealthy means the parents were probably in the top 1% of people in there country."},
            {"role": "user", "content": f"Did the tennis player {tennis_player_name}, have wealthy parents?"}
        ],
        "model": MODEL_NAME
    }

    try:
        # A timeout keeps one stalled request from hanging the whole loop.
        response = requests.post(API_ENDPOINT, headers=headers, json=data, timeout=30)
        chat_reply = response.json()['choices'][0]['message']['content']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network failure, non-JSON body, or an unexpected payload shape
        # (e.g. an error object or an empty 'choices' list).
        return None

    if not isinstance(chat_reply, str):
        return None

    # Compare whole words so that "know", "not" or "eyes" are not mistaken
    # for an answer.
    reply_words = set(re.findall(r"[a-z]+", chat_reply.lower()))
    if "yes" in reply_words:
        return True
    if "no" in reply_words:
        return False
    return None  # Unable to determine the response

# Scraping wiki to find parents. We can use this to check our results from ChatGPT.
# This function returns true if the player has a hyperlink to a family member in their profile.
# The assumption is that this will be a hyperlink to a parent, and any parent with a hyperlink will be wealthy
# (bit of a stretch, we know!)

# Function made with the help of Chat-GPT

def _is_article_link(href):
    """Return True for an href that contains '/wiki/' and is not a 'Wikipedia:' page."""
    return bool(href) and '/wiki/' in href and 'Wikipedia:' not in href


def parents_have_wiki(url):
    """Report whether a player's Wikipedia page links to a family member's page.

    Candidate links are collected from two places on the page at ``url``:

    * infobox rows whose header cell contains "Parent";
    * the first paragraph after the "Early life", "Personal life" or
      "Personal info" heading (first one found, in that order), keeping only
      links whose text contains the last word of the page title.

    Each candidate is then fetched from en.wikipedia.org.

    Args:
        url: Full URL of the player's Wikipedia article.

    Returns:
        True as soon as one candidate page has a first heading other than
        'Wikipedia'; False when there are no candidates or none qualifies.
    """
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    parents = []

    # Look for parents in the infobox.
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is not None:
        for row in infobox.find_all('tr'):
            th = row.find('th')
            td = row.find('td')
            # Header-only rows have no <td>; skip them instead of crashing.
            if th is None or td is None or 'Parent' not in th.text:
                continue
            for link in td.find_all('a'):
                href = link.get('href')
                if _is_article_link(href):
                    parents.append(href)

    # Find the first available biography section heading.
    personal_life_section = None
    for section_id in ('Early_life', 'Personal_life', 'Personal_info'):
        personal_life_section = soup.find('span', {'id': section_id})
        if personal_life_section is not None:
            break

    first_paragraph = None
    if personal_life_section is not None:
        first_paragraph = personal_life_section.find_next('p')

    # The page title supplies the surname used to pick out relatives.
    heading = soup.find('h1', {'id': 'firstHeading'})
    title_words = heading.text.split() if heading is not None else []

    if first_paragraph is not None and title_words:
        last_name = title_words[-1]
        for link in first_paragraph.find_all('a'):
            href = link.get('href')
            # Keep only links whose text shares the player's last name.
            if _is_article_link(href) and last_name in link.text:
                parents.append(href)

    # Check whether any candidate resolves to a real article.
    for parent in parents:
        parent_url = 'https://en.wikipedia.org' + parent
        parent_response = requests.get(parent_url, timeout=30)
        parent_soup = BeautifulSoup(parent_response.content, 'html.parser')
        parent_heading = parent_soup.find('h1', {'id': 'firstHeading'})
        if parent_heading is not None and parent_heading.text != 'Wikipedia':
            return True

    return False


#This method returns a player's win ratio (matches won / matches played)

def win_ratio(players_name, matches_df):
    """Return the share of a player's matches that were wins.

    Args:
        players_name: Value to look up in the ``Name`` column.
        matches_df: DataFrame with one row per player per match, holding at
            least the columns ``Name`` and ``win``.

    Returns:
        Matches won divided by matches played, or 0 when the player has no
        rows in ``matches_df``.
    """
    player_rows = matches_df.loc[matches_df["Name"] == players_name]
    total_played = len(player_rows)

    # No recorded matches: report 0 rather than dividing by zero.
    if not total_played:
        return 0

    matches_won = len(player_rows.loc[player_rows["win"] == True])
    return matches_won / total_played

#This method returns the net betting return for a player. It assumes the same stake is bet on each match.
#The bet is on the player to win the match.


def betting_returns(players_name, matches_df):
    """Return the net result of staking one unit on the player in every match.

    For each of the player's rows one unit is staked; when ``win`` is True
    the row's ``AvgWL`` value is added back (presumably the average decimal
    odds for that player - confirm against the dataset notes).

    The total is computed from the columns directly instead of re-reading
    cells by index label, so it also works when ``matches_df`` has duplicate
    index labels.

    Args:
        players_name: Value to look up in the ``Name`` column.
        matches_df: DataFrame with one row per player per match, holding at
            least the columns ``Name``, ``win`` and ``AvgWL``.

    Returns:
        Sum of ``AvgWL`` over won matches minus the number of matches, or 0
        when the player has no rows. A missing ``AvgWL`` on a won match makes
        the result NaN.
    """
    this_players_matches = matches_df[matches_df["Name"] == players_name]
    if this_players_matches.empty:
        return 0

    total_staked = len(this_players_matches)  # one unit per match
    won = this_players_matches["win"] == True
    # skipna=False keeps a missing odds value visible as NaN.
    total_paid_out = this_players_matches.loc[won, "AvgWL"].sum(skipna=False)
    return total_paid_out - total_staked
    
    

## Data Acquisition

### 1. Import players data from API.

# initialise APIs

# SECURITY: the RapidAPI key is read from the RAPIDAPI_KEY environment
# variable instead of being committed with the notebook. Keys that have been
# stored in a shared file must be treated as leaked and revoked.
api_key = os.environ.get('RAPIDAPI_KEY', '')

# initialise https://rapidapi.com/sportcontentapi/api/ultimate-tennis1 API
# Defined once: previously a second literal dict replaced the first one, so
# `api_key` was never actually sent.
ultimate_tennis_headers = {
    'X-RapidAPI-Key': api_key,
    'X-RapidAPI-Host': 'ultimate-tennis1.p.rapidapi.com'
}

#### Import ATP players and rankings

# Fetch the current ATP singles ranking list (200 players requested) and keep
# the 'data' payload as a DataFrame.
url = "https://ultimate-tennis1.p.rapidapi.com/rankings/atp/singles/200/current"

atp_payload = requests.get(url, headers=ultimate_tennis_headers).json()
top_atp_response = pd.DataFrame(atp_payload['data'])

#### Import WTA players and rankings

# Same request against the WTA ranking endpoint.
url = "https://ultimate-tennis1.p.rapidapi.com/rankings/wta/singles/200/current"

wta_payload = requests.get(url, headers=ultimate_tennis_headers).json()
top_wta_response = pd.DataFrame(wta_payload['data'])

top_atp_response

## 2. Import match results from the last 4 years from CSV files.
### read csv files

notes on what each column means can be found in ../datasets/datasets_notes.txt

data taken from [tennis-data](http://www.tennis-data.co.uk/alldata.php)

# WTA: one CSV per season, loaded in chronological order.
wta_dfs = [
    pd.read_csv('datasets/2019wta.csv'),
    pd.read_csv('datasets/2020wta.csv'),
    pd.read_csv('datasets/2021wta.csv'),
    pd.read_csv('datasets/2022wta.csv'),
    pd.read_csv('datasets/2023wta.csv'),
]
# Keep the per-season names available for ad-hoc inspection.
(
    wta2019_tornament_matches,
    wta2020_tornament_matches,
    wta2021_tornament_matches,
    wta2022_tornament_matches,
    wta2023_tornament_matches,
) = wta_dfs
# Stack all seasons; the original row labels of each file are kept.
wta_tournament_matches_wOdds = pd.concat(wta_dfs)

# ATP: same layout as the WTA files above.
atp_dfs = [
    pd.read_csv('datasets/2019atp.csv'),
    pd.read_csv('datasets/2020atp.csv'),
    pd.read_csv('datasets/2021atp.csv'),
    pd.read_csv('datasets/2022atp.csv'),
    pd.read_csv('datasets/2023atp.csv'),
]
(
    atp2019_tornament_matches,
    atp2020_tornament_matches,
    atp2021_tornament_matches,
    atp2022_tornament_matches,
    atp2023_tornament_matches,
) = atp_dfs
atp_tournament_matches_wOdds = pd.concat(atp_dfs)

## 3. Get top 200 players data into one dataframe

### DataFrame edits

#### Get  top 200 ATP and WTA players into a single DataFrame

# edit ATP dataframe to show basic player details
top_atp_response = top_atp_response.set_index('id')
top_atp_response = top_atp_response.drop(['Rank Diff', 'Age', 'Points'], axis=1)
top_atp_response["Association"] = "ATP"

# edit WTA players dataframe to show basic player details

# drop and normalise some columns so they line up with the ATP frame
top_wta_response = top_wta_response.drop(
    ['country', 'movement', 'rankedAt', 'points', 'tournamentsPlayed'], axis=1)
top_wta_response = top_wta_response.rename(columns={'ranking': 'Rank', 'name': 'Name'})
top_wta_response = top_wta_response.set_index('ID')
# Label the WTA rows too; previously only ATP rows had an Association value.
top_wta_response["Association"] = "WTA"

top_players = pd.concat([top_atp_response, top_wta_response])

# Prefer the cached players file when it exists (so we do not need to keep
# hitting the APIs); otherwise keep the frame just built from the API data.
if os.path.exists("players.csv"):
    top_players = pd.read_csv("players.csv", index_col=0)

# rename_axis returns a new frame, so the result has to be assigned.
top_players = top_players.rename_axis('ID')

# save players data to csv for later runs
top_players.to_csv('players.csv', index=True)

## 4. Query chat-GPT to see if players have wealthy parents

# add a wealthy_parents column
# (was assigned on the undefined name `players_df`, which raised NameError)
top_players["wealthy_parents"] = None

# loop through all players and record whether ChatGPT thinks they had wealthy
# parents; the result is True, False or None (no usable answer)
for index, player_name in top_players['Name'].items():
    top_players.at[index, 'wealthy_parents'] = determine_wealthy_parents(player_name)

## 5. Query Wiki to see if players have parents that have hyperlinks .
(assuming this will mean they have wealthy parents.)

# Split "First Last ..." at the first space only: everything after the first
# word is kept together as the last name. `n` must be passed by keyword;
# pandas 2 no longer accepts it positionally.
top_players[['First Name', 'Last Name']] = top_players['Name'].str.split(' ', n=1, expand=True)

# add a parent_on_wiki column

top_players["parent_on_wiki"] = None
top_players.info()

# scrape wiki for parents hyperlinks
#this takes a longtime.

def check_wikipedia_url(url):
    """Return True when a HEAD request to ``url`` answers with HTTP 200.

    Any other status code yields False.
    """
    return requests.head(url).status_code == 200

for index, full_name in top_players['Name'].items():
    if not isinstance(full_name, str):
        continue  # no usable name for this row

    # Replace every space with an underscore. Joining 'First Name' and
    # 'Last Name' only converted the first space, so multi-word surnames
    # produced a URL that still contained spaces, and one-word names failed
    # on a None last name.
    wiki_url = "https://en.wikipedia.org/wiki/" + full_name.replace(' ', '_')

    if check_wikipedia_url(wiki_url):
        print(wiki_url)
        wiki_parents = parents_have_wiki(wiki_url)
        print(wiki_parents)
        top_players.at[index, 'parent_on_wiki'] = wiki_parents


top_players.head()

## 6. Manipulate the match result data into a usable form.

#### edit tournaments with odds DataFrames

# join wta & atp tournament with betting odds DataFrames
all_tournament_matches_wOdds = pd.concat([wta_tournament_matches_wOdds, atp_tournament_matches_wOdds])

# Change NaN values to zero for specified columns
columns_to_fill = ['W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'ATP', 'WTA', 'Series']
all_tournament_matches_wOdds[columns_to_fill] = all_tournament_matches_wOdds[columns_to_fill].fillna(0)


# create a dataframe with just the winners of each match
# (.copy() after the column selection gives an independent frame to modify)
winner_tournament_matches_wOdds = all_tournament_matches_wOdds[[
       'WTA', 'Location', 'Tournament', 'Date', 'Tier', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'WRank', 'WPts',
       'Wsets', 'Comment', 'AvgW', 'ATP', 'Series']].copy()

# NOTE: 'WPts' is deliberately left un-renamed. The old mapping used the key
# 'Wpts', which never matched the column, and the later fill step still
# addresses the points columns as 'WPts' / 'LPts'.
winner_tournament_matches_wOdds = winner_tournament_matches_wOdds.rename(
    columns={'Winner': 'WinLoseName', 'WRank': 'WLRank',
             'Wsets': 'WLsets', 'AvgW': 'AvgWL'})

winner_tournament_matches_wOdds["win"] = True

# create a dataframe with just the losers of each match
loser_tournament_matches_wOdds = all_tournament_matches_wOdds[[
       'WTA', 'Location', 'Tournament', 'Date', 'Tier', 'Court', 'Surface',
       'Round', 'Best of', 'Loser', 'LRank', 'LPts',
       'Lsets', 'Comment', 'AvgL', 'ATP', 'Series']].copy()

# 'LPts' is left un-renamed for the same reason as 'WPts' above.
loser_tournament_matches_wOdds = loser_tournament_matches_wOdds.rename(
    columns={'Loser': 'WinLoseName', 'LRank': 'WLRank',
             'Lsets': 'WLsets', 'AvgL': 'AvgWL'})

loser_tournament_matches_wOdds["win"] = False

# One row per player per match. ignore_index gives a fresh 0..N-1 index; the
# concatenated CSVs repeat their row labels, which made the ID below
# non-unique.
match_results_per_player = pd.concat(
    [winner_tournament_matches_wOdds, loser_tournament_matches_wOdds],
    ignore_index=True)

# set index (1-based, unique per row)
match_results_per_player['ID'] = match_results_per_player.index + 1
match_results_per_player.set_index('ID', inplace=True)

match_results_per_player.tail()

## 7. Merge the match result data with the player data.
#### Merge all DataFrames into one

# get first initial from top_players to check against tournament details
top_players.loc[:, 'first_initial'] = top_players['First Name'].str[0]

# Split the match-file name into surname and initial at the LAST space, so
# multi-word surnames stay whole and can match top_players['Last Name'].
# NOTE(review): assumes the match files write names as "Surname I." - confirm
# against the dataset notes.
match_results_per_player[['Last Name', 'first_initial']] = (
    match_results_per_player['WinLoseName'].str.strip().str.rsplit(' ', n=1, expand=True)
)
match_results_per_player['first_initial'] = match_results_per_player['first_initial'].str.extract(r'(\w)')

# Left merge keeps every match row; players outside top_players get NaN in
# the player columns.
player_odds_df = match_results_per_player.merge(top_players, on=['Last Name', 'first_initial'], how='left')

# fill NA values

player_odds_df['Tier'] = player_odds_df['Tier'].fillna("no tier")
# Use the boolean False: the strings "False" / "Flase" used before are truthy
# and never equal to False in later comparisons.
player_odds_df['parent_on_wiki'] = player_odds_df['parent_on_wiki'].fillna(False)
player_odds_df['wealthy_parents'] = player_odds_df['wealthy_parents'].fillna(False)

zero_fill_columns = ['Best of', 'WLRank', 'WPts', 'WLsets', 'AvgWL', 'LPts', 'Rank']
player_odds_df[zero_fill_columns] = player_odds_df[zero_fill_columns].fillna(0)

player_odds_df.drop(['Last Name', 'first_initial',
                     'Association', 'First Name'], axis=1, inplace=True)

player_odds_df.to_csv('all_matches_player_odds.csv', index=True)
player_odds_df.head()

## Data Analysis and discussion
### 7. Compare the Wealth Parents queries from chat-GPT results with the results from Wiki, to see if they agree.



#### How to check the ChatGPT results?
To check how accurate they are, we query Wikipedia to see which players have parents with wiki pages. We can then compare the two: if ChatGPT says that all players whose parents have wiki pages are wealthy, this gives us some evidence that ChatGPT is telling the truth.

We are only looking at general trends, so we only need ChatGPT to be "about right"; it does not need to be 100% accurate for us to get some idea of whether wealthy parents play a role.

#### Limitations of chatGPT results
Better tennis players will have more written about them and their families, so a large language model will have more information about them. This may skew the results: there may be players who are not so good at tennis but who still have wealthy parents, and because there is not enough information about them ChatGPT will say no.

# Check whether players whose parents were found on Wikipedia are also
# considered to have wealthy parents by ChatGPT.

# Players flagged as having a parent (family member) page on Wikipedia.
all_wiki_parents = top_players[top_players['parent_on_wiki'] == True]

# Of those, the players ChatGPT does NOT consider to have wealthy parents.
test_wiki_vs_chatGPT = all_wiki_parents[all_wiki_parents['wealthy_parents'] == False]

# Share of wiki-parent players that ChatGPT disagrees about. Guard the
# division so an empty wiki-parent list does not raise ZeroDivisionError.
if len(all_wiki_parents) == 0:
    false_hit_ratio = float('nan')
else:
    false_hit_ratio = len(test_wiki_vs_chatGPT) / len(all_wiki_parents)
print("the ratio of players with parents that have wiki pages that chat gpt does vs does not think are wealthy are: " + str(false_hit_ratio))

#Todo// Make a info graphic about this 

#### Results of checking wealth via chatGPT vs wiki parents

The above shows that the two methods of telling whether a parent is wealthy do not align. 80% of the players that our function says have parents with wiki pages are not considered wealthy by ChatGPT. This is not a good test!

There are not many rows in either disputed list, so we manually checked 5 from each list; from this it seems the ChatGPT results are more accurate. The main problem with the wiki results seems to be that the function often picks up family members other than the parents.

Due to this desktop evaluation we will use the ChatGPT results for the rest of the analysis, noting that they have not been well checked.


#### Interesting things to note about the wiki parents method.
 It also picks up siblings that are famous (for example Elias_Ymer, who has a tennis-playing sibling), and other family members (for example Rafa's uncle, who is famous for being Rafa's coach).
It also sometimes picks up the wiki page of someone who shares the same name as the tennis player. Example: Alexander Zhurbin https://en.wikipedia.org/wiki/Alexander_Zhurbin the musician vs the tennis player https://en.wikipedia.org/wiki/Alexander_Zhurbin_(tennis)


### 8. Check if tennis players are more likely to have wealthy parents than the general population.

Find the % of players with wealthy parents.
Compare this to the general population. We told ChatGPT to treat wealthy as the top 1%, so let's see how this compares to our sample of players.


# Players flagged as having a parent (family member) page on Wikipedia.
has_wiki_parent = top_players['parent_on_wiki'] == True
all_wiki_parents = top_players[has_wiki_parent]

# Of those, the players ChatGPT does not consider to have wealthy parents.
chatgpt_says_not_wealthy = all_wiki_parents['wealthy_parents'] == False
test_wiki_vs_chatGPT = all_wiki_parents[chatgpt_says_not_wealthy]
test_wiki_vs_chatGPT.head()

# Mean of the wealthy_parents flags, expressed as a percentage.
percentage_wealthy = 100 * top_players['wealthy_parents'].mean()
percentage_wealthy

10% > 1%, so this suggests that tennis players come from wealthy families more often than the general population.

//todo add visulisation

### 9. Compare match win rates between Players with wealthy parents vs not wealthy parents.


#### Populate top_players with win_ratio

# Win ratio per player, computed from player_odds_df (one row per player per
# match). Built in one pass over the names instead of iterrows() with
# repeated .at lookups.
top_players["win_ratio"] = [
    win_ratio(player_name, player_odds_df) for player_name in top_players["Name"]
]
    

#### Find out if the average win_ratio is higer if the parents are wealthy

# Mean win ratio of the players flagged as having wealthy parents.
wealthy_parents_players_df = top_players.loc[top_players["wealthy_parents"] == True]
average_win_ratio_wealthy_parents = wealthy_parents_players_df["win_ratio"].mean()
print("Players with wealth parents have a win ratio of ",
      average_win_ratio_wealthy_parents, sep="")

# Mean win ratio of the players flagged as NOT having wealthy parents.
not_wealth_parents_players_df = top_players.loc[top_players["wealthy_parents"] == False]
average_win_ratio_not_wealthy_parents = not_wealth_parents_players_df["win_ratio"].mean()
print("Players who do not have wealth parents have a win ratio of ",
      average_win_ratio_not_wealthy_parents, sep="")



//todo create visulisation

### 10. Simulate betting on matches and see if this info is "priced in" to the betting odds.

To answer this, let's loop through all players with wealthy parents (according to ChatGPT) and see whether we would have made a profit if we had bet the same amount on all of their matches over the last 4 years.

# Net return per player from betting $1 on each of their matches in
# player_odds_df. Built in one pass over the names instead of iterrows()
# with repeated .at lookups.
top_players["bet_return"] = [
    betting_returns(player_name, player_odds_df) for player_name in top_players["Name"]
]

# Mean betting return of the players flagged as having wealthy parents.
wealthy_parents_players_df = top_players.loc[top_players["wealthy_parents"] == True]
average_betting_return_wealthy_parents = wealthy_parents_players_df["bet_return"].mean()
print("Players with wealth parents have an average betting return of ",
      average_betting_return_wealthy_parents, sep="")

# Mean betting return of the players flagged as NOT having wealthy parents.
not_wealth_parents_players_df = top_players.loc[top_players["wealthy_parents"] == False]
average_betting_return_not_wealthy_parents = not_wealth_parents_players_df["bet_return"].mean()
print("Players who do not have wealth parents have an average betting return of ",
      average_betting_return_not_wealthy_parents, sep="")

# Persist the enriched player table (index included) for later runs.
top_players.to_csv("players.csv")

#### Discussion
So it will not make you a winning bettor, but you will lose less if you take wealthy parents into consideration.
Todo: add graph


