In [1]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging

**1. Navigate to https://usapl.liftingdatabase.com/competitions. Using BeautifulSoup, extract information from each competition's website.** 

General notes from the competitions' default database:

- "All" for Type and State has the following HTML: https://usapl.liftingdatabase.com/competitions-default?t=&s=. 
- Changing the Type but keeping State as "All" changes the number after 't ='
    - T =  
        - International = 10 
        - IPF = 5
        - Local = 3
        - NAPF = 7
        - National = 2
        - Pro Meet = 9 
        - Pro Series = 8
        - Regional = 4
        - State = 6

**Code to retrieve results from a competition's url:**

In [2]:
def extract_category(td_tag):
    th_tag_before = td_tag.find_previous('th')  # Find the previous th tag
    th_tag_after = td_tag.find_next('th')  # Find the next th tag

    if th_tag_before and th_tag_after:
        # If the td tag is between two th tags, retrieve the previous th element
        return th_tag_before.get_text().strip()
    elif th_tag_before:
        # If the td tag is after a th tag, retrieve the previous th element
        return th_tag_before.get_text().strip()
    elif th_tag_after:
        # If the td tag is before a th tag, retrieve the next th element
        return th_tag_after.get_text().strip()
    else:
        return None

def retrieve_info(url):
    """
    This function retrieves data from the USAPL database and creates a DataFrame.
    """
    retry = Retry(total = 5,
                         backoff_factor = 0.5,
                         status_forcelist = [429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries = retry)
    try:
        response = requests.get(url, timeout = 5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text)    

            results = []

            # Main html element where data is located 
            content = soup.find('div', id="content")

            if content:
                tables = content.findAll('table')

                if tables: 
                    meet_info = tables[0]
                    meet_results = tables[1]

                    # This is the table with competitor results
                    if meet_results:
                        tr_tags = meet_results.findAll('tr')

                        for tr_tag in tr_tags:
                            td_tags = tr_tag.findAll('td')

                            if len(td_tags) >= 2:
                                category = extract_category(td_tags[0])
                                weight_class = td_tags[0].get_text().strip().replace('-', '')
                                placement = td_tags[1].get_text().strip().replace('.', '')
                                name = td_tags[2].get_text().strip()
                                yob = td_tags[3].get_text().strip()
                                team = td_tags[4].get_text().strip()
                                state = td_tags[5].get_text().strip()
                                lot = td_tags[6].get_text().strip()
                                weight = td_tags[7].get_text().strip()
                                squat_1 = td_tags[8].get_text().strip()
                                squat_2 = td_tags[9].get_text().strip()
                                squat_3 = td_tags[10].get_text().strip()
                                bench_1 = td_tags[11].get_text().strip()
                                bench_2 = td_tags[12].get_text().strip()
                                bench_3 = td_tags[13].get_text().strip()
                                deadlift_1 = td_tags[14].get_text().strip()
                                deadlift_2 = td_tags[15].get_text().strip()
                                deadlift_3 = td_tags[16].get_text().strip()
                                total = td_tags[17].get_text().strip()
                                points = td_tags[18].get_text().strip()
                                drug_tested = td_tags[19].get_text().strip()

                                # Find the previous th tag for the event
                                event_tag = tr_tag.find_previous('th', 'competition_view_event')

                                if event_tag: 
                                    event = event_tag.get_text().strip()

                                else: 
                                    event = None

                                # Create a dictionary for each row
                                meet_results_data = {
                                    'Event': event,
                                    'Category': category,
                                    'Weight Class': weight_class,
                                    'Placement': placement,
                                    'Name': name,
                                    'Year of Birth': yob,
                                    'Team': team,
                                    'State': state,
                                    'Lot': lot,
                                    'Weight': weight,
                                    'Squat 1': squat_1,
                                    'Squat 2': squat_2,
                                    'Squat 3': squat_3,
                                    'Bench Press 1': bench_1,
                                    'Bench Press 2': bench_2,
                                    'Bench Press 3': bench_3,
                                    'Deadlift 1': deadlift_1,
                                    'Deadlift 2': deadlift_2,
                                    'Deadlift 3': deadlift_3,
                                    'Total': total,
                                    'Points': points,
                                    'Drug-Tested': drug_tested
                                }

                                # Append the dictionary to the results list
                                results.append(meet_results_data)

                        # Create a DataFrame from the results list
                        meet_results_df = pd.DataFrame(results)

                    # This is table with meet information
                    if meet_info:
                        tr_tags = meet_info.findAll('tr')
                        
                        meet_info_data = {}

                        for tr_tag in tr_tags:
                            th_tag = tr_tag.find('th')
                            td_tag = tr_tag.find('td')

                            if th_tag and td_tag:
                                category = th_tag.get_text().strip()
                                value = td_tag.get_text().strip()

                                # Map category to corresponding variable
                                if 'Date' in category:
                                    meet_info_data['Date'] = value if value else None
                                elif 'Sanction #' in category:
                                    meet_info_data['Sanction #'] = value if value else None
                                elif 'State' in category:
                                    meet_info_data['State'] = value if value else None
                                elif 'Meet Director' in category:
                                    meet_info_data['Meet Director'] = value if value else None

                        # Use the extracted data from meet_info_data dictionary
                        date = meet_info_data.get('Date')
                        sanction_num = meet_info_data.get('Sanction #')
                        state = meet_info_data.get('State')
                        meet_director = meet_info_data.get('Meet Director')

                        # Repeat the values for each row in meet_results_df 
                        meet_info_df = pd.DataFrame({
                            'Meet Date': [date] * len(results),
                            'Sanction Number': [sanction_num] * len(results),
                            'Meet Location': [state] * len(results),
                            'Meet Director': [meet_director] * len(results)
                        }, index= meet_results_df.index)

                        # Concatenate the dataframes along the columns 
                        results_df = pd.concat([meet_info_df, meet_results_df], axis=1)
                    
                    else: 
                        print(f'Error retrieving tr tag in meet_info table element from {url}')
                        
                    # Retrieve meet name from h3 heading
                    # Create a new column to add to results_df
                    meet_name = content.find('h3').get_text(strip=True)
                    meet_name_df = pd.DataFrame({
                        'Meet Name': [meet_name] * len(results)
                    })

                    # Concatenate the dataframes along the columns 
                    results_df = pd.concat([results_df, meet_name_df], axis = 1)                        

                    #Reorder columns in the DataFrame 
                    column_order = [
                        'Meet Date', 'Sanction Number', 'Meet Location', 'Meet Name', 'Meet Director', 
                        'Event', 'Category', 'Weight Class', 'Placement', 'Name', 
                        'Year of Birth', 'Team', 'State', 'Lot', 'Weight', 
                        'Squat 1', 'Squat 2', 'Squat 3', 'Bench Press 1', 
                        'Bench Press 2', 'Bench Press 3', 'Deadlift 1', 
                        'Deadlift 2', 'Deadlift 3', 'Total', 'Points', 'Drug-Tested'
                    ]
                    results_df = results_df[column_order]
                    
                    print('Success!')
                    return results_df

            else: 
                print(f'Error retrieving table element from {url}')
        else: 
            print(f'Error retrieving {url}')

    except Exception as e:
        print(f'Error retrieving data from {url}: {e}')

**Practice run with definition functions**

In [3]:
url = 'https://usapl.liftingdatabase.com/competitions-view?id=1375'
df = retrieve_info(url)
display(df)

Success!


Unnamed: 0,Meet Date,Sanction Number,Meet Location,Meet Name,Meet Director,Event,Category,Weight Class,Placement,Name,...,Squat 3,Bench Press 1,Bench Press 2,Bench Press 3,Deadlift 1,Deadlift 2,Deadlift 3,Total,Points,Drug-Tested
0,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,47,6,Allegra Hudson,...,-155.0,75.0,82.5,-87.5,147.5,157.5,162.5,390.0,526.11,
1,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,52,7,Juanita Nájera,...,-162.5,87.5,95.0,100.0,155.0,165.0,-172.5,422.5,531.42,
2,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,57,5,Rowena Lopez,...,185.0,125.0,132.5,137.5,182.5,192.5,197.5,520.0,607.57,
3,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,63,9,Katie Van Dusen,...,215.0,120.0,125.0,130.0,165.0,175.0,-180.0,520.0,561.18,
4,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,72,2,Priscilla Ribic,...,245.0,-145.0,-145.0,145.0,227.5,242.5,-248.5,632.5,634.27,
5,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,84,1,Liane Blyn,...,245.0,175.0,182.5,187.5,-207.5,207.5,212.5,645.0,584.63,
6,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Female - Open,84+,1,Bonica Brown,...,297.5,185.0,197.5,-202.5,210.0,227.5,230.0,725.0,572.68,
7,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Male - Open,59,8,Chris Tran,...,240.0,140.0,147.5,152.5,235.0,245.0,-252.5,637.5,553.92,
8,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Male - Open,66,2,Charles Okpoko,...,307.5,190.0,197.5,200.0,245.0,257.5,-267.5,765.0,602.9,
9,11/14/2016 - 11/19/2016,,Florida,2016 IPF Open Worlds,Robert Keller,,Male - Open,74,7,Nathan Walton,...,-297.5,-177.5,-180.0,180.0,290.0,302.5,-312.5,760.0,547.73,


**2. Create DataFrame of all competitions and their URL's in USAPL Database.** 

This DataFrame contains information of ALL competitions in the USAPL databasa, their competition type, date the meet occurred, sanction number, and the meet's director. 

In [4]:
list_of_urls = [
    'https://usapl.liftingdatabase.com/competitions-default?t=5&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=3&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=7&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=2&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=9&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=8&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=4&s=',
    'https://usapl.liftingdatabase.com/competitions-default?t=6&s='
]

competition_types = ['IPF', 'Local', 'NAPF', 'National', 
                     'Pro Meet', 'Pro Series', 'Regional', 'State']

all_competitions = []

# Go through list of URL and collect data
for url, competition_type in zip(list_of_urls, competition_types):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # List to collect information from each competition 
        competition_info = []
        
        # Retrieve general information from list of competitions
        tabledata = soup.find('table', 'tabledata')
        if tabledata:
            print('Found tabledata')
            tr_tags = tabledata.findAll('tr')
            
            for tr_tag in tr_tags:
                td_tags = tr_tag.findAll('td')

                if td_tags:
                    date = td_tags[0].get_text().strip()
                    name = td_tags[1].get_text().strip()
                    anchor_tag = td_tags[1].find('a')
                    href = anchor_tag.get('href') if anchor_tag else None
                    sanction_num = td_tags[2].get_text().strip()
                    state = td_tags[3].get_text().strip()
                    
                    # Create a dictionary for each row
                    meet_type_data = {
                        'Date': date,
                        'Name': name,
                        'Meet Type': competition_type,
                        'Sanction Number': sanction_num,
                        'State': state,
                        'Website': 'https://usapl.liftingdatabase.com/' + str(href)
                    }

                    # Append dictionary to results list
                    competition_info.append(meet_type_data)
                    
        else:
            print(f'Error retrieving URL: {url}')

        # Append competition_info to overall list 
        all_competitions.extend(competition_info)
        print(all_competitions)

# Create a DataFrame from the results list
competition_type_df = pd.DataFrame(all_competitions)
print('DataFrame created from list of USAPL competitions.')

# Display and save DataFrame as csv
competition_type_df

# competition_type_df.to_csv('../data/USAPL_competitions_new.csv', index = False)
# print('DataFrame saved as csv file in data folder.')


Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Found tabledata
[{'Date': '09/23/2021', 'Name': "World Men's Classic Championships", 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=120068'}, {'Date': '03/08/2020', 'Name': 'Arnold SBD Pro American', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-07', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2794'}, {'Date': '03/07/2020', 'Name': 'Arnold Con-Cret Pro Bench Bash', 'Meet Type': 'IPF', 'Sanction Number': 'USAPL-2020-02', 'State': 'Ohio', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2789'}, {'Date': '11/18/2019', 'Name': '40th World Open Championships Dubai UAE', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State': '', 'Website': 'https://usapl.liftingdatabase.com/competitions-view?id=2661'}, {'Date': '10/07/2019', 'Name': 'World Masters Equipped Championships Potchefstroom South Africa', 'Meet Type': 'IPF', 'Sanction Number': 'IPF', 'State':

Unnamed: 0,Date,Name,Meet Type,Sanction Number,State,Website
0,09/23/2021,World Men's Classic Championships,IPF,IPF,,https://usapl.liftingdatabase.com/competitions...
1,03/08/2020,Arnold SBD Pro American,IPF,USAPL-2020-07,Ohio,https://usapl.liftingdatabase.com/competitions...
2,03/07/2020,Arnold Con-Cret Pro Bench Bash,IPF,USAPL-2020-02,Ohio,https://usapl.liftingdatabase.com/competitions...
3,11/18/2019,40th World Open Championships Dubai UAE,IPF,IPF,,https://usapl.liftingdatabase.com/competitions...
4,10/07/2019,World Masters Equipped Championships Potchefst...,IPF,IPF,,https://usapl.liftingdatabase.com/competitions...
...,...,...,...,...,...,...
3263,12/12/2015,2015 USA Powerlifting Idaho State Powerlifting...,State,ID-2015-02,Idaho,https://usapl.liftingdatabase.com/competitions...
3264,12/12/2015,USA Powerlifting South Carolina State Champion...,State,SC-2015-02,South Carolina,https://usapl.liftingdatabase.com/competitions...
3265,12/05/2015,USAPL Colorado State Championships,State,CO-2015-05,Colorado,https://usapl.liftingdatabase.com/competitions...
3266,03/14/2015,SE States BP,State,NS-2015-04,Georgia,https://usapl.liftingdatabase.com/competitions...


**3. Using list of url's from the previous code, retrieve results from each competition using retrieve_info function.**

Use URL's in Website column of competition_type_df to retrieve competition results. 

In [5]:
url_list = competition_type_df
dfs = []

# Iterate through each complete URL
for url in url_list['Website']:
    df = retrieve_info(url)
    dfs.append(df)
    time.sleep(0.5)
    
# Concatenate all DataFrames into one
USAPL_powerlifting_df = pd.concat(dfs, ignore_index=True)

USAPL_powerlifting_df.to_csv('../data/usapl_newnew.csv', index = False)
print('DataFrame saved as csv file in data folder.')

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Error retrieving data from https://usapl.liftingdatabase.com/competitions-view?id=120939: HTTPSConnectionPool(host='usapl.liftingdatabase.com', port=443): Read tim

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
S

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
S

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Error retrieving data from https://usapl.liftingdatabase.com/competitions-view?id=1309: HTTPSConnectionPool(host='usapl.liftingdatabase.com', port=443): Read timed out. (read timeout=5)
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Succ

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
S

**Webscrape errors:** 
- Cannot open google spreadsheets
    - Resolved this by making sure the first href in an anchor tag was being retrieved. 
    
- 'NoneType' object has no attribute 'get_text' 
    - Resolved this issue by adding a None value to Event column in original retrieve_info function

- list index out of range (1) 
    - Used table[2] because table[1] is an 'Attachment' table

- Missing information ('None') in meet_type_data table. 
    - Resolved this issue by editing if statement from 4 to 1 (if len(tr_tags) >= 1). 

- Error retrieving data from https://usapl.liftingdatabase.com/competitions-view?id=1823: HTTPSConnectionPool(host='usapl.liftingdatabase.com', port=443): Max retries exceeded with url: /competitions-view?id=1823 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000028D4B82B650>, 'Connection to usapl.liftingdatabase.com timed out. (connect timeout=None)'))
    - Added timeout to requests call 
    
            response = requests.get(url, timeout = 10)
            
    - Added Retry
    
            retry = Retry(total = 5,
                         backoff_factor = 0.5,
                         status_forcelist = [429, 500, 502, 503, 504])
            adapter = HTTPAdapter(max_retries = retry)
- Error retrieving tr tag in meet_info table element from {url}
    - Resolved this by extracting value in meet_info_table if it contains specific text within tr tag 
- Error retrieving data from {url}: cannot access local variable 'date' where it is not associated with a value
    - Resolved this by extracting value in meet_info_table if it contains specific text within tr tag
- Error retrieving data from {url} HTTPSConnectionPool(host='usapl.liftingdatabase.com', port=443): Read timed out. (read timeout=5)
    - Notate the url to rerun later. 


**4. Rerun webscraping function for errors**

**Timed-out errors:**

In [6]:
url_list = ['https://usapl.liftingdatabase.com/competitions-view?id=120939:',
           'https://usapl.liftingdatabase.com/competitions-view?id=120919:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120662:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120420:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120421:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2527:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2530:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2518:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2392:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1654:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1430:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1431:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1433:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1483:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1437:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1487:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1425:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1417:',
            'https://usapl.liftingdatabase.com/competitions-view?id=1309:',
            'https://usapl.liftingdatabase.com/competitions-view?id=902:',
            'https://usapl.liftingdatabase.com/competitions-view?id=909:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120507:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120649:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2552:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2553:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2547:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120907:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120909:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120910:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120922:',
            'https://usapl.liftingdatabase.com/competitions-view?id=120882:',
            'https://usapl.liftingdatabase.com/competitions-view?id=2011:'
           ]

dfs = []

# Iterate through each complete URL
for url in url_list:
    df = retrieve_info(url)
    dfs.append(df)
    time.sleep(0.5)
    
# Concatenate all DataFrames into one
USAPL_powerlifting_df = pd.concat(dfs, ignore_index=True)

USAPL_powerlifting_df.to_csv('../data/usapl_timed_out.csv', index = False)
print('DataFrame saved as csv file in data folder.')

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Error retrieving data from https://usapl.liftingdatabase.com/competitions-view?id=1654:: list index out of range
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Error retrieving data from https://usapl.liftingdatabase.com/competitions-view?id=2011:: list index out of range
DataFrame saved as csv file in data folder.


**Investigate "list index out of range" error.**

- https://usapl.liftingdatabase.com/competitions-view?id=1654::
- https://usapl.liftingdatabase.com/competitions-view?id=2011::

Note: Table[1] is an attachment table. Table[2] is results table. Redefine BeautifulSoup code. 

In [7]:
def extract_category(td_tag):
    th_tag_before = td_tag.find_previous('th')  # Find the previous th tag
    th_tag_after = td_tag.find_next('th')  # Find the next th tag

    if th_tag_before and th_tag_after:
        # If the td tag is between two th tags, retrieve the previous th element
        return th_tag_before.get_text().strip()
    elif th_tag_before:
        # If the td tag is after a th tag, retrieve the previous th element
        return th_tag_before.get_text().strip()
    elif th_tag_after:
        # If the td tag is before a th tag, retrieve the next th element
        return th_tag_after.get_text().strip()
    else:
        return None

def retrieve_info(url):
    """
    This function retrieves data from the USAPL database and creates a DataFrame.
    """
    retry = Retry(total = 5,
                         backoff_factor = 0.5,
                         status_forcelist = [429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries = retry)
    try:
        response = requests.get(url, timeout = 5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text)    

            results = []

            # Main html element where data is located 
            content = soup.find('div', id="content")

            if content:
                tables = content.findAll('table')

                if tables: 
                    meet_info = tables[0]
                    meet_results = tables[2]

                    # This is the table with competitor results
                    if meet_results:
                        tr_tags = meet_results.findAll('tr')

                        for tr_tag in tr_tags:
                            td_tags = tr_tag.findAll('td')

                            if len(td_tags) >= 2:
                                category = extract_category(td_tags[0])
                                weight_class = td_tags[0].get_text().strip().replace('-', '')
                                placement = td_tags[1].get_text().strip().replace('.', '')
                                name = td_tags[2].get_text().strip()
                                yob = td_tags[3].get_text().strip()
                                team = td_tags[4].get_text().strip()
                                state = td_tags[5].get_text().strip()
                                lot = td_tags[6].get_text().strip()
                                weight = td_tags[7].get_text().strip()
                                squat_1 = td_tags[8].get_text().strip()
                                squat_2 = td_tags[9].get_text().strip()
                                squat_3 = td_tags[10].get_text().strip()
                                bench_1 = td_tags[11].get_text().strip()
                                bench_2 = td_tags[12].get_text().strip()
                                bench_3 = td_tags[13].get_text().strip()
                                deadlift_1 = td_tags[14].get_text().strip()
                                deadlift_2 = td_tags[15].get_text().strip()
                                deadlift_3 = td_tags[16].get_text().strip()
                                total = td_tags[17].get_text().strip()
                                points = td_tags[18].get_text().strip()
                                drug_tested = td_tags[19].get_text().strip()

                                # Find the previous th tag for the event
                                event_tag = tr_tag.find_previous('th', 'competition_view_event')

                                if event_tag: 
                                    event = event_tag.get_text().strip()

                                else: 
                                    event = None

                                # Create a dictionary for each row
                                meet_results_data = {
                                    'Event': event,
                                    'Category': category,
                                    'Weight Class': weight_class,
                                    'Placement': placement,
                                    'Name': name,
                                    'Year of Birth': yob,
                                    'Team': team,
                                    'State': state,
                                    'Lot': lot,
                                    'Weight': weight,
                                    'Squat 1': squat_1,
                                    'Squat 2': squat_2,
                                    'Squat 3': squat_3,
                                    'Bench Press 1': bench_1,
                                    'Bench Press 2': bench_2,
                                    'Bench Press 3': bench_3,
                                    'Deadlift 1': deadlift_1,
                                    'Deadlift 2': deadlift_2,
                                    'Deadlift 3': deadlift_3,
                                    'Total': total,
                                    'Points': points,
                                    'Drug-Tested': drug_tested
                                }

                                # Append the dictionary to the results list
                                results.append(meet_results_data)

                        # Create a DataFrame from the results list
                        meet_results_df = pd.DataFrame(results)

                    # This is table with meet information
                    if meet_info:
                        tr_tags = meet_info.findAll('tr')
                        
                        meet_info_data = {}

                        for tr_tag in tr_tags:
                            th_tag = tr_tag.find('th')
                            td_tag = tr_tag.find('td')

                            if th_tag and td_tag:
                                category = th_tag.get_text().strip()
                                value = td_tag.get_text().strip()

                                # Map category to corresponding variable
                                if 'Date' in category:
                                    meet_info_data['Date'] = value if value else None
                                elif 'Sanction #' in category:
                                    meet_info_data['Sanction #'] = value if value else None
                                elif 'State' in category:
                                    meet_info_data['State'] = value if value else None
                                elif 'Meet Director' in category:
                                    meet_info_data['Meet Director'] = value if value else None

                        # Use the extracted data from meet_info_data dictionary
                        date = meet_info_data.get('Date')
                        sanction_num = meet_info_data.get('Sanction #')
                        state = meet_info_data.get('State')
                        meet_director = meet_info_data.get('Meet Director')

                        # Repeat the values for each row in meet_results_df 
                        meet_info_df = pd.DataFrame({
                            'Meet Date': [date] * len(results),
                            'Sanction Number': [sanction_num] * len(results),
                            'Meet Location': [state] * len(results),
                            'Meet Director': [meet_director] * len(results)
                        }, index= meet_results_df.index)

                        # Concatenate the dataframes along the columns 
                        results_df = pd.concat([meet_info_df, meet_results_df], axis=1)
                    
                    else: 
                        print(f'Error retrieving tr tag in meet_info table element from {url}')
                        
                    # Retrieve meet name from h3 heading
                    # Create a new column to add to results_df
                    meet_name = content.find('h3').get_text(strip=True)
                    meet_name_df = pd.DataFrame({
                        'Meet Name': [meet_name] * len(results)
                    })

                    # Concatenate the dataframes along the columns 
                    results_df = pd.concat([results_df, meet_name_df], axis = 1)                        

                    #Reorder columns in the DataFrame 
                    column_order = [
                        'Meet Date', 'Sanction Number', 'Meet Location', 'Meet Name', 'Meet Director', 
                        'Event', 'Category', 'Weight Class', 'Placement', 'Name', 
                        'Year of Birth', 'Team', 'State', 'Lot', 'Weight', 
                        'Squat 1', 'Squat 2', 'Squat 3', 'Bench Press 1', 
                        'Bench Press 2', 'Bench Press 3', 'Deadlift 1', 
                        'Deadlift 2', 'Deadlift 3', 'Total', 'Points', 'Drug-Tested'
                    ]
                    results_df = results_df[column_order]
                    
                    print('Success!')
                    return results_df

            else: 
                print(f'Error retrieving table element from {url}')
        else: 
            print(f'Error retrieving {url}')

    except Exception as e:
        print(f'Error retrieving data from {url}: {e}')

In [9]:
url_list = ['https://usapl.liftingdatabase.com/competitions-view?id=1654',
           'https://usapl.liftingdatabase.com/competitions-view?id=2011'
           ]

dfs = []

# Iterate through each complete URL
for url in url_list:
    df = retrieve_info(url)
    dfs.append(df)
    time.sleep(0.5)
    
# Concatenate all DataFrames into one
USAPL_powerlifting_df = pd.concat(dfs, ignore_index=True)

USAPL_powerlifting_df.to_csv('../data/usapl_list_index.csv', index = False)
print('DataFrame saved as csv file in data folder.')

Success!
Success!
DataFrame saved as csv file in data folder.
