In [1]:
import requests
import dataclasses
import typing
import os
import aiohttp
import asyncio
import time
import dataclasses

import pandas as pd
import bs4

In [2]:
data = pd.DataFrame(columns=["url_suffix", "athlete_element", "table_elements"])

get the html table row elements that represent each athlete in the fighter list page

In [3]:
res = requests.get("https://www.bjjheroes.com/a-z-bjj-fighters-list")
soup = bs4.BeautifulSoup(res.text, 'html.parser')
table = soup.find_all('tr')
for row in table:
    row_elements = row.find_all('td')
    if not row_elements:
        continue
    suffix = row_elements[0].find('a').get('href')
    data.loc[len(data)] = {
        "url_suffix": suffix,
        "athlete_element": str(row),
        "table_elements": None,
    }

In [11]:
async def get_tables(session, url_suffix):
    url=f"https://www.bjjheroes.com{url_suffix}"
    async with session.get(url) as response:
        try:
            soup = bs4.BeautifulSoup(await response.text(), 'html.parser')
            tables = soup.find_all('table')
            if tables is None or len(tables) < 1:
                print(f'{url_suffix}: {data[data["url_suffix" == url_suffix]]["table"]})')
                data[data["url_suffix" == url_suffix]]["table"] =  None
                
            elif len(tables) > 1:
                # this should send an alert through cloudwatch
                print(f"{url} has more than 1 table. Found this many: {len(tables)}")
                print("returning the first table found as a fallback")
            data[data["url_suffix" == url_suffix]]["table"] = tables[0]
        except Exception as e:
            data[data["url_suffix" == url_suffix]]["table"] =  None
            print(f"could not scrape {url}")
            print("due to the following error")
            print(e)
 
async def scrape(rows):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for row in rows:
            tasks.append(get_tables(session, row)) 
        result = await asyncio.gather(*tasks)
        return result

In [12]:
urls = list(data.url_suffix)
total = len(urls)
step = 200
total_start = time.time()
for i in range(0, total, step):
    start = time.time()
    print(f"{(i/total)*100}% done - {i} / {total}")
    await scrape(urls[i: i+step])
    print(f"========= step time {(time.time() - start):.2f} seconds")
    print(f"========= total time {(time.time() - total_start):.2f} seconds")
    print(f"========= rate {(i+step)/(time.time() - total_start):.2f} per second")

0.0% done - 0 / 1336


KeyError: False

In [None]:
# convert data to string:
string_data = []
for athlete_row, table_element in final_results:
    print(athlete_row)
    str_athlete_row = str(athlete_row)
    if table_element:
        str_tables = str(table_element)
    else:
        str_tables = None
    string_data.append((str_athlete_row, str_tables))

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import boto3
from io import BytesIO
from datetime import date


In [None]:
# Convert DataFrame to Parquet format
table = pa.Table.from_pandas(data)

In [None]:
# Write Parquet data to BytesIO buffer
parquet_buffer = BytesIO()
pq.write_table(table, parquet_buffer)
parquet_buffer.seek(0)

In [None]:
# Upload Parquet file to S3
file_path = f'bjjheroes-scrape-v1/{date}.parquet')
s3 = boto3.client('s3')
s3.upload_fileobj(parquet_buffer, 'bjjstats', file_path)