In [1]:
import requests
import bs4
import dataclasses
import typing
import psycopg2
import os
import dataclasses

In [2]:
@dataclasses.dataclass
class Athlete:
    """An athlete record paired with a profile URL.

    Built by ``get_athletes`` from ``(id, name, url)`` rows and by
    ``create_athletes`` for freshly inserted names.
    """

    # Value of ``athlete.id`` in the database.
    id: int
    # Athlete name as stored in / sent to the ``athlete`` table.
    name: str
    # URL from the ``url`` table; ``create_athletes`` fills in the
    # placeholder 'no link' instead.
    url: str

In [3]:
# The connection string comes from the environment, so no credentials live in the notebook.
DB_URL = os.getenv("DB_URL")

# Single module-level connection shared by every database helper in this notebook.
connection = psycopg2.connect(DB_URL)

In [31]:
def get_athletes():
    """Load every athlete that has a matching row in the ``url`` table.

    Returns:
        A list of ``Athlete`` objects, one per joined ``(athlete, url)`` row.
    """
    query = (
        "SELECT a.id, a.name, u.url"
        " FROM athlete a"
        " JOIN url u on a.id = u.athlete_id;"
    )
    with connection.cursor() as cur:
        cur.execute(query)
        records = cur.fetchall()
        # Close the transaction that psycopg2 opened for the SELECT.
        connection.commit()

    loaded = []
    for athlete_id, name, url in records:
        loaded.append(Athlete(athlete_id, name, url))
    return loaded

In [5]:
# Load all athletes (with URLs) from the database once, for use in later cells.
athletes = get_athletes()

In [6]:
# Echo the loaded athletes as the cell output (this prints the whole list).
athletes

[Athlete(id=2, name='Aaron Johnson ', url='https://www.bjjheroes.com/?p=9246'),
 Athlete(id=3, name='Abdurakhman Bilarov ', url='https://www.bjjheroes.com/?p=8494'),
 Athlete(id=4, name='Abmar Barbosa', url='https://www.bjjheroes.com/?p=390'),
 Athlete(id=8, name='Adam Kayoom', url='https://www.bjjheroes.com/?p=1133'),
 Athlete(id=9, name='Adam Wardzinski', url='https://www.bjjheroes.com/?p=13245'),
 Athlete(id=10, name='Ademir Barreto', url='https://www.bjjheroes.com/?p=7478'),
 Athlete(id=11, name='Admilson Brites', url='https://www.bjjheroes.com/?p=6339'),
 Athlete(id=12, name='Admilson Gobi Junior', url='https://www.bjjheroes.com/?p=8968'),
 Athlete(id=13, name='Aniel Bonifacio', url='https://www.bjjheroes.com/?p=9541'),
 Athlete(id=14, name='Adele Fornarino', url='https://www.bjjheroes.com/?p=13498'),
 Athlete(id=15, name='Adriana Martins', url='https://www.bjjheroes.com/?p=699'),
 Athlete(id=16, name='Adriano Martins', url='https://www.bjjheroes.com/?p=1116'),
 Athlete(id=17, nam

In [7]:
# Small sample of athlete page URLs used to try the scraper by hand
# before running it over every athlete.
test_set = [
    "https://www.bjjheroes.com/?p=425",
    "https://www.bjjheroes.com/?p=144",
    "https://www.bjjheroes.com/?p=9246",
    "https://www.bjjheroes.com/?p=13245",
    "https://www.bjjheroes.com/?p=6550",
]

In [8]:
@dataclasses.dataclass
class Performance:
    """One athlete's outcome in one match (a row for the ``performance`` table)."""

    # Id of the athlete this result belongs to.
    athlete_id: int
    # Id of the match the result belongs to.
    # NOTE(review): annotated ``str``, but the match ids printed elsewhere in
    # this notebook look like integers — confirm the intended type.
    match_id: str
    # Outcome code; scrape_matches() works with 'W', 'L' and 'D'.
    result: str

In [32]:
def insert_performance_rows(performances: typing.List[Performance]):
    """Bulk-insert performance rows with a single ``INSERT`` statement.

    Args:
        performances: Rows to insert; each one supplies ``athlete_id``,
            ``match_id`` and ``result``.

    Returns:
        None. An empty ``performances`` list is a no-op.
    """
    if not performances:
        # "VALUES" followed by nothing is a SQL syntax error, so skip the round trip.
        return
    with connection.cursor() as cursor:
        # cursor.mogrify() returns bytes on Python 3, so the rendered tuples
        # must be joined with a bytes separator (str.join raises TypeError).
        values_sql = b",".join(
            cursor.mogrify("(%s,%s,%s)", (p.athlete_id, p.match_id, p.result))
            for p in performances
        )
        cursor.execute(
            b"INSERT INTO performance (athlete_id, match_id, result)"
            b" VALUES " + values_sql
        )
    connection.commit()

In [33]:
def create_athletes(names: typing.List[str]):
    """Insert one ``athlete`` row per name and return the created athletes.

    Args:
        names: Athlete names to insert.

    Returns:
        A list of ``Athlete`` objects pairing each returned id with the
        corresponding name; ``url`` is set to the placeholder ``'no link'``.
        An empty ``names`` list returns ``[]`` without touching the database.
    """
    if not names:
        # "VALUES" followed by nothing is a SQL syntax error.
        return []
    with connection.cursor() as cursor:
        # cursor.mogrify() returns bytes on Python 3, so join with a bytes
        # separator and keep the whole statement as bytes.
        values_sql = b",".join(cursor.mogrify("(%s)", (n,)) for n in names)
        cursor.execute(
            b"INSERT INTO athlete (name)"
            b" VALUES " + values_sql
            + b" RETURNING id"
        )
        ids = [row[0] for row in cursor.fetchall()]
        # Ids are paired with names by position.
        # NOTE(review): assumes RETURNING yields rows in VALUES order — confirm.
        new_athletes = [
            Athlete(athlete_id, name, 'no link')
            for athlete_id, name in zip(ids, names)
        ]
    connection.commit()
    return new_athletes

In [34]:
@dataclasses.dataclass(frozen=True)
class Match:
    """Immutable description of one match and the athletes who took part.

    ``athlete_ids`` may be passed as any iterable of ids (a set, or the list
    produced by ``ARRAY_AGG``); it is stored as a ``frozenset`` so that the
    frozen dataclass is actually hashable and so that participant order does
    not affect equality.
    """

    # Year the match took place.
    year: int
    # Competition name.
    competition: str
    # How the match was decided.
    method: str
    # Stage of the competition.
    stage: str
    # Weight class.
    weight: str
    # Ids of the athletes who took part in the match.
    athlete_ids: typing.FrozenSet[int]

    def __post_init__(self):
        # frozen=True blocks normal assignment, so bypass it once during
        # construction to normalise the participants to a hashable frozenset.
        object.__setattr__(self, "athlete_ids", frozenset(self.athlete_ids))

In [38]:
def get_matches():
    """Fetch every stored match together with the ids of its participants.

    Returns:
        A list of ``Match`` objects built from
        ``(year, competition, method, stage, weight, athlete_ids)`` rows.
    """
    query = (
        "WITH cte AS ("
        " SELECT p.match_id, ARRAY_AGG(a.id) as athlete_ids"
        " FROM performance p"
        " JOIN athlete a ON a.id = p.athlete_id"
        " GROUP BY p.match_id"
        " )"
        "SELECT year, competition, method, stage, weight, athlete_ids"
        " FROM match m"
        " JOIN cte c on c.match_id = m.id;"
    )
    with connection.cursor() as cur:
        cur.execute(query)
        records = cur.fetchall()
    # Close the transaction that psycopg2 opened for the SELECT.
    connection.commit()
    # Each record has exactly the six selected columns, in Match field order.
    return [Match(*record) for record in records]

In [39]:
# Load every match already stored in the database.
matches = get_matches()

In [26]:
def create_matches(matches: typing.List[Match]) -> typing.List[typing.Tuple[int, Match]]:
    """Insert one ``match`` row per ``Match`` and return the new ids.

    Args:
        matches: Matches to insert; ``year``, ``competition``, ``method``,
            ``stage`` and ``weight`` are written, ``athlete_ids`` is not.

    Returns:
        A list of ``(id, match)`` tuples pairing each returned id with the
        corresponding input match. An empty ``matches`` list returns ``[]``
        without touching the database.
    """
    if not matches:
        # "VALUES" followed by nothing is a SQL syntax error.
        return []
    with connection.cursor() as cursor:
        # cursor.mogrify() returns bytes on Python 3, so join with a bytes
        # separator and keep the whole statement as bytes.
        values_sql = b",".join(
            cursor.mogrify(
                "(%s, %s, %s, %s, %s)", (m.year, m.competition, m.method, m.stage, m.weight)
            ) for m in matches
        )
        cursor.execute(
            b"INSERT INTO match (year, competition, method, stage, weight)"
            b" VALUES " + values_sql
            + b" RETURNING id"
        )
        ids = [row[0] for row in cursor.fetchall()]
        # Ids are paired with matches by position.
        # NOTE(review): assumes RETURNING yields rows in VALUES order — confirm.
        id_to_match = list(zip(ids, matches))
    connection.commit()
    return id_to_match

In [12]:
def match_already_exists(year, competition, method, stage, weight, competitor_ids: typing.List[int]):
    """Look for a stored match with these details fought by both competitors.

    Args:
        year, competition, method, stage, weight: Values compared for equality
            against the same-named ``match`` columns.
        competitor_ids: Athlete ids; the first two entries must both have a
            ``performance`` row for a candidate match to count.

    Returns:
        The id of the first matching ``match`` row, or ``False`` when there
        is none.
    """
    found_match_id = False
    with connection.cursor() as cursor:
        cursor.execute(
            (
                "SELECT m.id"
                " FROM match m"
                " WHERE year = %s"
                " AND competition = %s"
                " AND method = %s"
                " AND stage = %s"
                " AND weight = %s"
            ),
            (year, competition, method, stage, weight,),
        )
        candidate_ids = [row[0] for row in cursor.fetchall()]
        for match_id in candidate_ids:
            cursor.execute(
                (
                    "SELECT a.id"
                    " FROM athlete a"
                    " JOIN performance p ON p.athlete_id = a.id"
                    " JOIN match m ON p.match_id = m.id"
                    " WHERE m.id = %s"
                ),
                (match_id,),
            )
            athlete_ids = {row[0] for row in cursor.fetchall()}
            if competitor_ids[0] in athlete_ids and competitor_ids[1] in athlete_ids:
                found_match_id = match_id
                break
    # End the read transaction like the other helpers do; otherwise the shared
    # connection is left idle inside an open transaction after every lookup.
    connection.commit()
    return found_match_id

In [13]:
# Show the first sample URL.
test_set[0]

'https://www.bjjheroes.com/?p=425'

In [14]:
def scrape_matches(athlete_link):
    """Scrape one athlete's match table and store every match not yet in the database.

    For each table row the opponent is looked up (by link when the cell has
    one, otherwise by name) and created if missing. Rows whose match is
    already stored are skipped; otherwise a match row and one performance row
    per athlete are inserted.

    NOTE(review): ``get_athlete_by_url``, ``get_athlete_by_name``,
    ``create_athlete`` and ``create_match`` are not defined in the visible
    part of this notebook — confirm they exist before running.

    Args:
        athlete_link: URL of the athlete's page.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If a new row's result is not one of 'W', 'L' or 'D'.
    """
    print(f"scraping match data for {athlete_link}")
    current_athlete_id = get_athlete_by_url(athlete_link)
    # A timeout keeps one stalled request from hanging the whole scraping run.
    res = requests.get(athlete_link, timeout=30)
    res.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    bs = bs4.BeautifulSoup(res.content)
    table = bs.find("table", {"class": "table table-striped sort_table"})
    body = table.find("tbody") if table is not None else None
    if body is None:
        # Pages without a match table would otherwise crash with AttributeError.
        print(f"no match table found for {athlete_link}")
        return
    rows = body.find_all('tr')

    # Result seen from the opponent's side, keyed by the scraped athlete's result.
    opposite_result = {"W": "L", "L": "W", "D": "D"}

    for row in rows:
        match_details = row.find_all("td")

        opponent = match_details[1]
        result = match_details[2].text
        method = match_details[3].text
        competition = match_details[4].text
        weight = match_details[5].text
        stage = match_details[6].text
        year = match_details[7].text

        opponent_name = opponent.text
        opponent_link = opponent.find('a')
        if opponent_link:
            opponent_link = f"https://www.bjjheroes.com{opponent_link.get('href')}"

        # check if the opponent exists in the database
        if opponent_link:
            opponent_id = get_athlete_by_url(opponent_link)
        else:
            opponent_id = get_athlete_by_name(opponent_name)
        if opponent_id is None:
            print(f"new athlete found: {opponent_name}")
            opponent_id = create_athlete(opponent_name)

        # check if match exists in database
        match_id = match_already_exists(
            year,
            competition,
            method,
            stage,
            weight,
            [opponent_id, current_athlete_id]
        )
        if match_id:
            print(f"Match already found in database: {match_id}")
            continue

        # Validate before creating the match so a bad result cannot leave a
        # match row behind with no performance rows.
        if result not in opposite_result:
            raise ValueError(f"got result: {result}")

        match_id = create_match(year, competition, method, stage, weight)
        print(f"added match: {match_id}")
        # insert_performance_rows takes one list of Performance objects.
        insert_performance_rows([
            Performance(current_athlete_id, match_id, result),
            Performance(opponent_id, match_id, opposite_result[result]),
        ])

In [23]:
# Scratch copy of the fetch/parse half of scrape_matches(), run on one sample
# URL so the intermediate values can be inspected in the following cells.
# NOTE(review): get_athlete_by_url is not defined in the visible cells — confirm.
athlete_link = test_set[1]
print(f"scraping match data for {athlete_link}")
current_athlete_id = get_athlete_by_url(athlete_link)
res = requests.get(athlete_link)
bs = bs4.BeautifulSoup(res.content)
# Locate the table by its class attribute, then take every <tr> in its <tbody>.
table = bs.find("table", {"class": "table table-striped sort_table"})
body = table.find("tbody")
rows = body.find_all('tr')

scraping match data for https://www.bjjheroes.com/?p=144


In [26]:
# Take the first table row to step through the per-row logic by hand.
row = rows[0]

In [27]:
# Scratch copy of the per-row parsing in scrape_matches(): split the row into
# its <td> cells and read the fields by position.
match_details = row.find_all("td")

opponent = match_details[1]
result = match_details[2].text
method = match_details[3].text
competition = match_details[4].text
weight = match_details[5].text
stage = match_details[6].text
year = match_details[7].text

# The opponent cell only contains an <a> when a link is present; in that case
# its href is appended to the site root to form the opponent's URL.
opponent_name = opponent.text
opponent_link = opponent.find('a')
if opponent_link:
    opponent_link = f"https://www.bjjheroes.com{opponent_link.get('href')}"

In [28]:
# Inspect the opponent URL built in the previous cell.
opponent_link

'https://www.bjjheroes.com/?p=108'

In [None]:
# check if the opponent exists in the database
# Look up by URL when the row had a link, otherwise by name; create the
# athlete when neither lookup finds one.
# NOTE(review): get_athlete_by_url, get_athlete_by_name and create_athlete are
# not defined in the visible cells — confirm they exist before running.
if opponent_link:
    opponent_id = get_athlete_by_url(opponent_link)
else:
    opponent_id = get_athlete_by_name(opponent_name)
if opponent_id is None:
    print(f"new athlete found: {opponent_name}")
    opponent_id = create_athlete(opponent_name)

In [None]:
# check if match exists in database
match_id = match_already_exists(
    year,
    competition,
    method,
    stage,
    weight,
    [opponent_id, current_athlete_id]
)
if match_id:
    # This cell runs at top level, where `continue` is a SyntaxError, so the
    # "skip" case is expressed as an if/else instead.
    print(f"Match already found in database: {match_id}")
else:
    # Validate before creating the match so a bad result cannot leave a match
    # row behind with no performance rows.
    possible_results = ['W', 'L', 'D']
    if result not in possible_results:
        raise ValueError(f"got result: {result}")
    if result == "D":
        opponent_result = "D"
    elif result == "W":
        opponent_result = "L"
    else:
        opponent_result = "W"

    # NOTE(review): create_match is not defined in the visible cells — confirm.
    match_id = create_match(year, competition, method, stage, weight)
    print(f"added match: {match_id}")
    # insert_performance_rows takes one list of Performance objects.
    insert_performance_rows([
        Performance(current_athlete_id, match_id, result),
        Performance(opponent_id, match_id, opponent_result),
    ])

In [16]:
# Run the scraper over every URL in `existing_urls`.
# NOTE(review): `existing_urls` is not defined in any visible cell — confirm
# where it comes from; on a fresh kernel this loop would raise NameError.
for link in existing_urls:
    scrape_matches(link)

scraping match data for https://www.bjjheroes.com/?p=9246
Match already found in database: 270
Match already found in database: 271
Match already found in database: 272
Match already found in database: 273
Match already found in database: 274
Match already found in database: 275
Match already found in database: 276
Match already found in database: 277
Match already found in database: 278
Match already found in database: 279
Match already found in database: 280
Match already found in database: 281
Match already found in database: 282
Match already found in database: 283
Match already found in database: 284
Match already found in database: 285
Match already found in database: 286
Match already found in database: 287
Match already found in database: 288
Match already found in database: 289
Match already found in database: 290
Match already found in database: 291
Match already found in database: 292
Match already found in database: 293
Match already found in database: 294
Match already fou

OperationalError: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
