# Read HTML results and simulate the next match
### Needs a couple of HTML files from MCSL results

In [198]:
from bs4 import BeautifulSoup
import csv
import os
import glob
import pandas as pd

In [199]:
# Folder that holds the saved result pages, and the pages to simulate from.
htmlbasepath = "./data/result-simulate"
htmlfiles = [
    "FVvRV.html",
    "CGvPO.html",
    "SGvSB.html",
]

In [200]:
# Parse every result page listed in `htmlfiles` into a BeautifulSoup object.
soups = []
for htmlfile in htmlfiles:
    htmlpath = os.path.join(htmlbasepath, htmlfile)
    #print(htmlpath)
    # Read raw bytes rather than text: BeautifulSoup then works out the
    # document's encoding itself, instead of the result depending on the
    # platform's default text encoding.
    with open(htmlpath, "rb") as f:
        contents = f.read()
    # The file is already closed here; parsing only needs the bytes.
    soup = BeautifulSoup(contents, "lxml")
    soups.append(soup)

# print(soup.prettify())

In [201]:
# Flatten the first table of every parsed page into one list of result rows.
# Each row is [event, rank, <the cells that follow the rank cell>...].
record = []


def _is_rank_cell(text):
    """Return True when *text* has the shape of a cell that starts a new row.

    Accepted shapes: two characters ending in "." (such as "1."), three
    characters ending in "." (such as "10."), or four characters with "T" in
    the third position, which marks a tie.
    """
    return (
        (len(text) == 2 and text[1] == ".")
        or (len(text) == 3 and text[2] == ".")
        or (len(text) == 4 and text[2] == "T")
    )


for soup in soups:

    table = soup.find("table")

    # Keep the first text node of every cell; cells with no text are skipped.
    content = []
    for td in table.find_all("td"):
        text = td.find(string=True)
        if text is not None:
            content.append(str(text))

    if not content:
        # A table without any text cells has nothing to record.
        continue

    # Until an "Event..." cell is seen, rows are labelled with the first cell.
    event = content[0]
    result = []

    for line in content:
        if line.startswith("Event"):
            event = line
        elif _is_rank_cell(line):
            # A rank cell closes the row in progress and opens a new one.
            if result:
                record.append(result)
            result = [event, line]
        else:
            result.append(line)

    # Flush the row still being built when the cells run out. Without this the
    # last row of every file is thrown away when `result` is reset for the
    # next file.
    if result:
        record.append(result)



In [202]:
# Build the results table from the parsed rows.
# Parsed rows do not all have the same number of cells, and only the first
# five are used. Each row is trimmed or padded with None to exactly that
# width, so the constructor no longer depends on the widest row having
# exactly six cells (which is what the throwaway "extra" column absorbed).
columns = ["event", "rank", "swimmer", "seed", "final"]
width = len(columns)
rows = [(list(row) + [None] * width)[:width] for row in record]
df = pd.DataFrame(rows, columns=columns)

In [203]:
# Cut the "swimmer" text at its first two "(" characters, giving up to three
# pieces that become the name, age and team columns.
swimmer_parts = df["swimmer"].str.split(pat="(", n=2, expand=True)
df[["swimmer_name", "swimmer_age", "swimmer_team"]] = swimmer_parts

# The combined column is no longer needed once it has been split.
df.drop(columns=["swimmer"], inplace=True)

In [204]:
# Strip the ")" left over from the split out of the age and team columns.
# Rows where the piece is missing (None, e.g. relays) are left as they are.
def _without_closing_paren(value):
    """Return *value* with every ")" removed; None is passed through unchanged."""
    if value is None:
        return value
    return value.replace(")", "")


for column in ("swimmer_age", "swimmer_team"):
    df[column] = df[column].apply(_without_closing_paren)


In [205]:
# Final-time markers that mean there is no usable time for the row.
values_to_exclude = ["NT", "NS", "DQ", "DNF"]
# Teams that are left out of the simulation.
team_to_exclude = ["SB", "FV", "CG", "PO"]

# Rows without a team are dropped (relays, which have a blank name, end up here).
has_team = df["swimmer_team"].notna()
has_time = ~df["final"].isin(values_to_exclude)
is_simulated = ~df["swimmer_team"].isin(team_to_exclude)

# Keep only the rows that pass all three checks.
df = df[has_team & has_time & is_simulated]


In [206]:
# Function to convert time to seconds for sorting
def time_to_seconds(time):
    """Convert a time string to a number of seconds.

    Accepts plain seconds ("59.25"), "M:SS.ss" and, more generally, any number
    of colon-separated fields where each field to the left is worth 60 times
    the one on its right ("H:MM:SS.ss").

    Args:
        time: The time as text. Every field before the last must be an integer;
            the last field may carry a fractional part.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If a field cannot be parsed as a number.
    """
    *leading, seconds = time.split(":")
    whole = 0
    for field in leading:
        # Each step to the right is one unit smaller (hours -> minutes -> seconds).
        whole = whole * 60 + int(field)
    return whole * 60 + float(seconds)

In [207]:
#this cell is to prep for score assignment

# Convert time to seconds. `assign` returns a new frame, so the columns added
# below are not written onto a filtered view of an earlier frame.
df = df.assign(TimeInSeconds=df["final"].apply(time_to_seconds))

# Sort by event and time
df = df.sort_values(["event", "TimeInSeconds"])

# Finishing position within each event. method="first" gives every swimmer a
# whole-number place; the default ("average") gives tied swimmers fractional
# ranks such as 1.5, which are not in `scores` and would earn 0 points.
df["Rank"] = df.groupby("event")["TimeInSeconds"].rank(method="first")

# Scoring system
scores = {1.0: 6, 2.0: 4, 3.0: 3, 4.0: 2, 5.0: 1}

# Assign scores based on ranks; places outside the table earn nothing.
df["Score"] = df["Rank"].map(scores).fillna(0)

# Swimmers with the same time in an event share the points of the places they
# occupy between them (e.g. a tie for 1st gives (6 + 4) / 2 = 5 each).
# NOTE(review): confirm this matches the league's tie rule.
df["Score"] = df.groupby(["event", "TimeInSeconds"])["Score"].transform("mean")

# Drop auxiliary columns
df = df.drop(["TimeInSeconds", "Rank"], axis=1)

In [208]:
# Save the scored per-swimmer results, without the index column.
df.to_csv(path_or_buf="simulate.csv", index=False)

In [209]:
# Add up the "Score" column separately for every team.
score_by_team = df.groupby("swimmer_team")["Score"]
team_scores = score_by_team.sum()

# Write the per-team totals to their own CSV file.
team_scores.to_csv(path_or_buf="team_scores.csv")