# Processing MCSL summer swimming result

In [140]:
from bs4 import BeautifulSoup
import csv
import os
import pandas as pd
import re

In [141]:
#file/folder
htmlbasepath = "./data/2024/week1"
filename_html = 'MS-RC.html'
filename_output = filename_html.replace('.html','.csv')

htmlpath = os.path.join(htmlbasepath, f'{filename_html}')
# print(path)

In [142]:
htmlfile_list = htmlbasepath.split('/')
year = htmlfile_list[2]
week = htmlfile_list[3]

In [143]:
with open(htmlpath, 'r') as f:
    contents = f.read()
    soup = BeautifulSoup(contents, 'lxml')

# print(soup.prettify())

In [144]:
# Find all <h4> tags (event names)
event_tags = soup.find_all('h4')

# Initialize an empty list to store event data
events = []

# Loop through each event tag
for event_tag in event_tags:
    event_name = event_tag.text.strip()

    # Find the corresponding table for each event
    table = event_tag.find_next('table')
    rows = table.find_all('tr')

    # Initialize an empty list for this event's data
    event_data = []

 # Loop through rows (skip the header row)
    for row in rows[1:]:
        cells = row.find_all('td')
        if cells:
            position, name, seed_time, final_time, points = [cell.text.strip() for cell in cells]
            event_data.append({
                'position': position,
                'name': name,
                'seed_time': seed_time,
                'final_time': final_time,
                'points': points
            })

    # Append the event data to the list
    events.append({'event_name': event_name, 'data': event_data})

# Create new list for dataframe
data_list=[]
for event in events:
    event_name = event['event_name']
    # print(f"Event Name: {event['event_name']}"
    for data in event['data']:
        # print(f"year: {year}, week: {week}, event: {event_name}, rank: {data['position']}, swimmer: {data['name']}, seed: {data['seed_time']}, final: {data['final_time']}, point: {data['points']}")
        data_list.append({
                    'year': year,
                    'week': week,
                    'event': event_name,
                    'rank': data['position'],
                    'swimmer': data['name'],
                    'seed': data['seed_time'],
                    'final': data['final_time'],
                    'point': data['points']
                })


In [145]:
# df = pd.DataFrame(individual_record, columns =['year', 'week', 'event', 'rank', 'swimmer', 'seed', 'final']) 
df = pd.DataFrame(data_list)

In [146]:
df.to_csv('test.csv')

In [147]:
df.head()

Unnamed: 0,year,week,event,rank,swimmer,seed,final,point
0,2024,week1,Event 1 - Male 12&U 100M Medley,1,"Ahuja, Riaan (12)(MS)",NT,1:09.62,6
1,2024,week1,Event 1 - Male 12&U 100M Medley,2,"Kimpel, Ross (12)(RC)",1:25.45,1:24.03,4
2,2024,week1,Event 1 - Male 12&U 100M Medley,3,"Mendizabal, Kai (11)(RC)",1:27.06,1:25.62,3
3,2024,week1,Event 1 - Male 12&U 100M Medley,4,"Carare, Eli C (12)(RC)",1:32.69,1:28.69,2
4,2024,week1,Event 1 - Male 12&U 100M Medley,5,"Rosenbaum, Ryan (11)(MS)",1:36.28,1:38.29,1


In [148]:
#special handling for swimmer with () on its name
df['swimmer'] = df['swimmer'].apply(lambda x:x.replace('(Dan)', '- Dan'))
df['swimmer'] = df['swimmer'].apply(lambda x:x.replace('(Ben)', '- Ben'))
df['swimmer'] = df['swimmer'].apply(lambda x:x.replace('(Jojo)', '- Jojo'))
df['swimmer'] = df['swimmer'].apply(lambda x:x.replace('(Jorie)', '- Jorie'))


#splitting swimmer colunn into 3
df[['swimmer_name', 'swimmer_age', 'swimmer_team']] = df['swimmer'].str.split(pat='(', expand=True, n=2)

# #dropping progoma; swimmer column
# del df['swimmer']

In [149]:
#get event number
df['event_no'] = df['event'].apply(lambda x:x.split('-')[0].replace('Event ', ''))
#remove relay events (27,28,49,50) , i am using space at the end cause data has it
df = df[~df['event_no'].isin(['27 ','28 ' ,'49 ','50 '])]

In [150]:
# #replace function
# def replace_enhanced(x):
#     if x is None:
#         return(x)
#     else:
#         x.replace(')','')
#         return(x)
    
#remove ')' from age and team
df['swimmer_age'] = df['swimmer_age'].apply(lambda x:x.replace(')', ''))
df['swimmer_team'] = df['swimmer_team'].apply(lambda x:x.replace(')', ''))


In [151]:
# #add seconds for seed and final
# x = '34.88'
# x_list = x.split(':')

# time_list = x.split(':')
# seconds = float(time_list[0])*60 + float(time_list[1])
# print(seconds)


def x_to_seconds(x):
    x = x.replace('X', '') #replacing exibition time
    x_list = x.split(':')
    if x in ('NT', 'NS', 'DQ', 'DNF'):  # ignore NT, NS, DQ, and DNF
        return(x)
    elif len(x_list) == 1: #only seconds no minutes
         return(float(x_list[0]))
    else:
        return(float(x_list[0])*60 + float(x_list[1]))
# x_to_seconds(x)


df['seed_seconds'] = df['seed'].apply(lambda x:x_to_seconds(x))
df['final_seconds'] = df['final'].apply(lambda x:x_to_seconds(x))



In [152]:
df.tail(10).T

Unnamed: 0,321,322,323,324,325,326,327,328,329,330
year,2024,2024,2024,2024,2024,2024,2024,2024,2024,2024
week,week1,week1,week1,week1,week1,week1,week1,week1,week1,week1
event,Event 47 - Male 15-18 50M Butterfly,Event 47 - Male 15-18 50M Butterfly,Event 47 - Male 15-18 50M Butterfly,Event 47 - Male 15-18 50M Butterfly,Event 48 - Female 15-18 50M Butterfly,Event 48 - Female 15-18 50M Butterfly,Event 48 - Female 15-18 50M Butterfly,Event 48 - Female 15-18 50M Butterfly,Event 48 - Female 15-18 50M Butterfly,Event 48 - Female 15-18 50M Butterfly
rank,2,3,4,5,1,2,3,4,5,6
swimmer,"Jacobs, Charles J (18)(RC)","McKenna, Pierce (15)(RC)","Bian, Andrew (16)(MS)","Linares, Alexander (17)(MS)","Scofield, Campbell (17)(MS)","Somerville, Ella G (17)(MS)","Kirsch, Morgan (17)(RC)","Crane, Zoe (18)(RC)","Williams, Cora (18)(MS)","Moffitt, Avery Clare (16)(RC)"
seed,29.40,29.83,NT,NT,NT,36.50,37.66,38.08,37.90,NT
final,28.97,29.62,33.44,34.40,30.38,35.69,37.18,37.34,37.81,39.71
point,4,3,2,1,6,4,3,2,1,0
swimmer_name,"Jacobs, Charles J","McKenna, Pierce","Bian, Andrew","Linares, Alexander","Scofield, Campbell","Somerville, Ella G","Kirsch, Morgan","Crane, Zoe","Williams, Cora","Moffitt, Avery Clare"
swimmer_age,18,15,16,17,17,17,17,18,18,16


In [153]:
df.tail()

Unnamed: 0,year,week,event,rank,swimmer,seed,final,point,swimmer_name,swimmer_age,swimmer_team,event_no,seed_seconds,final_seconds
326,2024,week1,Event 48 - Female 15-18 50M Butterfly,2,"Somerville, Ella G (17)(MS)",36.50,35.69,4,"Somerville, Ella G",17,MS,48,36.5,35.69
327,2024,week1,Event 48 - Female 15-18 50M Butterfly,3,"Kirsch, Morgan (17)(RC)",37.66,37.18,3,"Kirsch, Morgan",17,RC,48,37.66,37.18
328,2024,week1,Event 48 - Female 15-18 50M Butterfly,4,"Crane, Zoe (18)(RC)",38.08,37.34,2,"Crane, Zoe",18,RC,48,38.08,37.34
329,2024,week1,Event 48 - Female 15-18 50M Butterfly,5,"Williams, Cora (18)(MS)",37.90,37.81,1,"Williams, Cora",18,MS,48,37.9,37.81
330,2024,week1,Event 48 - Female 15-18 50M Butterfly,6,"Moffitt, Avery Clare (16)(RC)",NT,39.71,0,"Moffitt, Avery Clare",16,RC,48,NT,39.71


In [157]:
#export to csv with | delimited
df.to_csv(filename_output,sep='|', index=False, columns=['year','week','event','rank','swimmer','seed','final','swimmer_name','swimmer_age','swimmer_team','seed_seconds','final_seconds','event_no']) # Use pipe to seperate data

In [155]:
#counting swimmer by team/age
df.groupby(['swimmer_age', 'swimmer_team'])['swimmer_name'].count()

swimmer_age  swimmer_team
10           MS              11
             RC              15
11           MS              26
             RC              15
12           MS              14
             RC              23
13           MS              19
             RC              20
14           MS              11
             RC              13
15           MS               9
             RC               9
16           MS               5
             RC              10
17           MS              12
             RC               9
18           MS               4
             RC               8
5            MS               1
7            MS               7
             RC              13
8            MS              21
             RC              17
9            MS              16
             RC              15
Name: swimmer_name, dtype: int64