In [220]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

class LeaguepediaScraper:
    """Scrape Leaguepedia (lol.fandom.com) pages of professional LoL players.

    The usual entry point is :meth:`get_pro_soloq_ids`, which locates a
    player's article (following disambiguation pages and falling back to the
    site search) and reads the role and solo-queue account names from the
    page's infobox.
    """

    # Seconds to wait for a response before requests gives up.
    REQUEST_TIMEOUT = 30
    # Maximum number of disambiguation/search hops while looking for an article.
    # Without a bound the lookup can loop forever when a link leads back to a
    # page that was already visited.
    MAX_LOOKUP_HOPS = 5

    def __init__(self):
        self.url_template = "https://lol.fandom.com/wiki/{page}"

    def _page_url(self, page):
        """Build the absolute URL for a page title, a "/wiki/..." link or a full URL."""
        if not isinstance(page, str) or not page.strip():
            raise ValueError(f"page must be a non-empty string, got {page!r}")
        page = page.strip()
        url_prefix = "lol.fandom.com/wiki/"
        url_prefix_index = page.find(url_prefix)
        if url_prefix_index != -1:
            # Full URL: keep only the part after the site prefix
            page = page[url_prefix_index + len(url_prefix):]
        elif page.startswith("/wiki/"):
            # Site-relative link as found in href attributes
            page = page[len("/wiki/"):]
        return self.url_template.format(page=page)

    def get_page(self, page):
        """Download a wiki page and return it parsed as BeautifulSoup.

        Args:
            page (str): a page title ("Faker"), a site-relative link
                ("/wiki/Faker") or a full URL containing "lol.fandom.com/wiki/".

        Returns:
            BeautifulSoup: the parsed HTML of the page.

        Raises:
            ValueError: if ``page`` is not a non-empty string.
            requests.RequestException: on network failure or timeout.
        """
        response = requests.get(self._page_url(page), timeout=self.REQUEST_TIMEOUT)
        # The HTTP status is deliberately not checked: check_page() classifies
        # pages by their body text (including the "no text in this page"
        # placeholder), so the body is parsed whatever the status code is.
        return BeautifulSoup(response.text, 'html.parser')

    def special_search(self, search_term: str) -> "BeautifulSoup":
        """Run the site search for ``search_term`` and return the parsed result page.

        The query is passed through ``params`` so that requests URL-encodes
        names containing spaces, "&", "#" and similar characters.
        """
        url = self.url_template.format(page="Special:Search")
        params = {"query": search_term, "scope": "internal", "navigationSearch": "true"}
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'html.parser')

    def search_pro_page(self, summoner_name: str, given_name: str, family_name: str,
                        region: str = None, soup: "BeautifulSoup" = None):
        """Find the article of a professional player.

        Starts from the page named after the summoner name (or from ``soup``
        when given). Disambiguation pages are resolved with
        :meth:`find_disambiguation`; missing pages are retried in title case
        and then through the site search.

        Args:
            summoner_name (str): the player's summoner name
            given_name (str): the player's given name
            family_name (str): the player's family name
            region (str): currently unused; kept for interface compatibility
            soup (BeautifulSoup): an already downloaded starting page

        Returns:
            BeautifulSoup | None: the article page, or None when no article
            could be found within ``MAX_LOOKUP_HOPS`` hops.
        """
        if soup is None:
            soup = self.get_page(summoner_name)

        for _ in range(self.MAX_LOOKUP_HOPS):
            page_type = self.check_page(soup)
            if page_type == "article":
                return soup

            if page_type == "disambiguation":
                link_source = soup
            elif page_type == "doesn't exist":
                # Try changing the summoner name to title case
                title_case_soup = self.get_page(summoner_name.title())
                if self.check_page(title_case_soup) == "article":
                    return title_case_soup
                # Otherwise look for a matching link in the search results
                link_source = self.special_search(summoner_name)
            else:
                # Neither an article nor something that can be followed
                return None

            new_url = self.find_disambiguation(link_source, summoner_name, given_name, family_name)
            if new_url is None:
                return None
            soup = self.get_page(new_url)

        return None

    def find_disambiguation(self, soup: "BeautifulSoup", summoner_name: str, first_name: str,
                            family_name: str, search: bool = False):
        """Pick the link to the player's page from a disambiguation or search page.

        Only wiki links whose text contains the summoner name (as given, or in
        title/lower/upper case) are considered. A link whose text also contains
        the first or family name is preferred; otherwise a link whose target
        page is the summoner name itself (case-insensitive) is used. Within
        each kind the last matching link on the page wins.

        Args:
            soup (BeautifulSoup): the soup object for the disambiguation page
            summoner_name (str): the player's summoner name
            first_name (str): the player's first name
            family_name (str): the player's family name
            search (bool): unused; kept for interface compatibility

        Returns:
            str | None: the page title (the part of the href after "wiki/"),
            or None when no link matches.
        """
        name_variants = {summoner_name, summoner_name.title(), summoner_name.lower(), summoner_name.upper()}
        # Empty or non-string names would match every link (or raise), so drop them
        legal_names = [name for name in (first_name, family_name) if isinstance(name, str) and name]
        wanted_title = summoner_name.replace(" ", "_").casefold()

        name_match = None
        title_match = None
        for link in soup.find_all("a"):
            href = link.get("href")
            # Skip anchors without a target and links that leave the wiki
            if not href or "wiki/" not in href:
                continue
            link_text = link.text
            if not any(variant in link_text for variant in name_variants):
                continue
            target = href.split("wiki/")[-1]
            if any(name in link_text for name in legal_names):
                name_match = target
            elif target.replace(" ", "_").casefold() == wanted_title:
                title_match = target

        return name_match if name_match is not None else title_match

    def check_page(self, soup):
        """Classify a page by the marker text it contains.

        Returns:
            str: "doesn't exist", "disambiguation", "article" or "other",
            checked in that order.
        """
        page_text = soup.text  # extracting the text is costly, so do it once

        if "There is currently no text in this page." in page_text:
            return "doesn't exist"
        if "This disambiguation page lists articles associated with the same title." in page_text:
            return "disambiguation"
        if "Background Information" in page_text:
            return "article"
        # If it's something else, return "other"
        return "other"

    @staticmethod
    def _find_infobox_row(soup, label):
        """Return the first infobox <tr> whose text starts with ``label``, or None."""
        for table in soup.find_all("table", class_="infobox"):
            for row in table.find_all("tr"):
                if row.text.lstrip().startswith(label):
                    return row
        return None

    @staticmethod
    def _clean_legal_name(value):
        """Title-case a legal name; missing values (None, NaN, ...) become ""."""
        return value.strip().title() if isinstance(value, str) else ""

    def parse_pro_soloq_ids(self, soup: "BeautifulSoup", region: str = None):
        """Extract the solo-queue account names from the "Soloqueue IDs" infobox row.

        Two layouts are understood:

        * bold server names, each followed by a comma separated list of ids;
        * plain text such as "id1, id2 (EUW), id3 (KR)", where an id without
          a server takes the server of the next id that has one, and
          trailing ids without any server are filed under ``region``.

        Args:
            soup (BeautifulSoup): the player's article
            region (str): fallback server for ids that have none; its string
                form is used as the key (so None becomes "None")

        Returns:
            dict: server name -> list of ids; empty when the page has no
            "Soloqueue IDs" row.
        """
        search_string = "Soloqueue IDs"
        row = self._find_infobox_row(soup, search_string)
        if row is None:
            return {}

        ids = {}
        for server_tag in row.find_all("b"):
            following = server_tag.next_sibling
            # The ids are expected as plain text right after the <b> tag
            if not isinstance(following, str):
                continue
            names = [name.strip() for name in following.split(",")]
            names = [name for name in names if name]
            if names:
                ids.setdefault(server_tag.text.strip(), []).extend(names)
        if ids:
            return ids

        # Leaguepedia formatting is inconsistent: fall back to the row's plain text
        label_end = row.text.find(search_string) + len(search_string)
        entries = [entry.strip() for entry in row.text[label_end:].split(",")]

        # Walk backwards so an id without "(server)" inherits the server of the id after it
        server = str(region)
        parsed = []
        for entry in reversed(entries):
            if "(" in entry:
                name, _, tail = entry.partition("(")
                server = tail.split(")")[0].strip()
            else:
                name = entry
            name = name.strip()
            if name:
                parsed.append((server, name))

        # Restore the order in which the ids appear on the page
        for entry_server, name in reversed(parsed):
            ids.setdefault(entry_server, []).append(name)
        return ids

    def get_pro_role(self, soup: "BeautifulSoup") -> str:
        """Takes in the soup object of the pro player's page and returns their role

        Args: soup (BeautifulSoup): the soup object of the pro player's page

        Returns:
            str: the title of the sprite in the infobox "Role" row, or None
            when the page has no such row or the row has no sprite
        """
        row = self._find_infobox_row(soup, "Role")
        if row is None:
            return None
        sprite = row.find("span", class_="sprite")
        if sprite is None:
            return None
        return sprite.get("title")

    def get_pro_soloq_ids(self, summoner_name: str, first_name: str, family_name: str, region: str = None) -> dict:
        """Finds the pro player's soloq ids and return it
        as a dictionary containing the region as keys and the ids as a list of strings

        Args:
            summoner_name (str): the pro player's summoner name
            first_name (str): the pro player's first/given name
            family_name (str): the pro player's family name
            region (str): the pro player's tournament region

        Returns:
            dict: ``{'ids': {server: [id, ...]}, 'Roles': role}``. ``ids`` is
            left empty for staff roles (coach, manager, analyst, media);
            ``role`` is None when the page does not state one.

        Raises:
            Exception: if no article could be found for the player.
        """
        soup = self.search_pro_page(
            str(summoner_name),
            self._clean_legal_name(first_name),
            self._clean_legal_name(family_name),
            region,
        )
        if soup is None:
            raise Exception("Soup is None")

        role = self.get_pro_role(soup)
        role_text = (role or "").lower()
        # "caoch" covers a known misspelling of "coach"
        non_player_roles = ["manager", "analyst", "coach", "caoch", "media"]
        if any(keyword in role_text for keyword in non_player_roles):
            # Staff members have no solo-queue ids worth collecting
            return {'ids': {}, 'Roles': role}
        return {'ids': self.parse_pro_soloq_ids(soup, region), 'Roles': role}

# Scraper instance shared by the debugging cell and the scraping loop below
test = LeaguepediaScraper()

In [221]:
# Re-run the scraper on the row that made the scraping loop fail.
# NOTE: `problem` is only assigned by the error handler of the scraping loop further down,
# so this cell cannot run on a fresh kernel before that loop has failed once.
# `problem` is ordered [summoner name, family name, first name, region], hence the keywords.
print(problem)
test.get_pro_soloq_ids(problem[0], first_name=problem[2], family_name=problem[1], region=problem[3])

['R4ven', 'Domagalski', 'Milosz', 'EMEA']


{'ids': {}, 'Roles': 'Top Laner'}

In [None]:
# Read the RCD.xlsx file; with sheet_name=None pandas returns a dict of DataFrames, one per sheet
# NOTE(review): the next cell reads `raw_df`, which no visible cell defines — presumably it is
# built from `raw_data` in a cell that was deleted; confirm and restore it so Run All works.
raw_data = pd.read_excel("RCD.xlsx", sheet_name=None)

In [223]:
# Work on a copy so re-running this cell always starts from the untouched raw frame
df = raw_df.copy()
# (Disabled) merge the "Legal Given Name" column into "Legal First Name"
#df["Legal First Name"] = df["Legal Given Name"].fillna(df["Legal First Name"])

# Rename the "End Date (Month, Day, Year)" to just "End Date"
df = df.rename(columns={"End Date (Month, Day, Year)":"End Date"})
# Merge the "Main Role", "Position", and (misspelled) "Positon" columns into one "Roles" column
df["Roles"] = df["Main Role"].fillna(df["Position"]).fillna(df["Positon"])
# Normalise whitespace and casing; missing roles become the string "NaN" so the .str filter below works
df["Roles"] = df["Roles"].apply(lambda x: x.strip().title() if isinstance(x, str) else "NaN")

# Manually change the roles of some players

# Change the role of "Freizer" from NaN to "Coach"
df.loc[df["Official Summoner Name"] == "Freizer", "Roles"] = "Coach"

# Change Duall's Official Summoner Name to "DuaLL"
df.loc[df["Official Summoner Name"] == "Duall", "Official Summoner Name"] = "DuaLL"

# Change Beanovich's First Name to "Louis" since it was mistyped as "Loius"
df.loc[df["Official Summoner Name"] == "Beanovich", "Legal First Name"] = "Louis"

# Filter out all staff: "Roles" containing "Coach", "Manager" or "Analyst" ("Caoch" is a known typo).
# .copy() makes the result an independent frame, so the "Roles" assignment below
# does not write into a filtered slice (SettingWithCopyWarning).
df = df[~df["Roles"].str.contains("Coach|Manager|Analyst|Caoch")].copy()

roleDict = {"Toplaner": "Top", "Jungler": "Jungle", "Midlaner": "Mid", "Botlaner": "Bot", "Support": "Sup",
            "Jun": "Jungle", "Jug":"Jungle", "Jung": "Jungle",
            "Middle" : "Mid",
            "Adc": "Bot", "Bottom": "Bot"}
# If the role is in the roleDict, replace it with the value in the roleDict (whole-value matches only)
df["Roles"] = df["Roles"].replace(roleDict)
df = df[["Region", "Team", "Official Summoner Name","Roles" ,"Legal First Name", "Legal Family Name", "End Date"]]

# Rows without a summoner name cannot be looked up on Leaguepedia
df = df.dropna(subset=["Official Summoner Name"])
df

Unnamed: 0,Region,Team,Official Summoner Name,Roles,Legal First Name,Legal Family Name,End Date
0,BR,FLUXO,Baldan,,Giovani,Baldan,2024-11-18 00:00:00
1,BR,FLUXO,Forlin,,Leonardo,Pereira,2024-11-18 00:00:00
2,BR,FLUXO,Tay,,Rodrigo,Panisa,2023-11-20 00:00:00
3,BR,FLUXO,Disamis,,Pedro,Cavalcante,2024-11-18 00:00:00
4,BR,FLUXO,Ancrath,,Rodrigo,Montrezol,2024-11-18 00:00:00
...,...,...,...,...,...,...,...
84,VN,,Emo,Sub/Mid,Vinh,Nguyen Thai,2026-11-16 00:00:00
85,VN,,Nugu,Mid,Dat,Tran Quoc,2026-11-16 00:00:00
86,VN,,Slowz,Bot,Hung,Nguyen Huy,2026-11-16 00:00:00
87,VN,,Zodiac,Sup,Luong,Tieu Quoc,2026-11-16 00:00:00


In [129]:
import ast
import os


def parse_cached_ids(value):
    """Turn an ``ids`` cell read back from pro_scrape_df.csv into a dict again.

    The CSV stores each ids dict as its text representation, so after
    ``pd.read_csv`` the column holds strings. Values that are not the text of
    a dict (already a dict, missing, unparsable) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value
    return parsed if isinstance(parsed, dict) else value


# Resume from the cache written by the last cell of the notebook, if there is one
if not os.path.exists("pro_scrape_df.csv"):
    already_scraped = []
    ids = []
    roles = []
else:
    pro_scrape_df = pd.read_csv("pro_scrape_df.csv")
    already_scraped = list(pro_scrape_df["Official Summoner Name"])
    # Parse the stored text back into dicts so cached and newly scraped entries have the same type
    ids = [parse_cached_ids(value) for value in pro_scrape_df["ids"]]
    roles = list(pro_scrape_df["Roles"])

In [None]:
# Players that still have to be scraped: summoner name not in the cache yet
df.loc[~df["Official Summoner Name"].isin(already_scraped)]

In [157]:
# Size of the roster next to the sizes of the three cache lists
tuple(len(collection) for collection in (df, ids, roles, already_scraped))

(1350, 160, 160, 160)

In [165]:
"""# Take the first 160 rows of the dataframe and append the ids and roles to the lists
test_df = df.head(160).copy()
test_df["Scrapped Roles"] = roles
test_df["Alts"] = ids
test_df
# Save test_df to a csv file named "test_alt_df.csv"
test_df.to_csv("test_alt_df.csv", index=False)"""

In [224]:
import tqdm.notebook as tqdm

# Scrape the soloq ids and role of every player that is not in the cache yet.
# Results are appended to `ids`, `roles` and `already_scraped` in step, so the
# save cell below can persist whatever was collected, even after a failure.
for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    # Skip by name only. The index labels of `df` are not unique positions
    # (1350 rows, yet the last label is 87), so comparing `index` with the number
    # of cached players would wrongly skip rows that were never scraped.
    if row["Official Summoner Name"] in already_scraped:
        continue
    try:
        result = test.get_pro_soloq_ids(
            row["Official Summoner Name"],
            row["Legal First Name"],
            row["Legal Family Name"],
            row["Region"],  # fallback server for ids listed without one
        )
    except Exception as error:
        # Keep the failing row for the debugging cell: [summoner name, family name, first name, region]
        print(row)
        problem = [row["Official Summoner Name"], row["Legal Family Name"], row["Legal First Name"], row["Region"]]
        print(problem)
        raise Exception("Error") from error
    ids.append(result['ids'])
    roles.append(result['Roles'])
    already_scraped.append(row["Official Summoner Name"])


  0%|          | 0/1350 [00:00<?, ?it/s]

Region                              EMEA
Team                              REBELS
Official Summoner Name         Beanovich
Roles                                Bot
Legal First Name                   Louis
Legal Family Name         Joscha Schmitz
End Date                             NaT
Name: 355, dtype: object
['Beanovich', 'Joscha Schmitz', 'Louis', 'EMEA']


Exception: Error

In [None]:
# Persist the scraping progress: one row per scraped player, read back by the cache-loading cell
pro_scrape_df = pd.DataFrame(
    dict(zip(("Official Summoner Name", "ids", "Roles"), (already_scraped, ids, roles)))
)
pro_scrape_df.to_csv("pro_scrape_df.csv", index=False)