In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
problem = {}


In [3]:
import os
if not os.path.exists("pro_scrape_df.csv"):
    already_scraped = []
    ids = []
    roles = []
else:
    pro_scrape_df = pd.read_csv("pro_scrape_df.csv")
    already_scraped = list(pro_scrape_df["Official Summoner Name"])
    ids = list(pro_scrape_df["ids"])
    roles = list(pro_scrape_df["Roles"])

### Leaguepedia Scraper Class

In [4]:


class LeaguepediaScraper:
    """Class to scrape data from Leaguepedia website regarding professional LOL players"""
    
    def __init__(self):
        self.url_template = "https://lol.fandom.com/wiki/{page}"
    
    def get_page(self, page):
        """Gets the HTML of the page"""
        url_prefix = "lol.fandom.com/wiki/"
        #print("get_page page:",page)
        # Find the index of url_prefix in the page
        url_prefix_index = page.find(url_prefix)
        # If url_prefix is not found, use it as it is
        if url_prefix_index == -1:
            url = self.url_template.format(page=page)
        else:
            # If url_prefix is found, remove it
            url = self.url_template.format(page=page[url_prefix_index+len(url_prefix):])
        #print("get_page output:",str(url))
        response = requests.get(url)
        return BeautifulSoup(response.text, 'html.parser')

    def special_search(self, search_term:str) -> BeautifulSoup:
        """Searches for a player using the query in the search bar and returns the soup object of the page"""
        url = self.url_template.format(page=f"Special:Search?query={search_term}&scope=internal&navigationSearch=true")
        #print("Special Search:",search_term, url)
        response = requests.get(url)
        return BeautifulSoup(response.text, 'html.parser')
        
    
    def search_pro_page(self, summoner_name:str, given_name:str, family_name:str, region:str=None, soup:BeautifulSoup=None) -> BeautifulSoup:
        """Searches for a pro player's leaguepedia page using their summoner name, given name, and family name,
        or takes in an already found BeautifulSoup object and returns the soup object of the pro player's page

        Args:
            summoner_name (str): The pro player's summoner name
            given_name (str): The pro player's legal given/first name
            family_name (str): The pro player's legal family name
            region (str, optional): The pro player's contracted region. Defaults to None.
            soup (BeautifulSoup, optional): An optional argument for if you've already found something. Defaults to None.

        Returns:
            BeautifulSoup: A soup object of the pro player's page for later parsing
        """
        # First try the summoner name
        if soup == None:
            soup = self.get_page(summoner_name)
        page_type = self.check_page(soup)

        if page_type == "disambiguation" or page_type == "search":
            new_url = self.find_disambiguation(soup, summoner_name, given_name, family_name)
            # print(f"search pro page, page type {page_type}, url {new_url} ")
            return self.search_pro_page(summoner_name, given_name, family_name, region, self.get_page(new_url))
        
        elif page_type == "doesn't exist":
            # Try changing the summoner name to title case
            soup = self.get_page(summoner_name.title())
            
            if self.check_page(soup) != "article":
                search_soup = self.special_search(summoner_name)
                # Use the find_disambiguation function to find the correct link
                new_url = self.find_disambiguation(search_soup, summoner_name, given_name, family_name)
                
                # If the new_url is None, use the find_search function
                if new_url == None:
                    new_soup = self.special_search(f"{given_name}+{family_name}")
                    new_url = self.find_disambiguation(new_soup, summoner_name, given_name, family_name)
                
                #print("search pro page, article doesn't exist", page_type,new_url)
                return self.search_pro_page(summoner_name, given_name, family_name, region, self.get_page(new_url))
        page_type = self.check_page(soup)
        if page_type == "article":
            return soup
        # If not found, try the given name and family name
    
    def find_disambiguation(self, soup:BeautifulSoup, summoner_name:str, first_name:str, family_name:str, search:bool=False):
        """Given a soup object containing a disambiguation page, find the correct link

        Args:
            soup (BeautifulSoup): the soup object for the disambiguation page
            first_name (str): the player's first name
            family_name (str): the player's family name
        """
        first_name = first_name.replace("_", " ")
        family_name = family_name.replace("_", " ")
        #print("Find disambiguation: ",summoner_name, first_name, family_name)
        result = None
        for link in soup.find_all("a"):
            if "discord" in link.get('href').lower():
                continue
            # If the link contains summoner_name.title() or summoner_name.lower() and first_name or family_name
            # return the href
            tests = [summoner_name.title(), summoner_name.lower(), summoner_name.upper(), summoner_name]
            #print(f"Summoner name: {summoner_name}, link text: {link.text.strip()}")
            if any([s in link.text for s in tests]) or summoner_name.title() in link.text.title():
                if (first_name in link.text or family_name in link.text):
                    #print("Matching link: ",link.text)
                    # Return the href
                    result = link.get("href")[6:]
                    return result
                else:
                    # Print the contents of the link after "wiki/"
                    #print("Else block in find disambiguation")
                    if len(summoner_name) == len(link.get("href").split("wiki/")[-1]):
                        result = link.get("href")
        # If no exact match found, loop through again and look for similar matches
        for link in soup.find_all("a"):
            if first_name in link.text.split(" ") or family_name in link.text.split(" "):
                print("Found something similar", link.text)
            # If first name and family name both in the search result, return the href
            if first_name.lower() in link.text.lower() or family_name.lower() in link.text.lower():
                #print(f"Name in link text {link.text}")
                return link.get("href")[6:]
        return result
            
                
        
    def check_page(self, soup:BeautifulSoup) -> str:
        """A classification function that checks if the page is an article, a disambiguation page, or a "doesn't exist" page

        Args:
            soup (BeautifulSoup): A BS object of the page to be checked

        Returns:
            str: Returns "article", "disambiguation", or "doesn't exist" depending on certain key words in the page
        """
        # Check if the page is an article, a disambiguation page, or a "doesn't exist" page
        
        # If it's an article, return "article"
        article_text = "Background Information"
        
        # If it's a disambiguation page, return "disambiguation"
        disambiguation_text = "This disambiguation page lists articles associated with the same title."
        
        # If it's a "doesn't exist" page, return "doesn't exist"
        doesnt_exist_text = "There is currently no text in this page."
        
        # If it's a "Search" page, return "search"
        search_text = "Search Results"
        
        if doesnt_exist_text in soup.text:
            return "doesn't exist"
        elif disambiguation_text in soup.text:
            return "disambiguation"
        elif search_text in soup.text:
            return "search"
        elif article_text in soup.text:
            return "article" 
        
        # If it's something else, return "other"
        return "other"
    
    def parse_pro_soloq_ids(self, soup:BeautifulSoup, region:str=None) -> dict:
        """Parses the soup object of the pro player's page and returns their soloq ids as a dictionary

        Args:
            soup (BeautifulSoup): A soup object of the pro player's page
            region (str, optional): The region the player was contracted in. Defaults to None.

        Returns:
            dict: A dictionary with the region as keys and the ids as a list of strings
        """
        #print("Parsing ids...")
        # Go through the tables and look for <td class="infobox-label">Soloqueue IDs</td>
        tables = soup.find_all("table", class_="infobox")
        # If tables doesn't exist, return an empty dictionary
        if tables == [] or tables == None:
            return {}
        
        for table in tables:
            for row in table.find_all("tr"):
                # If row.text starts with "Soloqueue IDs", save the row
                search_string = "Soloqueue IDs"
                if row.text.startswith("Soloqueue IDs"):
                    break
                
        # Get the contents inside all the <b> tag in the rows
        server = row.find_all("b")
        # Iterate through all the server names, and get the text following them in the row variable
        ids = {}
        # Zip the two lists together into a dictionary
        #print(row)
        for s in server:
            #print(s.text)
            if ":" not in s.text and ":" in s.next_sibling:
                ids[s.text] = (s.next_sibling.split(":")[1].strip())
            elif len(s.text.split(":")[1]) > 0:
                ids[s.text] = s.next_sibling.strip().split(", ")
            
        if ids == {}:
            # If the ids are not found, try a different format bc leaguepedia has inconsistent formatting

            # Find the index of the search_string, and print everything after that
            search_string_index = row.text.find(search_string)
            
            # If search_string is not found, return an empty dictionary
            if search_string_index == -1:
                return {}
            
            # Split the string by comma
            raw_ids = row.text[search_string_index + len(search_string):].split(", ")

            # Iterate backwards through the list, and if the id doesn't contain a region parenthesis,
            # add the parenthesis from the previous id.
            # If it's the first id, add the region to the end of the id
            for i in range(len(raw_ids)-1, -1, -1):

                if "(" not in raw_ids[i]:
                    try:
                        # Find the index of the parenthesis in the previous id
                        previous_id = raw_ids[i+1]
                        parenthesis_index = previous_id.find("(")
                        # Add the previous id to the current id
                        previous_region = previous_id[parenthesis_index:]
                    except IndexError:
                        previous_region = f"({region})"
                    raw_ids[i] = raw_ids[i] + " " + previous_region

            # For each entry in raw_ids, split by parenthesis
            for raw_id in raw_ids:
                raw_id = raw_id.split("(")
                server = raw_id[1].strip(")")
                id = raw_id[0].strip()
                # Append the server: id to the ids dictionary
                if server in ids:
                    ids[server].append(id)
                else:
                    ids[server] = [id]
        return ids
    
    def get_pro_role(self, soup:BeautifulSoup) -> str:
        """Takes in the soup object of the pro player's page and returns their role
        
        Args: soup (BeautifulSoup): the soup object of the pro player's page
        
        Returns:
            str: the pro player's role
        """
        # Find all tables with the class "infobox"
        tables = soup.find_all("table", class_="infobox")
        # Locate the table with this attribute, then find the text of the next sibling
        for table in tables:
            for row in table.find_all("tr"):
                if row.text.startswith("Role"):
                    # Find the span class with class "sprite" and get its title
                    role = row.find("span", class_="sprite").get("title")
                    return role
        
    
    def get_pro_soloq_ids(self, summoner_name:str, first_name:str, family_name:str, region:str=None) -> dict:
        """Finds the pro player's soloq ids and return it 
        as a dictionary containing the region as keys and the ids as a list of strings

        Args:
            summoner_name (str): the pro player's summoner name
            first_name (str): the pro player's first/given name
            family_name (str): the pro player's family name
            region (str): the pro player's tournament region

        Returns:
            dict: A dictionary of the pro player's soloq ids with the region as keys and the ids as a list of strings
        """
        summoner_name = str(summoner_name)
        summoner_name = summoner_name.replace(" ", "_")
        first_name = first_name.replace(" ", "_")
        family_name = family_name.replace(" ", "_")
        soup = self.search_pro_page(str(summoner_name), first_name.title(), family_name.title())
        if soup == None:
            # If the soup returns None, try searching for the player's name instead
            query_soup = self.special_search(f"{first_name}+{family_name}")
            soup = self.search_pro_page(str(summoner_name), first_name.title(), family_name.title(), soup=query_soup)
            if soup == None:
                raise Exception("Soup is None")
        ids = {}
        role = (self.get_pro_role(soup))
        non_player_roles = ["manager", "analyst", "coach", "caoch", "media"]
        # If the role contains keywords from non_player_roles, return the role
        for r in non_player_roles:
            if r in role.lower():
                return {'ids':ids, 'Roles':role}
        ids = self.parse_pro_soloq_ids(soup, region)
        return {'ids':ids, 'Roles':role}

test = LeaguepediaScraper()

In [5]:
test.get_pro_soloq_ids("Kuma", "Bernardo", "Louzada", "BR")

{'ids': {}, 'Roles': 'Coach'}

In [6]:
solved_problems = []


In [7]:
all_problems = []

### Problem test

In [24]:
i = 0
while i < 100:
    i += 1
    for problem in all_problems:
        if problem not in solved_problems:
            print(problem)
            result = test.get_pro_soloq_ids(summoner_name=problem["OSN"],
                                first_name=problem["First Name"],
                                family_name=problem["Family Name"],
                                region=problem["Region"])
            solved_problems.append(problem)
            print("Problem solved")

{'OSN': 'kyehoo', 'First Name': 'YeaHoo', 'Family Name': 'Kang', 'Region': 'KR', 'Team': 'DRX'}


AttributeError: 'NoneType' object has no attribute 'lower'

In [9]:
# Reaad the RCD.xlsx file and save into pandas dataframe
dtypes = {"Official Summoner Name":str}
raw_data = pd.read_excel("RCD.xlsx", sheet_name=None, dtype=dtypes)

In [10]:
raw_df = pd.DataFrame()
for region in raw_data:
    sheet = raw_data[region]
    sheet["Official Summoner Name"] = sheet["Official Summoner Name"].astype(str)
    sheet = raw_data[region]
    sheet["Region"] = region
    raw_df = pd.concat([raw_df,sheet])

In [11]:
def change_data(df:pd.DataFrame, OSN:str, team:str, change_dict:dict) -> None:
    """Changes the data in the dataframe based on the OSN and team name

    Args:
        df (pd.DataFrame): The dataframe to be changed
        OSN (str): The official summoner name of the player
        team (str): The team the player is on
        change_dict (dict): a dictionary of the columns to be changed and the values to be changed to
    """
    if team == "None":
        index = df[(df["Official Summoner Name"] == OSN) & (df["Team"].isna())].index[0]
    else:
        # Get the index of the row with the OSN and team name
        index = df[(df["Official Summoner Name"] == OSN) & (df["Team"] == team)].index[0]
    # Change the data based on the change_dict
    for key, value in change_dict.items():
        df.loc[index, key] = value

In [28]:
df = raw_df.copy()
# Merge the Legal Given Name and Legal First Name column
df["Legal First Name"] = df["Legal Given Name"].fillna(df["Legal First Name"])

# Rename the "End Date (Month, Day, Year)" to just "End Date"
df = df.rename(columns={"End Date (Month, Day, Year)":"End Date"})
# Merge the "Main Role", "Position", and "Positon" columns
df["Roles"] = df["Main Role"].fillna(df["Position"]).fillna(df["Positon"])
df["Roles"] = df["Roles"].apply(lambda x: x.strip().title() if isinstance(x, str) else "NaN")
df = df.reset_index(drop=True)


In [29]:

# Manually change the roles of some players

# Change the role of "Freizer" from NaN to "Coach"
change_data(df, "Freizer", "KaBuM! Esports", {"Roles":"Coach"})

# Change Duall's Official Summoner Name to "DuaLL"
change_data(df, "Duall", "Guasones", {"Official Summoner Name":"DuaLL"})

# Change Beanovich's First Name to "Louis" since they mistyped it as "Loius"
change_data(df, "Beanovich", "REBELS", {"Official Summoner Name":"BEAN", "Legal First Name":"Louis"})

# Change OSN "Alvaromorata9"'s OSN to "Whyx"
change_data(df, "Alvaromorata9", "REBELS", {"Official Summoner Name":"Whyx"})

# Change OSN "Lee Sang" to "Lee sang"
change_data(df, "Lee Sang", "Komil & Friends", {"Official Summoner Name": "Lee sang"})
 
# Change OSN "Sh4dowUS" to "ShadowUS"
change_data(df, "Sh4dowus", "Grypciocraft Esports", {"Official Summoner Name": "ShadowUS"})
 
# Change OSN "Neon" Matus Jakubcik's full name to Matúš Jakubčík
change_data(df, "Neon", "Team Vitality", {"Legal First Name": "Matúš", "Legal Family Name": "Jakubčík"})

# Change "Kim DONGHUN"'s OSN to "3_4"
df.loc[df["Legal Family Name"] == "DONG HUN", "Official Summoner Name"] = "3_4"

# Change "Marky"'s legal family name to "Serrano"
change_data(df, "Marky", "Movistar Riders", {"Legal Family Name": "Serrano"})

# Change "BAO"'s legal family name to "Jeong" and first name to "Hyeon-woo"
change_data(df, "Bao", "Denizbank İstanbul Wildcats", {"Legal Family Name": "Jeong", "Legal First Name": "Hyeon-woo"})

# Change Jung Jeong Bin's family name to "Jeong" and first name to "Jong-bin"
change_data(df, "Max", "Hanwha Life Esports", {"Legal Family Name": "Jeong", "Legal First Name": "Jong-bin"})

# Change "Castle" Jo Hyeonseong's family name to "Cho" and first name to "Hyeon-seong"
change_data(df, "Castle", "kt Rolster", {"Legal Family Name": "Cho", "Legal First Name": "Hyeon-seong"})

# Change "FATE" Yoo Su-hyeok's family name to "Yoo" and first name to "Su-hyeok"
change_data(df, "FATE", "DRX", {"Legal Family Name": "Yoo", "Legal First Name": "Su-hyeok"})

change_data(df, "Calix", "Nongshim Redforce", {"Legal First Name": "Hyun-bin", "Legal Family Name": "Syun"})

change_data(df, "Falco", "INF", {"Legal First Name": "Jesús", "Legal Family Name": "Pérez"})

change_data(df, "Kun", "R7", {"Legal First Name": "Oh-seong", "Legal Family Name": "Kwon"})

change_data(df, "Taki", "None", {"Legal First Name": "Anh Tài", "Legal Family Name": "Đinh"})

change_data(df, "Artemis", "None", {"Legal First Name": "Quốc Hưng", "Legal Family Name": "Trần"})

change_data(df, "R4ven", "REBELS", {"Official Summoner Name": "R4VEN"})

change_data(df, "Katherine Pierce", "Alior Bank Team", {"Official Summoner Name":"Jacob"})

change_data(df, "Lequ", "Grypciocraft Esports", {"Official Summoner Name":"LeQu"})

change_data(df, "Howla", "Fukuoka Softbank HAWKS gaming", {"Official Summoner Name":"HowLa"})

change_data(df, "kyehoo", "DRX", {"Official Summoner Name":"kyeahoo"})

# Filter out all players whose "Roles" contains the word "Coach", "Manager", or "Analyst"
df = df[~df["Roles"].str.contains("Coach|Manager|Analyst|Caoch")]

roleDict = {"Toplaner": "Top", "Jungler": "Jungle", "Midlaner": "Mid", "Botlaner": "Bot", "Support": "Sup",
            "Jun": "Jungle", "Jug":"Jungle", "Jung": "Jungle",
            "Middle" : "Mid",
            "Adc": "Bot", "Bottom": "Bot"}
# If the role is in the roleDict, replace it with the value in the roleDict
df["Roles"] = df["Roles"].replace(roleDict)
df = df[["Region", "Team", "Official Summoner Name","Roles" ,"Legal First Name", "Legal Family Name", "End Date"]]

df = df.dropna(subset=["Official Summoner Name"])
df = df[df["Official Summoner Name"] != "nan"]

In [30]:
len(df), len(ids), len(roles), len(already_scraped)

(1350, 1336, 1336, 1336)

### Scraping the whole database

In [32]:
all_problems = []

In [33]:
import tqdm.notebook as tqdm
# Iterate through each row in the dataframe and get the soloq ids by calling the get_pro_soloq_ids function on the Summoner Name, Legal First Name, and Legal Family Name columns
for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    if row["Official Summoner Name"] in already_scraped:
        continue
    #print(row["Official Summoner Name"], row["Legal Family Name"], row["Legal First Name"])
    try:
        result = test.get_pro_soloq_ids(row["Official Summoner Name"], row["Legal First Name"], row["Legal Family Name"])
        ids.append(result['ids'])
        roles.append(result['Roles'])
        already_scraped.append(row["Official Summoner Name"])
        #print(row["Official Summoner Name"], row["Legal Family Name"], row["Legal First Name"], result['Roles'])
    except:
        print(row)
        problem = {"OSN": row["Official Summoner Name"],
                    "First Name": row["Legal First Name"],
                    "Family Name": row["Legal Family Name"],
                    "Region": row["Region"],
                    "Team": row["Team"]
                }
        print(f"Problem found:\n\n\n{problem}\n\n\n")
        all_problems.append(problem)


  0%|          | 0/1350 [00:00<?, ?it/s]

In [35]:
# Take already_scraped, ids, roles and add them to the dataframe
pro_scrape_df = pd.DataFrame({"Official Summoner Name":already_scraped, "ids":ids, "Roles":roles})
pro_scrape_df.to_csv("pro_scrape_df.csv", index=False)
pro_scrape_df

Unnamed: 0,Official Summoner Name,ids,Roles
0,Baldan,{},Top Laner
1,Forlin,{},Top Laner
2,Tay,"{'None': ['BR: 7ay', 'Esdeath', 'FPXzhaoTOP303...",Top Laner
3,Disamis,{},Jungler
4,Ancrath,{},Jungler
...,...,...,...
1332,Jacob,{},Jungler
1333,Krysia,{},Jungler
1334,LeQu,{},Top Laner
1335,HowLa,{'None': ['KR: rkljlama04']},Bot Laner


In [None]:
"""# Take the first 160 rows of the dataframe and append the ids and roles to the lists
test_df = df.head(160).copy()
test_df["Scrapped Roles"] = roles
test_df["Alts"] = ids
test_df
# Save test_df to a csv file named "test_alt_df.csv"
test_df.to_csv("test_alt_df.csv", index=False)"""

In [43]:
roleDict = {"Top Laner": "Top", "Jungler": "Jungle", "Mid Laner": "Mid", "Bot Laner": "Bot", "Support": "Sup"}

In [44]:
# Merge df with pro_scrape_df and call it pro_df
pro_df = df.merge(pro_scrape_df, on="Official Summoner Name", how="left", suffixes=(" (RCD)", " (LeaguePedia)"))
pro_df = pro_df[["Region", "Team", "Official Summoner Name", "ids", "Roles (LeaguePedia)", "Roles (RCD)"]]

# Filter the dataframe to only include players whose roles are in roleDict
pro_df = pro_df[pro_df["Roles (LeaguePedia)"].isin(roleDict.keys())]
pro_df["Roles (LeaguePedia)"] = pro_df["Roles (LeaguePedia)"].replace(roleDict)
pro_df

Unnamed: 0,Region,Team,Official Summoner Name,ids,Roles (LeaguePedia),Roles (RCD)
0,BR,FLUXO,Baldan,{},Top,
1,BR,FLUXO,Forlin,{},Top,
2,BR,FLUXO,Tay,"{'None': ['BR: 7ay', 'Esdeath', 'FPXzhaoTOP303...",Top,
3,BR,FLUXO,Disamis,{},Jungle,
4,BR,FLUXO,Ancrath,{},Jungle,
...,...,...,...,...,...,...
1345,VN,,Emo,{},Mid,Sub/Mid
1346,VN,,Nugu,"{'VN': '211221', 'KR': '우주를 줄게 1802'}",Mid,Mid
1347,VN,,Slowz,{},Top,Bot
1348,VN,,Zodiac,{'VN': ['Zodiac12']},Sup,Sup


In [46]:
if not os.path.exists("pro_df.csv"):
    pro_df.to_csv("pro_df.csv", index=False)