In [2]:
import csv
import os
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Using Selenium & Beautiful Soup

In [7]:
sports = ['football']
#years = ['2022','2023','2024','2025','2026']
years = ['2024']

# Hard cap on "Load More" clicks. For some sport/year pages the button does not
# disappear once everything is loaded, so the click loop needs its own stop.
# NOTE(review): an earlier cap of 19 clicks reportedly topped out at ~1000
# athletes; confirm how many athletes 60 clicks is expected to reach.
MAX_LOAD_MORE_CLICKS = 60

# One list per output column. Each scraped athlete contributes exactly one
# entry to every list (np.nan when a value is unavailable), which keeps the
# lists index-aligned for the CSV export.
names = []
exp = []
athlete_grade = []
pos_height_weight = []
ages = []
ranks = []
high_school = []
home_town = []
colleges = []
college_distance = []
num_offers = []
NIL_val = []
instagram_followers = []
twitter_followers = []
tiktok_followers = []

# Per-athlete record key -> the column list that value is appended to.
_columns = {
    'exp': exp,
    'athlete_grade': athlete_grade,
    'pos_height_weight': pos_height_weight,
    'ages': ages,
    'ranks': ranks,
    'high_school': high_school,
    'home_town': home_town,
    'colleges': colleges,
    'college_distance': college_distance,
    'num_offers': num_offers,
    'NIL_val': NIL_val,
    'instagram_followers': instagram_followers,
    'twitter_followers': twitter_followers,
    'tiktok_followers': tiktok_followers,
}


def _print_lengths():
    """Print the length of every column list (alignment sanity check)."""
    print(len(names), len(exp), len(pos_height_weight), len(athlete_grade), len(ages), len(ranks), len(high_school), len(home_town),
          len(colleges), len(college_distance), len(num_offers), len(NIL_val),
          len(instagram_followers), len(twitter_followers), len(tiktok_followers))


def _load_soup(driver, url, wait_seconds):
    """Navigate `driver` to `url`, sleep `wait_seconds`, and return the parsed page source."""
    driver.get(url)
    time.sleep(wait_seconds)
    return BeautifulSoup(driver.page_source, 'html.parser')


def _parse_or_nan(parser, soup, n_fields, message, athlete_name):
    """Return `parser(soup)`; on any error print `message` and return `n_fields` NaNs.

    Values are only handed back once the whole parser has succeeded, so a
    failure half-way through can never leave a partially filled row.
    """
    try:
        return parser(soup)
    except Exception:
        print(message.format(athlete_name))
        return (np.nan,) * n_fields


def _parse_experience(soup):
    """Return (exp, athlete_grade, age) from the "Player" page.

    Raises AttributeError when the info box exists but the age element is missing.
    """
    info = soup.find(class_='CollegeRanking_info__LM3nn')
    if info is None:
        return np.nan, np.nan, np.nan

    exp_year = info.find_all(class_='MuiTypography-root CollegeRanking_span__qtAfW MuiTypography-subtitle1 MuiTypography-colorTextPrimary')
    if len(exp_year) == 2:
        exp_value, grade_value = exp_year[0].text, exp_year[1].text
    else:
        exp_value, grade_value = np.nan, np.nan

    age = info.find(class_='MuiTypography-root CollegeRanking_span__qtAfW MuiTypography-subtitle1 MuiTypography-colorPrimary')
    return exp_value, grade_value, age.text


def _parse_home(soup):
    """Return (high_school, home_town): the last two matching spans of the recruit module."""
    info = soup.find(class_='RecruitModule_info__Ugxqd')
    if info is None:
        return np.nan, np.nan

    home_info = info.find_all(class_='MuiTypography-root RecruitModule_span__KmmzN MuiTypography-subtitle1 MuiTypography-colorTextPrimary')
    return home_info[-2].text, home_info[-1].text


def _parse_measurements(soup):
    """Return a 1-tuple with the text of the second measurement element (pos_height_weight)."""
    attributes = soup.find(class_='MeasurementInfo_info__IHmGD')
    if attributes is None:
        return (np.nan,)

    items = attributes.find_all(class_='MuiTypography-root MeasurementInfo_text__dCryI MuiTypography-body1 MuiTypography-colorTextPrimary')
    return (items[1].text,)


def _parse_rank(soup):
    """Return a 1-tuple with the industry rating text from the "Recruiting" page."""
    wrapper = soup.find(class_="Rankings_industryRankWrapper__2qwnq")
    if wrapper is None:
        return (np.nan,)

    ranking = wrapper.find(class_="MuiTypography-root Rankings_industryRating__9uavm MuiTypography-body1 MuiTypography-colorTextPrimary")
    return (ranking.text,)


def _parse_offers(soup):
    """Return (first listed college, its distance text, number of team entries)."""
    teams = soup.find_all(class_='PlayerInterestsItem_teamContainer__vjQkf')
    if not teams:
        return np.nan, np.nan, np.nan

    # Only the first listed team is recorded; the rest are just counted.
    first_team = teams[0]
    college_name = first_team.find(class_='MuiTypography-root MuiLink-root MuiLink-underlineNone PlayerInterestsItem_teamName__FeBHv MuiTypography-h5 MuiTypography-colorPrimary')
    college_dist = first_team.find(class_='MuiTypography-root PlayerInterestsItem_distanceText__KJhj3 MuiTypography-caption MuiTypography-colorTextPrimary')
    return college_name.text, college_dist.text, len(teams)


def _parse_nil(soup):
    """Return a 1-tuple with the NIL valuation text, or NaN when the element is absent."""
    valuation = soup.find(class_='NilValuationCircle_nilCircleValue__wzomB')
    if valuation is None:
        return (np.nan,)
    return (valuation.text,)


def _parse_socials(soup):
    """Return (instagram, twitter, tiktok) follower texts; NaN for platforms not listed."""
    followers = {'instagram': np.nan, 'twitter': np.nan, 'tiktok': np.nan}

    container = soup.find(class_="NilSocialValuations_socialValuations__MeR7O")
    if container is not None:
        for social in container.find_all(class_="NilSocialValuations_platform__3qBMy"):
            href = social.find('a')['href']
            # First platform name contained in the link wins (instagram, twitter, tiktok order).
            for platform in followers:
                if platform in href:
                    followers[platform] = social.find(class_='MuiTypography-root NilSocialValuations_platformFollowers__sW8kK MuiTypography-body1 MuiTypography-colorTextPrimary').text
                    break

    return followers['instagram'], followers['twitter'], followers['tiktok']


def _scrape_athlete(driver, athlete_link, athlete_name, base_url):
    """Scrape one athlete's Player, Recruiting, all-teams and NIL pages.

    Returns a dict holding exactly one value for every key of `_columns`;
    anything that could not be scraped stays np.nan.
    """
    record = dict.fromkeys(_columns, np.nan)

    # ---- "Player" page: exp, athlete_grade, ages, high_school, home_town, pos_height_weight
    try:
        page_soup = _load_soup(driver, athlete_link, 3)
    except Exception:
        print("Player page not loading for athlete {}".format(athlete_name))
        return record

    record['exp'], record['athlete_grade'], record['ages'] = _parse_or_nan(
        _parse_experience, page_soup, 3,
        "exp, athlete_grade, age not available for athlete {}", athlete_name)
    record['high_school'], record['home_town'] = _parse_or_nan(
        _parse_home, page_soup, 2,
        "high_school, home_town not available for athlete {}", athlete_name)
    (record['pos_height_weight'],) = _parse_or_nan(
        _parse_measurements, page_soup, 1,
        "pos_height_weight not available for athlete {}", athlete_name)

    # ---- "Recruiting" page: ranks
    try:
        page_soup = _load_soup(driver, urljoin(athlete_link, 'recruiting/'), 3)
    except Exception:
        print("Recruiting page not loading for athlete {}".format(athlete_name))
        return record

    (record['ranks'],) = _parse_or_nan(
        _parse_rank, page_soup, 1,
        "ranks not available for athlete {}", athlete_name)

    # ---- "All teams" page: colleges, college_distance, num_offers
    # A missing link only costs these three fields; the NIL page below is
    # independent of it and is still scraped.
    try:
        all_team_link = page_soup.find(class_='MuiTypography-root MuiLink-root MuiLink-underlineHover PlayerInterestsModule_text__kjqNU MuiTypography-caption MuiTypography-colorPrimary')
        teams_soup = _load_soup(driver, urljoin(base_url, all_team_link['href']), 5)
    except Exception:
        print("all_team_link page is not loading for athlete {}".format(athlete_name))
        teams_soup = None

    if teams_soup is not None:
        record['colleges'], record['college_distance'], record['num_offers'] = _parse_or_nan(
            _parse_offers, teams_soup, 3,
            "colleges, college_distance, num_offers are not available for athlete {}", athlete_name)

    # ---- "NIL" page: NIL_val, instagram_followers, twitter_followers, tiktok_followers
    try:
        page_soup = _load_soup(driver, urljoin(athlete_link, 'nil/'), 3)
    except Exception:
        print("NIL page not loading for athlete {}".format(athlete_name))
        return record

    (record['NIL_val'],) = _parse_or_nan(
        _parse_nil, page_soup, 1,
        "NIL_val not available for athlete {}", athlete_name)
    record['instagram_followers'], record['twitter_followers'], record['tiktok_followers'] = _parse_or_nan(
        _parse_socials, page_soup, 3,
        "insta, twitter, and tiktok followers not available for athlete {}", athlete_name)

    return record


for year in years:
    for sport in sports:
        # One browser per sport/year; the finally below closes it even when
        # the scrape is interrupted or fails.
        driver = webdriver.Chrome()
        try:
            URL = f'https://www.on3.com/db/rankings/industry-player/{sport}/{year}/'
            driver.get(URL)

            # Click "Load More" until the button stops being clickable or the
            # click cap is reached, so that all athlete links are in the page.
            dummyCount = 0
            while dummyCount < MAX_LOAD_MORE_CLICKS:
                try:
                    load_more_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//span[@class='MuiButton-label' and contains(text(), 'Load More')]"))
                    )
                    load_more_button.click()
                    time.sleep(12)
                    dummyCount += 1
                except Exception:
                    break

            # Collect all athlete names and their profile links
            page_soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = page_soup.find(class_="PlayerRankings_playerRankings__jvfFg")
            if results is None:
                print('Rankings list not found for {} {}; skipping.'.format(sport, year))
                continue

            athletes = results.find_all('a', class_="MuiTypography-root MuiLink-root MuiLink-underlineHover MuiTypography-h5 MuiTypography-colorPrimary")
            names_ = [athlete.text for athlete in athletes]
            # Accumulate names like every other column, so rows stay paired
            # when more than one sport/year is scraped.
            names.extend(names_)

            base_url = "https://www.on3.com/"
            athlete_links = [urljoin(base_url, athlete['href']) for athlete in athletes]

            tot = len(athlete_links)

            print('Successfully recovered {} athletes for {} {}!'.format(tot, sport, year))
            print('========== Starting Scrape ===========')

            for i, athlete_link in enumerate(athlete_links):
                record = _scrape_athlete(driver, athlete_link, names_[i], base_url)
                for key, column in _columns.items():
                    column.append(record[key])

                # Progress report every 10 athletes
                if (i % 10 == 0) and (i != 0):
                    print("=== Scraped {:.2f} % of the Players ===".format((i/tot)*100))
                    _print_lengths()
                    print("======================================")

            print("=== Scraped 100.0 % of the Players ===")
            _print_lengths()
            print("======================================")
            print('Successfully completed {} athletes for {} {}!'.format(tot, sport, year))
        finally:
            # Close this sport/year's browser instance
            driver.quit()

Successfully recovered 1860 athletes for football 2024!
=== Scraped 0.54 % of the Players ===
1860 11 11 11 11 11 11 11 11 11 11 11 11 11 11
=== Scraped 1.08 % of the Players ===
1860 21 21 21 21 21 21 21 21 21 21 21 21 21 21
=== Scraped 1.61 % of the Players ===
1860 31 31 31 31 31 31 31 31 31 31 31 31 31 31
=== Scraped 2.15 % of the Players ===
1860 41 41 41 41 41 41 41 41 41 41 41 41 41 41
=== Scraped 2.69 % of the Players ===
1860 51 51 51 51 51 51 51 51 51 51 51 51 51 51
=== Scraped 3.23 % of the Players ===
1860 61 61 61 61 61 61 61 61 61 61 61 61 61 61
=== Scraped 3.76 % of the Players ===
1860 71 71 71 71 71 71 71 71 71 71 71 71 71 71
=== Scraped 4.30 % of the Players ===
1860 81 81 81 81 81 81 81 81 81 81 81 81 81 81
=== Scraped 4.84 % of the Players ===
1860 91 91 91 91 91 91 91 91 91 91 91 91 91 91
=== Scraped 5.38 % of the Players ===
1860 101 101 101 101 101 101 101 101 101 101 101 101 101 101
=== Scraped 5.91 % of the Players ===
1860 111 111 111 111 111 111 111 111 111 1

In [8]:
# Sanity check: print the length of every column list on one line; they are
# expected to all match before the data is written out.
column_lists = (
    names, exp, pos_height_weight, athlete_grade, ages, ranks, high_school, home_town,
    colleges, college_distance, num_offers, NIL_val,
    instagram_followers, twitter_followers, tiktok_followers,
)
print(*map(len, column_lists))
# print(names[0], exp[0], pos_height_weight[0], athlete_grade[0], ages[0], ranks[0], high_school[0])
# print(home_town[0], colleges[0], college_distance[0], num_offers[0])
# print(NIL_val[0], instagram_followers[0], twitter_followers[0], tiktok_followers[0])

1860 1860 1860 1860 1860 1860 1860 1860 1860 1860 1860 1860 1860 1860 1860


In [9]:
# Write the scraped columns to csv_files/<sport>_<year>.csv, one row per athlete.
# `sport` and `year` are whatever the scraping loop last assigned.
column_names = ['NAME', 'EXP', 'POS_HEI_WEI', 'GRADE', 'AGE', 'SKILL', 'HISCH', 'HOTOWN', 'STARCOLL', 'COLLDIST', 'NUMOFF', 'INSTA', 'TWIT', 'TIK', 'NILVAL']

csv_file_path = 'csv_files/{}_{}.csv'.format(sport, year)

# Same order as column_names.
csv_columns = [names, exp, pos_height_weight, athlete_grade, ages, ranks, high_school, home_town,
               colleges, college_distance, num_offers, instagram_followers, twitter_followers,
               tiktok_followers, NIL_val]

# zip() stops at the shortest input, so unequal lists would silently drop or
# mis-pair rows. Refuse to write rather than produce a corrupt file.
column_lengths = dict(zip(column_names, map(len, csv_columns)))
if len(set(column_lengths.values())) > 1:
    raise ValueError('Column lists have different lengths: {}'.format(column_lengths))

# Create the output folder on first use instead of failing in open().
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

# Explicit UTF-8 so non-ASCII text does not depend on the platform's default encoding.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    writer.writerow(column_names)
    writer.writerows(zip(*csv_columns))

print("csv saved to 'csv_files' folder!'")

csv saved to 'csv_files' folder!'
