## ECON8320001.1231 Final Project
### By Mig Shyaka

#### Problem
Collect and organize the information about student-athlete NIL deals found on https://nilcollegeathletes.com, https://www.on3.com/os/, or other NIL sources. Collect the following information about each athlete:

- School
- Sport
- Sponsorships (you should decide how to organize this information, and explain in your write-up why you made that choice)
- Athlete's social media accounts

In [12]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

headers = []
rows = []

# Abort a request that gets no answer instead of hanging the notebook forever.
REQUEST_TIMEOUT = 30  # seconds

# Walk the paginated athlete listing and collect the table on each page.
# ``range``'s end is exclusive, so ``range(1, 3)`` fetches pages 1 and 2 only
# (the small subset used for submission). Use ``range(1, 421)`` to fetch all
# 420 pages.
for page_num in range(1, 3):
    url = f"https://nilcollegeathletes.com/athletes?page={page_num}"
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error rather than parsing an error page as if it were data.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Column names are read once, from the first page that exposes <th> cells.
    if not headers:
        headers.extend(th.text.strip() for th in soup.find_all("th"))

    # The first <tr> is skipped (header row); every other row contributes the
    # stripped text of its <td> cells.
    for row in soup.find_all("tr")[1:]:
        row_data = [cell.text.strip() for cell in row.find_all("td")]
        # A row without any <td> carries no athlete data; keeping it would
        # create an all-empty record in the DataFrame.
        if row_data:
            rows.append(row_data)

# One DataFrame row per athlete, with the columns named after the table headers.
df = pd.DataFrame(rows, columns=headers)

# Visit every athlete's profile page and collect the sponsor names and the
# Instagram / Twitter handles shown there. Each list below receives exactly one
# entry per row of ``df`` so it can be attached as a column afterwards.
athlete_links = []
sponsors_list = []
instagram_list = []
twitter_list = []

for name in df["Name"]:
    # The profile URL is built from the first two lower-cased words of the
    # name. Slicing (instead of indexing [0] and [1]) keeps one-word names
    # from raising IndexError.
    # NOTE(review): the "<first>-<second>" URL pattern is an assumption about
    # the site; confirm it for names with suffixes or middle names.
    name_parts = name.lower().split() if isinstance(name, str) else []
    sponsors = []
    instagram = ""
    twitter = ""

    if not name_parts:
        # No usable name means there is no profile to visit; record blanks so
        # the lists stay aligned with the DataFrame rows.
        athlete_links.append("")
        sponsors_list.append(sponsors)
        instagram_list.append(instagram)
        twitter_list.append(twitter)
        continue

    athlete_link = "https://nilcollegeathletes.com/athletes/" + "-".join(name_parts[:2])
    athlete_links.append(athlete_link)

    # A timeout prevents one unresponsive profile from stalling the whole run.
    athlete_response = requests.get(athlete_link, timeout=30)

    # Only scrape successful responses; an error page holds no athlete data.
    if athlete_response.ok:
        athlete_soup = BeautifulSoup(athlete_response.content, "html.parser")

        # Sponsor names are the link texts inside every <ul class="space-y-1">.
        for sponsor_block in athlete_soup.find_all("ul", class_="space-y-1"):
            for anchor in sponsor_block.find_all("a"):
                sponsors.append(anchor.text.strip())

        # Social handles are <span class="pr-2"> elements; the enclosing
        # element's href tells which network the handle belongs to.
        for handle in athlete_soup.find_all("span", class_="pr-2"):
            parent = handle.parent
            # .get() avoids a KeyError when the enclosing element has no href.
            href = (parent.get("href") or "") if parent is not None else ""
            if "instagram" in href:
                instagram = handle.text.strip()
            elif "twitter" in href:
                twitter = handle.text.strip()

    sponsors_list.append(sponsors)
    instagram_list.append(instagram)
    twitter_list.append(twitter)

df["Sponsors"] = sponsors_list
df["Instagram"] = instagram_list
df["Twitter"] = twitter_list

# One row per (athlete, sponsor) pair; athletes without sponsors keep a single
# row whose sponsor is missing.
df_final = df.explode("Sponsors").reset_index(drop=True)
# The "Details" column of the listing table is not needed in the final data.
df_final = df_final.drop(columns="Details", errors="ignore")
df_final.head()


Unnamed: 0,Name,Sponsors,University,Sport,Instagram,Twitter
0,Jashon Hubbard,Liquid I.V.,The Ohio State University,Wrestling,jashonhubbard,@jashon_hubbard
1,Jashon Hubbard,Celsius,The Ohio State University,Wrestling,jashonhubbard,@jashon_hubbard
2,Jashon Hubbard,Essentia Water,The Ohio State University,Wrestling,jashonhubbard,@jashon_hubbard
3,Jashon Hubbard,Ez Fresh Meals,The Ohio State University,Wrestling,jashonhubbard,@jashon_hubbard
4,Jashon Hubbard,Playa Bowls,The Ohio State University,Wrestling,jashonhubbard,@jashon_hubbard


#### The head of df_final above shows that we have successfully collected information about each athlete:

- School
- Sport
- Sponsorships (you should decide how to organize this information, and explain in your write-up why you made that choice)
- Athlete's social media accounts
Note: for submission purposes, df_final contains only the first two listing pages, because `range(1, 3)` stops before 3. To collect the whole dataset, change the upper bound of the page loop from 3 to 421 so that all 420 pages are fetched.  
I have saved the full dataset on my local computer.

In [2]:
# Write the exploded sample (one row per athlete/sponsor pair) to disk.
sample_path = "NIL_deals_sample"
df_final.to_csv(sample_path)

### Exploratory data analysis

In [3]:
import ast


def _parse_sponsors(raw):
    """Turn one CSV cell of the ``Sponsors`` column back into a list of names.

    The cell is presumably the ``str()`` of a Python list written by an
    earlier ``to_csv`` call (verify against the saved file). It is parsed with
    ``ast.literal_eval`` so that names containing commas, apostrophes or
    double quotes survive intact. Missing cells (NaN) yield an empty list, and
    text that is not a Python literal falls back to the plain comma split.
    """
    if not isinstance(raw, str):
        # pandas represents an empty cell as a float NaN, which has no .split().
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal (e.g. a bare sponsor name): split on commas.
        return [s.strip(" '[]") for s in raw.split(',')]
    if isinstance(parsed, (list, tuple)):
        return [str(item).strip() for item in parsed]
    return [str(parsed).strip()]


df = pd.read_csv("NIL_deals_csv")
df['Sponsors'] = df['Sponsors'].apply(_parse_sponsors)
# One row per (athlete, sponsor) pair; an empty sponsor list becomes a missing
# value, which value_counts() ignores by default.
df_final = df.explode("Sponsors")
# Convert the column to string type
df['Sponsors'] = df['Sponsors'].astype(str)



In [17]:
import seaborn as sns 
import plotly.express as px
import kaleido
# Pie chart comparing the most frequent sponsor with all other sponsors.
#
# The counts live in ``sponsor_count`` rather than ``univ_count``: that name
# holds the university counts used by other cells, and reusing it here would
# overwrite them whenever this cell runs later.

# Number of rows per sponsor, most frequent first; rows with missing values
# are dropped.
sponsor_count = df_final['Sponsors'].value_counts().reset_index().dropna()
sponsor_count.columns = ['Sponsors', 'Count']

# value_counts() sorts in descending order, so row 0 is the top sponsor.
first_sponsor = sponsor_count.iloc[0]['Sponsors']

# Keep the top sponsor's own name and label every other sponsor 'Other Sponsors'.
sponsor_count['Sponsor Category'] = sponsor_count['Sponsors'].where(
    sponsor_count['Sponsors'] == first_sponsor, 'Other Sponsors'
)

# Sum only the numeric 'Count' column. Summing the whole frame would also
# concatenate every sponsor name into one long string per category.
sponsor_category_count = (
    sponsor_count.groupby('Sponsor Category')['Count'].sum().reset_index()
)

# Plot a pie chart of sponsor categories
fig = px.pie(sponsor_category_count, values='Count', names='Sponsor Category', title='Sponsor Categories')
fig.show()
fig.write_image("fig4.jpeg")

In [5]:
# Bar chart of how many rows of ``df`` belong to each sport, most common first.
sport_tally = df['Sport'].value_counts()
sport_count = sport_tally.reset_index()
sport_count.columns = ['Sport', 'Count']

fig = px.bar(
    data_frame=sport_count,
    x='Sport',
    y='Count',
    title='Ranking of Sports',
)
fig.show()

# Save a static copy of the chart next to the notebook.
fig.write_image("fig1.jpeg")

In [6]:
# Rows per university, sorted from most to least frequent.
univ_count = df['University'].value_counts().reset_index()
univ_count.columns = ['University', 'Count']

# Take an independent copy of the first five rows. A later cell adds columns
# to ``top_univ_count``, and doing that on a bare ``head()`` slice triggers
# pandas' SettingWithCopyWarning.
top_univ_count = univ_count.head(5).copy()

fig = px.bar(top_univ_count, x='University', y='Count', title='Top 5 Universities')
fig.show()
fig.write_image("fig2.jpeg")


In [7]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [8]:
from geopy.geocoders import Nominatim


# Look up coordinates for the top universities and plot them on a map.
from geopy.extra.rate_limiter import RateLimiter

# Add the new columns to an independent copy, not to a slice of ``univ_count``.
top_univ_count = top_univ_count.copy()

# NOTE(review): 'my_app' is a generic user agent; Nominatim asks for one that
# identifies the application. Confirm and replace it before heavier use.
geolocator = Nominatim(user_agent='my_app')
# The public Nominatim service allows at most one request per second, so the
# lookups are spaced out with geopy's RateLimiter. By default it retries
# failed lookups and then returns None instead of raising.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

top_univ_count['location'] = top_univ_count['University'].apply(geocode)
# A university that could not be geocoded gets missing coordinates.
top_univ_count['latitude'] = top_univ_count['location'].apply(lambda loc: loc.latitude if loc else None)
top_univ_count['longitude'] = top_univ_count['location'].apply(lambda loc: loc.longitude if loc else None)

# Plot the universities on an OpenStreetMap base layer.
fig = px.scatter_mapbox(top_univ_count, lat='latitude', lon='longitude', color='University', zoom=3)
fig.update_layout(mapbox_style='open-street-map')
fig.show()



In [9]:
# Bar chart of the five universities at the bottom of the count table.
last_univ_count = univ_count.iloc[-5:]

fig = px.bar(
    data_frame=last_univ_count,
    x='University',
    y='Count',
    title='Last 5 Universities',
)
fig.show()

# Save a static copy of the chart next to the notebook.
fig.write_image("fig3.jpeg")