In [3]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [4]:
# saving url link
url = 'https://www.pro-football-reference.com/players/C/ChasJa00.htm'

In [5]:
# requests
r = requests.get(url)

In [None]:
# exploring page
page = BeautifulSoup(r.content, 'html.parser')
print(page.prettify())

In [None]:
# testing the title
page.title.string

"Ja'Marr Chase Stats, Height, Weight, Position, Draft, College | Pro-Football-Reference.com"

In [None]:
# trying to find the keyword 'birth'
text = page.find_all(text=re.compile("birth"))

In [None]:
text

['\n{\n    "@context": "http://schema.org",\n    "@type": "Person",\n    "name": "Ja\'Marr Chase",\n    "url": "https://www.pro-football-reference.com/players/C/ChasJa00.htm",\n    "image": {\n        "@type": "ImageObject",\n        "caption": "Ja\'Marr Chase, Cincinnati Bengals", \n        "representativeOfPage": true, \n        "contentUrl": "https://www.pro-football-reference.com/req/20230307/images/headshots/ChasJa00_2022.jpg"\n    },\n    "memberOf": {\n        "@type": "SportsTeam",\n        "name": "Cincinnati Bengals",\n        "sport": "Football"\n    },\n    "birthDate": "2000-03-01"\n    ,\n  "birthPlace": "Harvey, LA, USA",\n    "height": { "@type": "QuantitativeValue", "value": "6-1" },\n    "weight": { "@type": "QuantitativeValue", "value": "200 lbs" }\n}\n']

In [None]:
# so we found this giant lump of text that seems to contain all the biographical information we need.
# Bazinga!
# let's parse through it

In [None]:
# first lets remove as much noise as we can

# Join the elements of the list into a single string
text = "".join(text)

# Use the re module to extract the text within quotes
result = re.findall(r'"([^"]*)"', text)

print(result)

['@context', 'http://schema.org', '@type', 'Person', 'name', "Ja'Marr Chase", 'url', 'https://www.pro-football-reference.com/players/C/ChasJa00.htm', 'image', '@type', 'ImageObject', 'caption', "Ja'Marr Chase, Cincinnati Bengals", 'representativeOfPage', 'contentUrl', 'https://www.pro-football-reference.com/req/20230307/images/headshots/ChasJa00_2022.jpg', 'memberOf', '@type', 'SportsTeam', 'name', 'Cincinnati Bengals', 'sport', 'Football', 'birthDate', '2000-03-01', 'birthPlace', 'Harvey, LA, USA', 'height', '@type', 'QuantitativeValue', 'value', '6-1', 'weight', '@type', 'QuantitativeValue', 'value', '200 lbs']


In [None]:
# Create a dataframe from the result
df = pd.DataFrame(result, columns=['Data'])

In [None]:
df

Unnamed: 0,Data
0,@context
1,http://schema.org
2,@type
3,Person
4,name
5,Ja'Marr Chase
6,url
7,https://www.pro-football-reference.com/players...
8,image
9,@type


In [None]:
# Find the index of the row that contains the value 'birthdate'
name_index = df[df['Data'] == 'Person'].index[0]
birthdate_index = df[df['Data'] == 'birthDate'].index[0]
birthplace_index = df[df['Data'] == 'birthPlace'].index[0]
height_index = df[df['Data'] == 'height'].index[0]
weight_index = df[df['Data'] == 'weight'].index[0]

# Extract the value of the row that comes after the birthdate row
name_value = df.loc[name_index + 2, 'Data']
birthdate_value = df.loc[birthdate_index + 1, 'Data']
birthplace_value = df.loc[birthplace_index + 1, 'Data']
height_value = df.loc[height_index + 4, 'Data']
weight_value = df.loc[weight_index + 4, 'Data']


print(name_value,birthdate_value,birthplace_value, height_value,weight_value)

Ja'Marr Chase 2000-03-01 Harvey, LA, USA 6-1 200 lbs


In [None]:
# we want name, birthDate, birthPlace, height, weight
# name = Person_index + 2
# birthDate + 1, birthPlace +1
# height + 4, weight +4


In [None]:
# Create an empty dataframe with the desired columns
df = pd.DataFrame(columns=['name', 'bday', 'bplace', 'height', 'weight'])

# Store the variables in a dictionary
data = {
    'name': [name_value],
    'bday': [birthdate_value],
    'bplace': [birthplace_value],
    'height': [height_value],
    'weight': [weight_value]
}

# Append the data to the dataframe
df = df.append(pd.DataFrame(data), ignore_index=True)

  df = df.append(pd.DataFrame(data), ignore_index=True)


In [None]:
df

Unnamed: 0,name,bday,bplace,height,weight
0,Ja'Marr Chase,2000-03-01,"Harvey, LA, USA",6-1,200 lbs


In [None]:
# trying to find high school

In [None]:
text2 = page.find_all(text=re.compile("Upson"))

In [None]:
text2

[]

In [None]:
text2 = page.find('a')

In [None]:
text2

<a href="https://www.sports-reference.com/?utm_source=pfr&amp;utm_medium=sr_xsite&amp;utm_campaign=2023_01_srnav"><svg height="15px" width="20px"><use xlink:href="#ic-sr-pennant"></use></svg> Sports Reference ®</a>

In [None]:
# trying to find high school again, but with a different player

In [None]:
saved = 'https://www.pro-football-reference.com/players/P/PurdBr00.htm'

In [None]:
r = requests.get(saved)
page = BeautifulSoup(r.content, 'html.parser')
text = page.find_all(text=re.compile("(AZ)"))

In [None]:
text[0].parent

<a href="/schools/high_schools.cgi?hs_state=AZ">AZ</a>

In [None]:
link = page.find_all("a", href="/schools/high_schools.cgi?")

In [None]:
print(link)

[]


In [None]:
links = page.find_all("a", href=lambda href: href and href.startswith("/schools/high_schools.cgi?"))

In [None]:
links[1].text

'AZ'

In [None]:
names = 'HS '+ links[1].text +','+links[0].text

In [None]:
names

'HS AZ,Perry'

In [None]:
# code to potentially use to create new 'birthplace' column, which if null, will be replaced with HS state value
try:
    birthplace_index = df[df['Data'] == 'birthPlace'].index[0]
    birthplace_value = df.loc[birthplace_index + 1, 'Data']
except IndexError:
    try:
        links = page.find_all("a", href=lambda href: href and href.startswith("/schools/high_schools.cgi?"))
        names = 'HS '
        for link in links:
            names = names + link.text + ','
        names= names[:-1]
    except IndexError:
        birthplace_value = None

KeyError: 'Data'

In [None]:
links = page.find_all("a", href=lambda href: href and href.startswith("/schools/high_schools.cgi?"))
names = 'HS '
for link in links:
    names = names + link.text + ','
names= names[:-1]

In [None]:
names= names[:-1]

In [None]:
names

'HS Perry,A'

In [None]:
final = 'https://www.pro-football-reference.com/players/W/WalkTr03.htm'

In [None]:
r = requests.get(final)
page = BeautifulSoup(r.content, 'html.parser')
links = page.find_all("a", href=lambda href: href and href.startswith("/schools/high_schools.cgi?"))
names = 'HS '
for link in links:
    names = names + link.text + ','
names= names[:-1]

In [None]:
names

'HS Upson,GA'

In [None]:
links

[<a href="/schools/high_schools.cgi?id=93bcd6a7">Upson</a>,
 <a href="/schools/high_schools.cgi?hs_state=GA">GA</a>]

In [None]:
del.names

SyntaxError: invalid syntax (2143224281.py, line 1)

##  looking for image url

In [None]:
text = page.find_all(text=re.compile("20180910"))

In [None]:
text

['\n{\n    "@context": "http://schema.org",\n    "@type": "Person",\n    "name": "Travon Walker",\n    "url": "https://www.pro-football-reference.com/players/W/WalkTr03.htm",\n    "image": {\n        "@type": "ImageObject",\n        "caption": "Travon Walker, Jacksonville Jaguars", \n        "representativeOfPage": true, \n        "contentUrl": "https://www.pro-football-reference.com/req/20180910/images/headshots/WalkTr03_2022.jpg"\n    },\n    "memberOf": {\n        "@type": "SportsTeam",\n        "name": "Jacksonville Jaguars",\n        "sport": "Football"\n    },\n    "birthDate": "2000-12-18"\n    ,\n  "birthPlace": "Thomaston, GA, USA",\n    "height": { "@type": "QuantitativeValue", "value": "6-5" },\n    "weight": { "@type": "QuantitativeValue", "value": "275 lbs" }\n}\n']