In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import ipdb
from bs4 import BeautifulSoup

# Inputs

In [2]:
df = pd.read_csv('../raw_data/all_clean.csv')

In [4]:
df = df[['Id', 'Player']].drop_duplicates()
df.reset_index(inplace=True)
df.drop(columns="index", inplace=True)

In [5]:
df

Unnamed: 0,Id,Player
0,1,Greg Oden
1,2,Carl Landry
2,3,Gabe Pruitt
3,4,Glen Davis
4,5,Jermareo Davidson
...,...,...
63964,172607,Will Mcaloney
63965,172628,Kent Salado
63966,172801,Prince Rivero
63967,175140,Adrian Riesgo


# Soup

In [6]:
pid = 1
url = f'https://basketball.realgm.com/player/a/Summary/{pid}'

In [7]:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [8]:
box = soup.find(class_="profile-box")
box

<div class="profile-box">
<div class="wrapper clearfix container">
<div class="half-column-left" style="margin: 0;">
<h2 style="margin-top: 0;">Greg Oden <span class="feature">C</span>  <span class="feature">#20</span></h2>
<img src="/images/nba/4.2/profiles/photos/2006/Oden_Greg_mia_1314.jpg" style="border: 1px solid #000; float: left; margin-right: 15px; margin-top:5px;"/>
<p><strong>Born:</strong> <a href="/info/birthdays/19880122/1">Jan 22, 1988</a> (32 years old)</p>
<p><strong>Birthplace/Hometown:</strong> <a href="/info/birth_cities/188/Buffalo-NY-United-States">Buffalo, New York</a></p>
<p><strong>Nationality:</strong> <a href="/info/nationality/1/United-States/O">United States</a></p>
<p><strong>Height:</strong> 7-0 (213cm)     <strong>Weight:</strong> 273 (124kg)</p>
<p><strong>Website:</strong> <a href="http://www.gregoden52.com/" target="_blank">http://www.gregoden52.com/</a></p>
</div>
<div class="half-column-right" style="margin: 0;">
<img src="/images/basketball/5.0/team

# Info

In [9]:
position = box.select('h2 > span')[0].text
position

'C'

In [10]:
photo = box.find('img', src=True)['src']
photo

'/images/nba/4.2/profiles/photos/2006/Oden_Greg_mia_1314.jpg'

In [11]:
birthdate = box.find("strong", string="Born:").parent.find('a').text
birthdate

'Jan 22, 1988'

In [12]:
birthplace = box.find("strong", string="Birthplace/Hometown:").parent.find('a').text
birthplace

'Buffalo, New York'

In [13]:
nationality = box.find("strong", string="Nationality:").parent.find('a').text
nationality

'United States'

In [14]:
size = box.find("strong", string="Height:").parent.text.split()
height = size[1]
weight = size[4]
height, weight

('7-0', '273')

In [15]:
nba_status = box.find("strong", string="Current NBA Status:").parent.text.split(":")[1].strip()
nba_status

'Unrestricted Free Agent'

In [16]:
agent = box.find("strong", string="Agent:").parent.find('a').text
agent

'Bill Duffy'

In [17]:
hs_info = box.find("strong", string="High School:").parent
highschool = hs_info.find('a').text
hslocation = hs_info.text.split('[')[1].strip(']')
highschool, hslocation

('Lawrence North High School', 'Indianapolis, Indiana')

In [18]:
draft_entry = box.find("strong", string="Draft Entry:").parent.find('a').text
draft_entry

'2007 NBA Draft'

In [19]:
draft_early = box.find("strong", string="Early Entry Info:").parent.find('a').text
draft_early

'2007 Early Entrant'

In [20]:
drafted = box.find("strong", string="Drafted:").parent.find('a').text
drafted

'Round 1, Pick 1, Portland Trail Blazers'

In [21]:
predraft_team = box.find("strong", string="Pre-Draft Team:").parent.find('a').text
predraft_team

'Ohio State'

# Basic Function

In [22]:
def scrape_player(id):
    url = f'https://basketball.realgm.com/player/a/Summary/{id}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    box = soup.find(class_="profile-box")
    
    if box:
        position = box.select('h2 > span')[0].text
        
        birthdate_info = box.find("strong", string="Born:")
        if birthdate_info: 
            birthdate = birthdate_info.parent.find('a').text
        else:
            birthdate = None
        
        birthplace_info = box.find("strong", string="Birthplace/Hometown:")
        if birthplace_info: 
            birthplace = birthplace_info.parent.find('a').text
        else:
            birthplace = None
        
        size_info = box.find("strong", string="Height:")
        if size_info:
            size = size_info.parent.text.split()
            height, weight = size[1], size[4]
        else:
            height, weight = None, None
            
        nba_status_info = box.find("strong", string="Current NBA Status:")
        if nba_status_info: 
            nba_status = nba_status_info.parent.text.split(":")[1].strip()
        else:
            nba_status = None
            
        agent_info = box.find("strong", string="Agent:")
        if agent_info: 
            agent = agent_info.parent.find('a').text
        else:
            agent = None
        
        hs_info = box.find("strong", string="High School:")
        if hs_info:
            hs = hs_info.parent
            highschool = hs.find('a').text
            hslocation = hs.text.split('[')[1].strip(']')
        else:
            highschool = None
            hslocation = None
            
        draft_entry_info = box.find("strong", string="Draft Entry:")
        if draft_entry_info: 
            draft_entry = draft_entry_info.parent.text
        else:
            draft_entry = None
            
        draft_early_info = box.find("strong", string="Early Entry Info:")
        if draft_early_info: 
            draft_early = draft_early_info.parent.find('a').text
        else:
            draft_early = None
            
        drafted_info = box.find("strong", string="Drafted:")
        if drafted_info: 
            drafted = drafted_info.parent.text
        else:
            drafted = None
            
        predraft_team_info = box.find("strong", string="Pre-Draft Team:")
        if predraft_team_info: 
            predraft_team = predraft_team_info.parent.find('a').text
        else:
            predraft_team = None
        
        photo = box.find('img', src=True)['src']
        
        data = {'Position': position,
                'Birthdate': birthdate,
                'Birthplace': birthplace,
                'Height': height,
                'Weight': weight,
                'Status': nba_status,
                'Agent': agent,
                'Highschool': highschool,
                'HsLocation': hslocation,
                'DraftEntry': draft_entry,
                'DraftEarly': draft_early,
                'Drafted': drafted,
                'PreDraftTeam': predraft_team,
                'Photo': photo}
        return data
    
    return None

In [23]:
pid = df['Id'].sample().values[0]
print(pid)
data = scrape_player(int(pid))

9984


In [24]:
data

{'Position': 'PG',
 'Birthdate': 'Apr 3, 1984',
 'Birthplace': 'Grapevine, Texas',
 'Height': '5-10',
 'Weight': '175',
 'Status': 'Unrestricted Free Agent',
 'Agent': None,
 'Highschool': 'Colleyville Heritage High School',
 'HsLocation': 'Colleyville, Texas',
 'DraftEntry': 'Draft Entry: 2006 NBA Draft',
 'DraftEarly': None,
 'Drafted': 'Drafted: Undrafted',
 'PreDraftTeam': 'Texas Christian',
 'Photo': '/images/nba/4.2/profiles/photos/2006/player_photo.jpg'}

In [29]:
round(100/df.shape[0] * 100, 2)

0.16