In [8]:
# Credit to https://github.com/robotallie/baseball-injuries/blob/master/injury_data_scrape.ipynb, just had to make a few adjustments
import csv
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lxml 

%matplotlib inline

In [10]:
# Scrape the disabled-list search results from prosportstransactions.com for
# 1999-03-01 through 2017-11-01. Results are paged through the `start` query
# parameter, which advances in steps of ROWS_PER_PAGE.
SEARCH_URL = (
    "http://www.prosportstransactions.com/baseball/Search/SearchResults.php"
    "?Player=&Team=&BeginDate=1999-03-01&EndDate=2017-11-01"
    "&DLChkBx=yes&submit=Search&start="
)
ROWS_PER_PAGE = 25
N_PAGES = 731  # number of page increments plus one
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the loop forever

# Collect one list of text fields per table row.
injuries_data = []
# A single Session reuses the HTTP connection across the several hundred page requests.
with requests.Session() as session:
    for i in range(N_PAGES):
        url_string = SEARCH_URL + str(ROWS_PER_PAGE * i)
        req = session.get(url_string, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an HTTP error rather than silently skipping a page of rows.
        req.raise_for_status()
        soup = BeautifulSoup(req.content, "html.parser")
        # Rows of interest are the <tr align="left"> elements; each row's text is
        # split on newlines to get its individual fields.
        # NOTE(review): assumes every such row splits into exactly five fields — confirm.
        for item in soup.find_all("tr", {"align": "left"}):
            raw_text = item.text.strip().split("\n")
            injuries_data.append(raw_text)

# Create a dataframe from the injuries data for 730 pages, with ~25 per page = ~18,250 rows
df = pd.DataFrame(injuries_data)
df.head()

Unnamed: 0,0,1,2,3,4
0,1999-04-05,Cardinals,,• David Howard (Wayne),placed on 15-day DL
1,1999-04-05,Cardinals,,• Ray Lankford,placed on 15-day DL
2,1999-04-05,Giants,,• Bill Mueller,placed on 15-day DL
3,1999-04-08,Padres,,• George Arias,placed on 15-day DL
4,1999-04-12,Blue Jays,• Robert Person,,activated from 15-day DL


In [11]:
# Give the five positional columns of the scraped frame descriptive headings.
column_names = ['Date', 'Team', 'Acquired', 'Relinquished', 'Notes']
df.columns = column_names
df.head()

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,1999-04-05,Cardinals,,• David Howard (Wayne),placed on 15-day DL
1,1999-04-05,Cardinals,,• Ray Lankford,placed on 15-day DL
2,1999-04-05,Giants,,• Bill Mueller,placed on 15-day DL
3,1999-04-08,Padres,,• George Arias,placed on 15-day DL
4,1999-04-12,Blue Jays,• Robert Person,,activated from 15-day DL


In [14]:
# Indicator column: 1 when the note contains 'placed' (the row records an
# injury placement), 0 otherwise (e.g. a player being reactivated).
df['Injury'] = df['Notes'].map(lambda note: 1 if 'placed' in note else 0)

In [15]:
# First step towards pulling the DL length out of the Notes column:
# turn every hyphen into a space (e.g. '15-day' -> '15 day') so the number
# stands on its own when the text is split.
hyphen_to_space = str.maketrans('-', ' ')
df['Notes'] = df['Notes'].map(lambda note: note.translate(hyphen_to_space))

In [16]:
def filter_notes_for_DL(notes):
    """Return the disabled-list length, in days, mentioned in a Notes string.

    The note is searched for the stand-alone numbers 15, 60, 10 and 7, in that
    priority order, and the first one found is returned. Notes that mention
    none of them (for example 'restricted' or 'temporary' list moves) give 0.

    Numbers are matched as whole digit runs rather than as substrings, so a
    year such as '2015' or a figure such as '17' is not mistaken for a DL
    length.

    Parameters
    ----------
    notes : str
        Free-text transaction note, e.g. 'placed on 15 day DL'.

    Returns
    -------
    int
        15, 60, 10 or 7 when that number appears in the note, otherwise 0.
    """
    # Blank out every non-digit character, then split: what remains is the list
    # of complete numbers in the note ('60 day DL in 2015' -> ['60', '2015']).
    digit_runs = ''.join(ch if ch.isdigit() else ' ' for ch in notes).split()

    # Same priority order as the original if/elif chain.
    for days in (15, 60, 10, 7):
        if str(days) in digit_runs:
            return days
    return 0

In [17]:
# Derive the DL length for every row from its note text.
df['DL_length'] = df['Notes'].apply(filter_notes_for_DL)

In [18]:
def extract_injury(notes):
    """Return the injury description that follows 'with' in a Notes string.

    Everything after the first occurrence of 'with' is returned unchanged,
    including the whitespace that directly follows it (so
    'placed on 15 day DL with concussion' gives ' concussion'). Descriptions
    that themselves contain 'with' are kept whole instead of being cut off at
    the second occurrence.

    Parameters
    ----------
    notes : str
        Free-text transaction note.

    Returns
    -------
    str
        The text after the first 'with', or 'unknown' when the note does not
        contain 'with'.
    """
    # partition splits once, on the first 'with'; `found` is '' when absent.
    _, found, injury = notes.partition('with')
    return injury if found else 'unknown'

In [19]:
# Peek at the first few notes after the hyphen replacement.
df['Notes'].head()

0          placed on 15 day DL
1          placed on 15 day DL
2          placed on 15 day DL
3          placed on 15 day DL
4     activated from 15 day DL
Name: Notes, dtype: object

In [20]:
# Add an 'Injury_Type' column holding the injury description parsed out of
# each note by extract_injury.
df['Injury_Type'] = df['Notes'].apply(extract_injury)

In [21]:
# Frequency of each injury description, most common first.
injury_type_counts = df['Injury_Type'].value_counts()
injury_type_counts

Injury_Type
unknown                                                                                   9798
 strained left hamstring                                                                   283
 strained right hamstring                                                                  231
 right shoulder inflammation                                                               222
 concussion                                                                                185
                                                                                          ... 
 bursitis in right shoulder and internal rotation deficit                                    1
 left elbow inflammation (CBC CBS MLB)                                                       1
 left testicular fracture                                                                    1
 pulled right oblique muscle                                                                 1
 strained rotator cuff in right should

In [22]:
# Keep only the rows flagged as DL placements (Injury != 0) and report the
# frame's shape before and after the filter.
shape_before = df.shape
print('Before removing reactivations:', shape_before)
is_placement = df['Injury'].ne(0)
df = df.loc[is_placement]
print('With only placements onto the Disabled List:', df.shape)

Before removing reactivations: (18235, 8)
With only placements onto the Disabled List: (8440, 8)


In [23]:
# Write the filtered frame to disk, leaving out the row index.
output_path = 'injuries.csv'
df.to_csv(output_path, index=False)