## Introduction

This is the code behind the Reddit post linked below. This notebook covers the data-preparation step.

https://old.reddit.com/r/baseball/comments/8wb4dd/oc_the_hardest_and_easiest_nohitters_in_the/

Some caveats:

* the code is a bit hacky in places to make it work, especially the web scraping
* it takes a bit of time to run

## Import libraries

We use requests/bs4/re for scraping and cleaning data and pandas for analysis

In [3]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Pull and parse list of no hitters

In [249]:
#use wikipedia: the second table on the page is taken as the list of no-hitters
wiki_tables = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_no-hitters",
    header=0,
)
no_hits = wiki_tables[1]

#clean up dates: each Date cell looks like "00000000YYYY-MM-DD-0000Month D, YYYY",
#so splitting on "0000" leaves "YYYY-MM-DD-" as the third piece
iso_dates = no_hits["Date"].str.split("0000").str[2].str.strip("-")
no_hits["date"] = pd.to_datetime(iso_dates)

#keep only post-war
post_war = no_hits["date"] >= "1945"
no_hits = no_hits[post_war]

#write to CSV for later use
csv_columns = ["Pitcher", "Team", "Opponent", "date"]
no_hits[csv_columns].to_csv("no_hits.csv", index=None)

no_hits.head(3)

Unnamed: 0,#,Date,Pitcher,Team,RS,Opponent,RA,League,Catcher,Notes,date
114,114.0,"000000001945-09-09-0000September 9, 1945",Dick Fowler,Philadelphia Athletics,1,St. Louis Browns,0,AL,Buddy Rosar (1),[notes 42],1945-09-09
115,115.0,"000000001946-04-23-0000April 23, 1946",Ed Head,Brooklyn Dodgers,5,Boston Braves,0,NL,Ferrell Anderson,,1946-04-23
116,116.0,"000000001946-04-30-0000April 30, 1946",Bob Feller (2),Cleveland Indians,1,New York Yankees,0,AL,Frankie Hayes,,1946-04-30


In [35]:
#for future runs, we only need to read data
no_hits = pd.read_csv("no_hits.csv", encoding="ISO-8859-1")
no_hits["date"] = pd.to_datetime(no_hits["date"])

#compute a list of games to lookup (index labels of the no-hitter table)
items_to_do = no_hits.index.tolist()
#scraped results, keyed by the same index labels
results = dict()

### For each no hitter, scrape the box score, getting the list of players who had ABs for the no-hit team

In [122]:
#regex for the AB/H stats: one match per batting row, capturing
#(player page link, player name, AB, H)
pat = '<a href="([^"]+)">([^<]+)</a>[^<]+</th><td class="right endpoint" data-endpoint="[^"]+" '+\
'data-stat="AB" >([0-9]+)</td><td class="right " data-stat="R" >[0-9]+</td>'+\
'<td class="right " data-stat="H" >([0-9]+)</td>'



#only games without a stored result are fetched, so this cell can be re-run after a failure
for i in [x for x in items_to_do if x not in results]:
    print(i, len(items_to_do))
    #items_to_do holds index labels, so look the game up by label rather than by position
    todo = no_hits.loc[i]
    #get date and URL for this date
    y = todo.date.date().year
    print(y)
    m = todo.date.date().month
    d = todo.date.date().day

    url = f"https://www.baseball-reference.com/boxes/?month={m}&day={d}&year={y}"

    #clean up team names
    team1 = todo['Team'].replace('(AL)','').replace('(NL)','').strip()
    team1 = team1.replace(".45s",".45's").replace("Los Angeles Angels of Anaheim","LA Angels of Anaheim")
    team2 = todo['Opponent'].replace('(AL)','').replace('(NL)','').strip()
    team2 = team2.replace(".45s",".45's").replace("Los Angeles Angels of Anaheim","LA Angels of Anaheim")

    #team names contain regex metacharacters (e.g. the dot in "St. Louis"),
    #so escape them before using them inside patterns
    team1_re = re.escape(team1)
    team2_re = re.escape(team2)

    #get box scores for date and find URL for game involving our team of interest
    text = requests.get(url).text.replace("\n"," ")
    #the two teams can be listed in either order; a missing order gets a long
    #placeholder so that min(..., key=len) prefers the order that was found
    try:
        match1 = re.findall(f"{team1_re}.*?{team2_re}", text)[0]
    except IndexError:
        match1 = "x"*10000
    try:
        match2 = re.findall(f"{team2_re}.*?{team1_re}", text)[0]
    except IndexError:
        match2 = "x"*10000

    #raises IndexError if neither ordering contained a box-score link
    url = "https://www.baseball-reference.com" + re.findall(r"/boxes/.*?\.shtml",min([match1, match2], key=len))[0]

    #account for double headers - in this case we pull both and decide which to keep based on hits
    urls = list(set([url, url.replace('1.shtml','2.shtml')]))
    res_tmp = []
    for game_url in urls:
        print(game_url)
        r = requests.get(game_url)
        txt = r.text

        #get player stats and split into two teams. Get name/AB/H for player and link to their player page
        players = [(x.start(),x.group()) for x in re.finditer(pat, txt)]
        players_min = players[0][0]
        players_max = players[-1][0]

        #positions of every mention of either team name in the page, in page order
        teams = sorted([(x.start(),x.group()) for x in re.finditer(team2_re, txt)] + [(x.start(),x.group()) for x in re.finditer(team1_re, txt)])

        #team mentions at or after the first batting row; the first and fourth of these
        #are used as the labels for the two batting tables
        #NOTE(review): relies on the page layout at scrape time - confirm before reuse
        split = [x for x in teams if x[0] >= players_min]
        tm2 = split[0]
        tm1 = split[3]

        #rows up to the first mention get the tm1 label, later rows get the tm2 label
        players = [(x[1],tm1[1]) if x[0] <= tm2[0] else (x[1],tm2[1]) for x in players]

        #keep only batters labelled as the Opponent (team2), i.e. the no-hit side
        players = [x[0] for x in players if x[1]==team2]
        players = pd.DataFrame(players).rename(columns={0:'raw'})
        players = players.raw.str.extract(pat, expand=True)
        players.columns = ['player_link','Name','AB','H']
        #copy so the column assignments below act on an independent frame
        players = players[players.AB.astype(int) > 0].copy()
        players.player_link = "https://www.baseball-reference.com" + players.player_link

        #special handling for Angels. Hacky and could break.
        if i == 158:
            players = players.iloc[9:].copy()
        if i == 160:
            players = players.iloc[:9].copy()
        tmp = pd.DataFrame(todo).T
        tmp['key'] = 1
        tmp['year'] = y
        tmp['game_link'] = game_url

        players['key'] = 1

        #the constant key turns the merge into a cross join: game info repeated per batter
        res = tmp.merge(players).drop('key',axis=1)
        res_tmp.append(res)
    results[i] = res_tmp

158 2
2011
https://www.baseball-reference.com/boxes/CLE/CLE201107270.shtml
160 2
2012
https://www.baseball-reference.com/boxes/ANA/ANA201205020.shtml


In [161]:
#collate into one data frame: flatten the per-game lists of frames, then stack them
all_frames = []
for per_game_frames in results.values():
    all_frames.extend(per_game_frames)
df = pd.concat(all_frames)

### Clean up - we pulled by date, so double-headers give us two games; keep only the game that was the no-hitter

In [162]:
#hits were scraped as text; make them numeric so they can be summed
df["H"] = df["H"].astype(int)

#total hits per box score
df_h = df.groupby("game_link")["H"].sum().reset_index()

#box scores whose batters combined for zero hits
hitless_games = df_h.loc[df_h["H"] == 0, ["game_link"]]

#inner merge on the shared game_link column drops batters from every other game
df = df.merge(hitless_games).reset_index(drop=True)

### For each player, get their season AB/BA for the year of the no-hit and the same for their career

In [202]:
def get_ba(u, y):
    """Fetch a player page and return season and career batting numbers.

    Parameters
    ----------
    u : str
        URL of the player's page (fetched with requests.get).
    y : int or str
        Season of interest; rows are matched against str(y) in the Year column.

    Returns
    -------
    tuple
        (BAs, ABs, BAc, ABc):
        BAs - AB-weighted batting average over the AL/NL rows for season y
              (NaN when those rows total zero AB),
        ABs - total AB over those rows,
        BAc - career batting average, read from the first row whose Year contains "Yr",
        ABc - career AB, read from the same row.

    Raises
    ------
    IndexError
        If the page has no table with a BA column, or that table has no "Yr" row.
    """
    raw = requests.get(u)
    #strip BOTH comment delimiters so tables wrapped in HTML comments become parseable
    raw = raw.text.replace("<!--","").replace("-->","")
    #first parsed table that has a BA column; StringIO because passing literal
    #HTML strings to read_html is deprecated in recent pandas
    stats = [x for x in pd.read_html(StringIO(raw)) if 'BA' in x.columns][0]

    #career summary row(s): Year reads like "14 Yrs"
    career = stats[stats.Year.fillna('').str.contains("Yr")]
    #NOTE(review): career BA/AB are read from the 'CS' and 'Lg' columns - presumably the
    #summary row is parsed shifted relative to the header; confirm against the page layout
    BAc = career['CS'].tolist()[0]
    ABc = career['Lg'].tolist()[0]

    stats = stats[['Year','BA','AB','Lg']]
    #a season can span several rows (one per team); keep only the AL/NL rows for year y
    stats = stats[(stats.Year==str(y))&(stats.Lg.isin(['AL','NL']))]
    ABs = stats.AB.sum()
    #AB-weighted mean of the per-row averages; avoid 0/0 when no AB were found
    BAs = (stats.BA*stats.AB).sum() / (ABs) if ABs else float("nan")

    return BAs, ABs, BAc, ABc

In [203]:
#one placeholder slot per batter row; the fetch loop fills these in by position
n_rows = len(df)
BAss, ABss, BAcs, ABcs = ([0] * n_rows for _ in range(4))
#position of the next row to fetch; the fetch loop advances it after each row
i = 0

In [None]:
#i is only advanced after a row has been stored, so re-running this cell
#after an error picks up at the row that failed
while i < len(df):
    print(i)
    row = df.iloc[i]
    BAss[i], ABss[i], BAcs[i], ABcs[i] = get_ba(row['player_link'], row['year'])
    i += 1

In [208]:
#attach the fetched season/career numbers as new columns, in row order
fetched_columns = {
    'BA_season': BAss,
    'AB_season': ABss,
    'BA_career': BAcs,
    'AB_career': ABcs,
}
for column_name, values in fetched_columns.items():
    df[column_name] = values

In [234]:
#float placeholder so the batting averages assigned below keep a float dtype
df['BA'] = 0.0
df.AB_career = df.AB_career.astype(int)

#NOTE: this cell used to rescale BA_season by 14 / AB_season ("correction for error
#in code"), presumably to compensate for an earlier version of get_ba. get_ba already
#returns the AB-weighted season average, so the rescaling corrupted BA_season (and
#compounded on every re-run of the cell); it has been removed.

#use the season average when the batter had at least 50 AB that season,
#otherwise fall back to the career average
enough_ab = df.AB_season>=50
too_few_ab = df.AB_season<50
df.loc[enough_ab,'BA'] = df.loc[enough_ab,'BA_season']
df.loc[too_few_ab,'BA'] = df.loc[too_few_ab,'BA_career']

In [13]:
#write the final per-batter table to disk
df.to_csv("no_hits_res.csv",index=None)

In [250]:
#this dataframe has one row per batter in a no-hit game: their game H/AB plus
#their season and career AB/BA
df.head()

Unnamed: 0,Pitcher,Team,Opponent,date,year,game_link,player_link,Name,AB,H,BA,BA_season,AB_season,BA_career,AB_career,id,abba
0,Dick Fowler,Philadelphia Athletics,St. Louis Browns,1945-09-09,1945,https://www.baseball-reference.com/boxes/PHA/P...,https://www.baseball-reference.com/players/b/b...,Milt Byrnes,3,0,0.249,0.249,442.0,0.274,1278,0,0.747
1,Dick Fowler,Philadelphia Athletics,St. Louis Browns,1945-09-09,1945,https://www.baseball-reference.com/boxes/PHA/P...,https://www.baseball-reference.com/players/f/f...,Lou Finney,3,0,0.274212,0.274212,430.0,0.287,4631,0,0.822635
2,Dick Fowler,Philadelphia Athletics,St. Louis Browns,1945-09-09,1945,https://www.baseball-reference.com/boxes/PHA/P...,https://www.baseball-reference.com/players/m/m...,Gene Moore,2,0,0.26,0.26,354.0,0.27,3543,0,0.52
3,Dick Fowler,Philadelphia Athletics,St. Louis Browns,1945-09-09,1945,https://www.baseball-reference.com/boxes/PHA/P...,https://www.baseball-reference.com/players/l/l...,Chet Laabs,3,0,0.239,0.239,109.0,0.262,3102,0,0.717
4,Dick Fowler,Philadelphia Athletics,St. Louis Browns,1945-09-09,1945,https://www.baseball-reference.com/boxes/PHA/P...,https://www.baseball-reference.com/players/c/c...,Mark Christman,3,0,0.277,0.277,289.0,0.253,3081,0,0.831
