In [86]:
import os
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

In [87]:
from dotenv import load_dotenv
load_dotenv()

True

In [88]:
username = os.getenv('POSTGRES_USERNAME')
password = os.getenv('POSTGRES_PASSWORD')


postgres_connection_string = "postgres://{username}:{password}@{host}:{port}/{database}?gssencmode=disable".format(
    username=username,
    password=password,
    host="localhost",
    port="5432",
    database="lahman_baseball"
)

In [89]:
engine = create_engine(postgres_connection_string)

In [106]:
people_sql = "SELECT * FROM people;"
salaries_sql = "SELECT * FROM salaries;"
homegames_sql = "SELECT * FROM homegames;"
parks_sql = "SELECT * FROM parks;"
teams_sql = "SELECT * FROM teams;"
batting_sql = "SELECT * FROM batting;"
fielding_sql = "SELECT * FROM fielding;"
pitching_sql = "SELECT * FROM pitching;"
awards_sql = "SELECT * FROM awardsplayers;"

In [107]:
people_df = pd.read_sql(people_sql, con=engine)
salaries_df = pd.read_sql(salaries_sql, con=engine)
homegames_df = pd.read_sql(homegames_sql, con=engine)
parks_df = pd.read_sql(parks_sql, con=engine)
teams_df = pd.read_sql(teams_sql, con=engine)
batting_df = pd.read_sql(batting_sql, con=engine)
fielding_df = pd.read_sql(fielding_sql, con=engine)
pitching_df = pd.read_sql(pitching_sql, con=engine)
awardsplayers_df = pd.read_sql(awards_sql, con=engine)

# Case 1, Jackie Robinson

#### When was Jackie Robinson signed to the Dodgers and broke the color barrier?

In [109]:
people_df['debut'][people_df['playerid'] == 'robinja02']

14632    1947-04-15
Name: debut, dtype: object

### ^^^Jackie Robinson made his debut in 1947
---

#### Let's see what awards he has won

In [108]:
awardsplayers_df[awardsplayers_df['playerid'] == 'robinja02']

Unnamed: 0,playerid,awardid,yearid,lgid,tie,notes
1799,robinja02,Rookie of the Year,1947,ML,,
1836,robinja02,Baseball Magazine All-Star,1948,NL,,2B
1878,robinja02,Baseball Magazine All-Star,1949,ML,,2B
1890,robinja02,Baseball Magazine All-Star,1949,NL,,2B
1901,robinja02,Most Valuable Player,1949,NL,,
1905,robinja02,TSN All-Star,1949,ML,,2B
1944,robinja02,Baseball Magazine All-Star,1950,NL,,2B
1959,robinja02,TSN All-Star,1950,ML,,2B
1980,robinja02,TSN All-Star,1951,ML,,2B
2001,robinja02,TSN All-Star,1952,ML,,2B


### ^^^As it turns out, Jackie Robinson won Rookie of the Year in 1947
---

In [93]:
homegames_df.columns

Index(['year', 'league', 'team', 'park', 'span_first', 'span_last', 'games',
       'openings', 'attendance'],
      dtype='object')

In [94]:
dodgers_attendance_mask = (homegames_df['year'] >= 1937) & (homegames_df['year'] <= 1957) & (homegames_df['team'] == 'BRO')

In [95]:
dodgers_attendance = homegames_df.loc[dodgers_attendance_mask]

In [96]:
teams_df.columns

Index(['yearid', 'lgid', 'teamid', 'franchid', 'divid', 'rank', 'g', 'ghome',
       'w', 'l', 'divwin', 'wcwin', 'lgwin', 'wswin', 'r', 'ab', 'h', 'h2b',
       'h3b', 'hr', 'bb', 'so', 'sb', 'cs', 'hbp', 'sf', 'ra', 'er', 'era',
       'cg', 'sho', 'sv', 'ipouts', 'ha', 'hra', 'bba', 'soa', 'e', 'dp', 'fp',
       'name', 'park', 'attendance', 'bpf', 'ppf', 'teamidbr',
       'teamidlahman45', 'teamidretro'],
      dtype='object')

In [97]:
dodgers_teams_mask = (teams_df['yearid'] >= 1937) & (teams_df['yearid'] <= 1957) & (teams_df['teamid'] == 'BRO')

In [98]:
dodgers_teams = teams_df.loc[dodgers_teams_mask]

In [99]:
dodgers_teams.columns

Index(['yearid', 'lgid', 'teamid', 'franchid', 'divid', 'rank', 'g', 'ghome',
       'w', 'l', 'divwin', 'wcwin', 'lgwin', 'wswin', 'r', 'ab', 'h', 'h2b',
       'h3b', 'hr', 'bb', 'so', 'sb', 'cs', 'hbp', 'sf', 'ra', 'er', 'era',
       'cg', 'sho', 'sv', 'ipouts', 'ha', 'hra', 'bba', 'soa', 'e', 'dp', 'fp',
       'name', 'park', 'attendance', 'bpf', 'ppf', 'teamidbr',
       'teamidlahman45', 'teamidretro'],
      dtype='object')

In [100]:
dodgers_record = dodgers_teams[['yearid', 'rank', 'ghome', 'w', 'l', 'lgwin', 'wswin', 'attendance']]

In [104]:
dodgers_attendance[['year', 'attendance', 'games']]

Unnamed: 0,year,attendance,games
1101,1937,454551,76
1118,1938,728519,74
1136,1939,1048457,78
1153,1940,969439,81
1170,1941,975162,79
1187,1942,882336,79
1204,1943,692492,77
1221,1944,686971,77
1238,1945,1069629,78
1255,1946,1830974,79


In [103]:
dodgers_record

Unnamed: 0,yearid,rank,ghome,w,l,lgwin,wswin,attendance
976,1937,6,76.0,62,91,N,N,482481.0
992,1938,7,74.0,69,80,N,N,663087.0
1008,1939,3,78.0,84,69,N,N,955668.0
1024,1940,2,81.0,88,65,N,N,975978.0
1040,1941,1,79.0,100,54,Y,N,1214910.0
1056,1942,2,79.0,104,50,N,N,1037765.0
1072,1943,3,77.0,81,72,N,N,661739.0
1088,1944,7,77.0,63,91,N,N,605905.0
1104,1945,3,78.0,87,67,N,N,1059220.0
1120,1946,2,79.0,96,60,N,N,1796824.0


### How did he do in that first season?

In [8]:
batting_1947 = batting_df[batting_df['yearid'] == 1947]

In [9]:
jackie_robinson = batting_1947[batting_1947['playerid'] == 'robinja02']

### Let's look at how he performed compared to his teammates

In [12]:
dodgers_bat_1947 = batting_1947[batting_1947['teamid'] == 'BRO']

In [15]:
dodgers_bat_1947['slugging'] = (((dodgers_bat_1947['h']) + (2 * dodgers_bat_1947['h2b']) + (3 * dodgers_bat_1947['h3b']) + (4 * dodgers_bat_1947['hr'])) / (dodgers_bat_1947['ab'])).round(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dodgers_bat_1947['slugging'] = (((dodgers_bat_1947['h']) + (2 * dodgers_bat_1947['h2b']) + (3 * dodgers_bat_1947['h3b']) + (4 * dodgers_bat_1947['hr'])) / (dodgers_bat_1947['ab'])).round(3)


In [17]:
dodgers_bat_1947.sort_values(by = 'slugging', ascending = False)

Unnamed: 0,playerid,yearid,stint,teamid,lgid,g,ab,r,h,h2b,...,sb,cs,bb,so,ibb,hbp,sh,sf,gidp,slugging
32748,bankhda01,1947,1,BRO,NL,6,4,2,1,0,...,0.0,,0,1.0,,1.0,0.0,,0.0,1.25
33149,ramsdwi01,1947,1,BRO,NL,2,1,0,1,0,...,0.0,,0,0.0,,0.0,0.0,,0.0,1.0
33079,meltoru01,1947,1,BRO,NL,4,1,0,1,0,...,0.0,,0,0.0,,0.0,0.0,,0.0,1.0
33040,lunddo01,1947,1,BRO,NL,11,20,5,6,2,...,0.0,,3,7.0,,0.0,0.0,,0.0,0.9
32941,higbeki01,1947,1,BRO,NL,4,5,0,1,1,...,0.0,,0,1.0,,0.0,0.0,,0.0,0.6
32881,furilca01,1947,1,BRO,NL,124,437,61,129,24,...,7.0,,34,24.0,,1.0,4.0,,17.0,0.526
33259,vaughar01,1947,1,BRO,NL,64,126,24,41,5,...,4.0,,27,11.0,,0.0,0.0,,2.0,0.516
32938,hermage01,1947,1,BRO,NL,79,189,36,52,7,...,5.0,,28,7.0,,3.0,5.0,,3.0,0.513
33151,reesepe01,1947,1,BRO,NL,142,476,81,135,24,...,7.0,,104,67.0,,2.0,8.0,,7.0,0.511
33270,walkedi02,1947,1,BRO,NL,148,529,77,162,31,...,6.0,,97,26.0,,1.0,10.0,,9.0,0.509


In [134]:
((jackie_robinson['h']) + (2 * jackie_robinson['h2b']) + (3 * jackie_robinson['h3b']) + (4 * jackie_robinson['hr'])) / (jackie_robinson['ab'])

33167    0.508475
dtype: float64

In [112]:
#use this to determine the earliest year sacrifice flies were recorded
#sf_recording_begins = batting_df[['yearid', 'sf']]
#sf_recording_begins = sf_recording_begins.dropna()
#sf_recording_begins.loc[(sf_recording_begins.sf >= 1)].sort_values(by = 'yearid')

In [103]:
#this is a good way to see 
#sf_recording_begins = batting_df.loc[(batting_df.sf > 0)]
#sf_recording_begins

### Accolades? (Hall of Fame, All Star)

In [9]:
mcgwire_salary = salaries_df[salaries_df['playerid'] == 'mcgwima01']

In [10]:
bonds_salary = salaries_df[salaries_df['playerid'] == 'bondsba01']

In [11]:
griffey_salary = salaries_df[salaries_df['playerid'] == 'griffke02']

In [12]:
nryan_salary = salaries_df[salaries_df['playerid'] == 'ryanno01']

In [13]:
#teams_df[teams_df['teamid'] == 'BSN']

In [14]:
#teams_df[teams_df['name'] == 'Chicago White Sox']

In [15]:
#homegames_df[homegames_df['year'] == 1918]

---
### Let's look at our first example, Jackie Robinson

In [87]:
bk_dodgers_homegames = homegames_df[homegames_df['team'] == 'BRO']

In [126]:
jackie_robinson_id = people_df[people_df['playerid'] == 'robinja02']

In [138]:
bk_dodgers_teams = teams_df[(teams_df['teamid'] == 'BRO') & (teams_df['name'] == 'Brooklyn Dodgers')]

In [143]:
bk_dodgers_teams['lgwin'].value_counts()

N    21
Y     7
Name: lgwin, dtype: int64

In [144]:
bk_dodgers_teams['wswin'].value_counts()

N    27
Y     1
Name: wswin, dtype: int64

In [66]:
sf_giants_homegames = homegames_df[homegames_df['team'] == 'SFN']

In [67]:
sf_giants_homegames['attendance'].mean().round(2)

1895639.36

In [68]:
sf_giants_homegames['mean_diff'] = sf_giants_homegames['attendance'] - 1895639.36

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_giants_homegames['mean_diff'] = sf_giants_homegames['attendance'] - 1895639.36


In [72]:
np.sqrt((sf_giants_homegames['mean_diff']**2).mean()).round(2)

947710.38

In [80]:
sf_giants_homegames.head()

Unnamed: 0,year,league,team,park,span_first,span_last,games,openings,attendance,mean_diff
1463,1958,NL,SFN,SFO01,1958-04-15,1958-09-28,77,75,1272857,-622782.36
1479,1959,NL,SFN,SFO01,1959-04-14,1959-09-20,77,77,1421630,-474009.36
1495,1960,NL,SFN,SFO02,1960-04-12,1960-10-02,77,76,1796356,-99283.36
1513,1961,NL,SFN,SFO02,1961-04-11,1961-09-20,77,74,1391251,-504388.36
1533,1962,NL,SFN,SFO02,1962-04-10,1962-10-01,82,77,1590136,-305503.36


In [57]:
#np.sqrt((sf_giants_homegames['attendance']**2).mean())

## Clean up the connection!

In [23]:
#engine.dispose()