## Final exercise of NSS Data Analytics class.
- Connect a Python notebook to a SQL database
- Answer a few of the same questions in Python that we had answered previously using SQL

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from functools import reduce   # Needed this to do 3-way merge

In [2]:
# Establish a database connection, using the actual name of your
# lahman baseball database as it appears in pgAdmin.
#
# The dialect must be spelled "postgresql": the legacy "postgres" alias was
# removed in SQLAlchemy 1.4 and raises NoSuchModuleError there.
# NOTE(review): the user/password are hardcoded in the URL; consider reading
# them from environment variables before sharing this notebook.

engine = create_engine("postgresql+psycopg2://postgres:postgres@localhost:5432/baseball")

In [3]:
# Pull the whole batting table into a DataFrame.
# `con` is the database connection pandas should use (the engine created earlier).

df_batting = pd.read_sql(sql="SELECT * FROM batting;", con=engine)
df_batting.head(n=2)

Unnamed: 0,playerid,yearid,stint,teamid,lgid,g,ab,r,h,h2b,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,


### 1. Vanderbilt:

a. Find all players in the database who played at Vanderbilt University.   
    - TABLES:
        - people (referred to as "MASTER" on lahman Readme): playerid, namegiven (first & middle), namelast
        - schools: schoolid, schoolname
        - collegeplaying:  playerid, schoolid
**DONE**

b.  Create a list showing each player’s first and last names as well as the total salary they earned in the major leagues.    
    - TABLES:  
        - people: playerid, namegiven (first & middle), namelast
        - salaries: playerid, salary (watch for duplications, there was a data problem with this in SQL)


c.  Sort this list in descending order by the total salary earned.   

d. Which Vanderbilt player earned the most money in the majors? 

In [4]:
# Load the full people table from the database into a DataFrame.

df_people = pd.read_sql(sql="SELECT * FROM people;", con=engine)
df_people.head(n=2)

Unnamed: 0,playerid,birthyear,birthmonth,birthday,birthcountry,birthstate,birthcity,deathyear,deathmonth,deathday,...,namelast,namegiven,weight,height,bats,throws,debut,finalgame,retroid,bbrefid
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,...,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,,,,...,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01


In [5]:
# List the column names available on the people frame.
df_people.columns

Index(['playerid', 'birthyear', 'birthmonth', 'birthday', 'birthcountry',
       'birthstate', 'birthcity', 'deathyear', 'deathmonth', 'deathday',
       'deathcountry', 'deathstate', 'deathcity', 'namefirst', 'namelast',
       'namegiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalgame',
       'retroid', 'bbrefid'],
      dtype='object')

In [6]:
# Load the full schools table from the database into a DataFrame.
df_schools = pd.read_sql(sql="SELECT * FROM schools;", con=engine)
df_schools.head(n=2)

Unnamed: 0,schoolid,schoolname,schoolcity,schoolstate,schoolnick
0,abilchrist,Abilene Christian University,Abilene,TX,USA
1,adelphi,Adelphi University,Garden City,NY,USA


In [7]:
# Load the full collegeplaying table (one row per player, school and year).
df_collegeplaying = pd.read_sql(sql="SELECT * FROM collegeplaying;", con=engine)
df_collegeplaying.head(n=5)

Unnamed: 0,playerid,schoolid,yearid
0,aardsda01,pennst,2001
1,aardsda01,rice,2002
2,aardsda01,rice,2003
3,abadan01,gamiddl,1992
4,abadan01,gamiddl,1993


In [8]:
# Load the full salaries table from the database into a DataFrame.
df_salaries = pd.read_sql(sql="SELECT * FROM salaries;", con=engine)
df_salaries.head(n=2)

Unnamed: 0,yearid,teamid,lgid,playerid,salary
0,1985,ATL,NL,barkele01,870000.0
1,1985,ATL,NL,bedrost01,550000.0


### a. Find all players in the database who played at Vanderbilt University.

In [9]:
# Step one: keep only the collegeplaying rows whose schoolid is 'vandy'.

df_collplay_vandy = df_collegeplaying[df_collegeplaying['schoolid'].eq('vandy')]
df_collplay_vandy.head(n=5)

Unnamed: 0,playerid,schoolid,yearid
232,alvarpe01,vandy,2006
233,alvarpe01,vandy,2007
234,alvarpe01,vandy,2008
895,baxtemi01,vandy,2004
896,baxtemi01,vandy,2005


In [10]:
# 65 rows: players can have several rows (one for each year played at school).

df_collplay_vandy.shape

(65, 3)

In [20]:
# Confirm the distinct years, plus the number of distinct players, schools and
# years in the subset (there should be just one school).

print(df_collplay_vandy['yearid'].unique())
print(df_collplay_vandy.nunique())

[2006 2007 2008 2004 2005 2009 1911 1912 1913 1983 1984 1985 1921 1922
 1923 2010 2011 1920 1997 1998 1999 2003 1977 1978 1979 1980 1915 1916
 1917 1925 1926 1927 1994 1995 1996 1976 1930 1931 2002 1970 1971 1972]
playerid    24
schoolid     1
yearid      42
dtype: int64


In [25]:
# Rename yearid on the Vanderbilt college-play frame so the column stays
# unambiguous once it is merged with salaries (which also has a yearid).

df_collplay_vandy = df_collplay_vandy.rename(columns=dict(yearid='year_at_school'))
df_collplay_vandy.head(n=2)

Unnamed: 0,playerid,schoolid,year_at_school
232,alvarpe01,vandy,2006
233,alvarpe01,vandy,2007


In [23]:
# Give the salaries year column a clearer name for the merged frame.
df_salaries = df_salaries.rename(columns=dict(yearid='salary_year'))
df_salaries.head(n=5)

Unnamed: 0,salary_year,teamid,lgid,playerid,salary
0,1985,ATL,NL,barkele01,870000.0
1,1985,ATL,NL,bedrost01,550000.0
2,1985,ATL,NL,benedbr01,545000.0
3,1985,ATL,NL,campri01,633333.0
4,1985,ATL,NL,ceronri01,625000.0


In [24]:
# Preview the first rows of the Vanderbilt college-play subset.
df_collplay_vandy.head()

Unnamed: 0,playerid,schoolid,year_at_school
232,alvarpe01,vandy,2006
233,alvarpe01,vandy,2007
234,alvarpe01,vandy,2008
895,baxtemi01,vandy,2004
896,baxtemi01,vandy,2005


In [26]:
# Row and column counts of the Vanderbilt college-play subset.
df_collplay_vandy.shape

(65, 3)

In [45]:
# Number of distinct values in each column of the Vanderbilt subset.
df_collplay_vandy.nunique()

playerid          24
schoolid           1
year_at_school    42
dtype: int64

In [46]:
# Step 1: join people -> Vanderbilt college-play rows -> salaries, all keyed on playerid.
#   * how='right' keeps every Vanderbilt college-play row;
#   * how='left' then keeps those rows even when a player has no salary record.
# Caution: college-play and salaries can each hold several rows per player, so every
# salary row is repeated once per year_at_school; de-duplicate before summing salary.

df_ppl_collvandy_sal = (
    df_people
    .merge(df_collplay_vandy, how='right', on='playerid')
    .merge(df_salaries, how='left', on='playerid')
)

In [47]:
# Preview the merged people / Vanderbilt college-play / salaries frame.
df_ppl_collvandy_sal.head()

Unnamed: 0,playerid,birthyear,birthmonth,birthday,birthcountry,birthstate,birthcity,deathyear,deathmonth,deathday,...,debut,finalgame,retroid,bbrefid,schoolid,year_at_school,salary_year,teamid,lgid,salary
0,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,2010-06-16,2016-10-01,alvap001,alvarpe01,vandy,2006,2011.0,PIT,NL,2050000.0
1,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,2010-06-16,2016-10-01,alvap001,alvarpe01,vandy,2006,2012.0,PIT,NL,2200000.0
2,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,2010-06-16,2016-10-01,alvap001,alvarpe01,vandy,2006,2013.0,PIT,NL,700000.0
3,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,2010-06-16,2016-10-01,alvap001,alvarpe01,vandy,2006,2014.0,PIT,NL,4250000.0
4,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,2010-06-16,2016-10-01,alvap001,alvarpe01,vandy,2006,2015.0,PIT,NL,5750000.0


In [48]:
# The frame has grown to 220 rows: the merge on playerid pairs every
# college-year row of a player with every salary-year row of that player.

df_ppl_collvandy_sal.shape

(220, 30)

In [49]:
# Column names of the merged people / college-play / salaries frame.
df_ppl_collvandy_sal.columns

Index(['playerid', 'birthyear', 'birthmonth', 'birthday', 'birthcountry',
       'birthstate', 'birthcity', 'deathyear', 'deathmonth', 'deathday',
       'deathcountry', 'deathstate', 'deathcity', 'namefirst', 'namelast',
       'namegiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalgame',
       'retroid', 'bbrefid', 'schoolid', 'year_at_school', 'salary_year',
       'teamid', 'lgid', 'salary'],
      dtype='object')

In [50]:
# Step 2: attach the school details (name, city, state, nick) via schoolid.

df_ppl_collvandy_sal_sch = df_ppl_collvandy_sal.merge(
    df_schools,
    how='inner',
    on='schoolid',
)

In [51]:
# Preview the merged frame now that the school columns are attached.
df_ppl_collvandy_sal_sch.head()

Unnamed: 0,playerid,birthyear,birthmonth,birthday,birthcountry,birthstate,birthcity,deathyear,deathmonth,deathday,...,schoolid,year_at_school,salary_year,teamid,lgid,salary,schoolname,schoolcity,schoolstate,schoolnick
0,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,vandy,2006,2011.0,PIT,NL,2050000.0,Vanderbilt University,Nashville,TN,USA
1,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,vandy,2006,2012.0,PIT,NL,2200000.0,Vanderbilt University,Nashville,TN,USA
2,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,vandy,2006,2013.0,PIT,NL,700000.0,Vanderbilt University,Nashville,TN,USA
3,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,vandy,2006,2014.0,PIT,NL,4250000.0,Vanderbilt University,Nashville,TN,USA
4,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,vandy,2006,2015.0,PIT,NL,5750000.0,Vanderbilt University,Nashville,TN,USA


In [52]:
# Column names after the schools merge (used to decide which columns to keep).
df_ppl_collvandy_sal_sch.columns

Index(['playerid', 'birthyear', 'birthmonth', 'birthday', 'birthcountry',
       'birthstate', 'birthcity', 'deathyear', 'deathmonth', 'deathday',
       'deathcountry', 'deathstate', 'deathcity', 'namefirst', 'namelast',
       'namegiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalgame',
       'retroid', 'bbrefid', 'schoolid', 'year_at_school', 'salary_year',
       'teamid', 'lgid', 'salary', 'schoolname', 'schoolcity', 'schoolstate',
       'schoolnick'],
      dtype='object')

In [53]:
# Keep only the columns needed for the Vanderbilt questions.
#
# An explicit keep-list (instead of dropping the unwanted columns) makes this
# cell safe to re-run: the frame is reassigned to the same name, so a second
# `drop` of the already-removed columns would raise a KeyError.
# The order below matches the column order the original drop produced.

df_ppl_collvandy_sal_sch = df_ppl_collvandy_sal_sch[['playerid', 'namefirst', 'namelast'
                                                     , 'namegiven', 'schoolid', 'year_at_school'
                                                     , 'salary_year', 'salary', 'schoolname']]
df_ppl_collvandy_sal_sch.head()

Unnamed: 0,playerid,namefirst,namelast,namegiven,schoolid,year_at_school,salary_year,salary,schoolname
0,alvarpe01,Pedro,Alvarez,Pedro Manuel,vandy,2006,2011.0,2050000.0,Vanderbilt University
1,alvarpe01,Pedro,Alvarez,Pedro Manuel,vandy,2006,2012.0,2200000.0,Vanderbilt University
2,alvarpe01,Pedro,Alvarez,Pedro Manuel,vandy,2006,2013.0,700000.0,Vanderbilt University
3,alvarpe01,Pedro,Alvarez,Pedro Manuel,vandy,2006,2014.0,4250000.0,Vanderbilt University
4,alvarpe01,Pedro,Alvarez,Pedro Manuel,vandy,2006,2015.0,5750000.0,Vanderbilt University


In [54]:
# Row and column counts after pruning the unneeded columns.
df_ppl_collvandy_sal_sch.shape

(220, 9)

### Answer to 1.a. Find all players in the database who played at Vanderbilt University.   

In [56]:
# The counts only came out right once the merges were switched to left/right joins.
# Distinct last names across the merged frame.
# NOTE(review): this lists last names only; two players sharing a last name would
# show up once. Consider de-duplicating on playerid instead.

df_ppl_collvandy_sal_sch['namelast'].unique()

array(['Alvarez', 'Baxter', 'Christiani', 'Collins', 'Cora', 'Embry',
       'Flaherty', 'Gray', 'Hendrick', 'Kata', 'Lewis', 'Madison',
       'Minor', 'Moore', 'Moss', 'Paul', 'Price', 'Prior', 'Richardson',
       'Sanderson', 'Sewell', 'Sowers', 'Willis', 'Zeid'], dtype=object)

### NEXT: 
b.  Create a list showing each player’s first and last names as well as the total salary they earned in the major leagues.    

c.  Sort this list in descending order by the total salary earned.   

d. Which Vanderbilt player earned the most money in the majors? 