### ETL_Project_Team_1
* Stadium Capacity & Attendance Table - Clean & Merge Data

In [1]:
# import dependencies
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

In [2]:
# Create connection to database.
# Credentials are read from the environment so the password is never stored in
# the notebook. Only the password is mandatory; the other settings fall back to
# the local defaults this project uses.
db_user = os.environ.get("MLB_DB_USER", "postgres")
db_password = os.environ["MLB_DB_PASSWORD"]  # raises KeyError if not set
db_host = os.environ.get("MLB_DB_HOST", "localhost")
db_port = os.environ.get("MLB_DB_PORT", "5432")
db_name = os.environ.get("MLB_DB_NAME", "mlb_db")

# quote_plus escapes characters such as '@' or '/' that would otherwise break
# the user:password@host part of the URL.
rds_connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# See what tables exist in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list tables.
inspect(engine).get_table_names()

In [3]:
# Read the full attendance table from the database and key it by team_key
attendance_sql = 'Select * From attendance'
attendance = pd.read_sql_query(attendance_sql, con=engine)
attendance_df = attendance.set_index("team_key")
attendance_df.head()

Unnamed: 0_level_0,team_abrv,gms_h,total_h,avg_h,pct_h,gms_r,avg_r,pct_r,gms_t,avg_t,pct_t
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,LA Dodgers,81,3974309,49065,0,81,33427,0,162,41246,0
2,St. Louis,81,3480393,42967,0,81,29447,0,162,36207,0
3,NY Yankees,79,3304404,41827,0,80,30759,0,159,36258,0
4,Chicago Cubs,81,3094865,38208,0,81,33454,0,162,35831,0
5,LA Angels,81,3023010,37321,0,81,26849,0,162,32085,0


In [4]:
# Keep only the home-game attendance columns and give them descriptive names.
# The mapping's keys are the columns to keep; its values are the new names.
home_attendance_names = {
    "gms_h": "home_games",
    "total_h": "tot_home_game_att",
    "avg_h": "avg_home_game_att",
}
home_attendance_only = attendance_df[list(home_attendance_names)]
attendance_df_clean = home_attendance_only.rename(columns=home_attendance_names)
attendance_df_clean.head()

Unnamed: 0_level_0,home_games,tot_home_game_att,avg_home_game_att
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,81,3974309,49065
2,81,3480393,42967
3,79,3304404,41827
4,81,3094865,38208
5,81,3023010,37321


In [6]:
# Read the full ballpark capacity table from the database and key it by team_key
capacity_sql = 'Select * From capacity'
venue_capacity = pd.read_sql_query(capacity_sql, con=engine)
venue_capacity_df = venue_capacity.set_index("team_key")
venue_capacity_df.head()

Unnamed: 0_level_0,index,team_longname,stadium_name,stadium_capacity,team_cityname,team_abrv,team_shortname
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,0,Los Angeles Angels,Angel Stadium,45517,Los Angeles,LA Angels,Angels
2,1,St. Louis Cardinals,Busch Stadium,45494,St. Louis,St. Louis,Cardinals
17,2,Arizona Diamondbacks,Chase Fielddouble-dagger,48686,Arizona,Arizona,Diamondbacks
13,3,New York Mets,Citi Field,41922,New York,NY Mets,Mets
10,4,Philadelphia Phillies,Citizens Bank Park,42792,Philadelphia,Philadelphia,Phillies


In [7]:
# Join the cleaned attendance frame to the capacity frame on team_key.
# how="inner" is pandas' default, spelled out here: only teams present in
# both frames are kept.
merged_attendance_capacity = attendance_df_clean.merge(
    venue_capacity_df,
    how="inner",
    on="team_key",
)
merged_attendance_capacity.head()

Unnamed: 0_level_0,home_games,tot_home_game_att,avg_home_game_att,index,team_longname,stadium_name,stadium_capacity,team_cityname,team_abrv,team_shortname
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,81,3974309,49065,7,Los Angeles Dodgers,Dodger Stadium,56000,Los Angeles,LA Dodgers,Dodgers
2,81,3480393,42967,1,St. Louis Cardinals,Busch Stadium,45494,St. Louis,St. Louis,Cardinals
3,79,3304404,41827,29,New York Yankees,Yankee Stadium,47309,New York,NY Yankees,Yankees
4,81,3094865,38208,28,Chicago Cubs,Wrigley Field,41649,Chicago,Chicago Cubs,Cubs
5,81,3023010,37321,0,Los Angeles Angels,Angel Stadium,45517,Los Angeles,LA Angels,Angels


In [8]:
# Select and organize columns: team identity first, then stadium, then attendance
report_columns = [
    "team_cityname",
    "team_shortname",
    "team_longname",
    "team_abrv",
    "stadium_name",
    "stadium_capacity",
    "home_games",
    "tot_home_game_att",
    "avg_home_game_att",
]
capacityattendance = merged_attendance_capacity[report_columns]
capacityattendance.head()

Unnamed: 0_level_0,team_cityname,team_shortname,team_longname,team_abrv,stadium_name,stadium_capacity,home_games,tot_home_game_att,avg_home_game_att
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Los Angeles,Dodgers,Los Angeles Dodgers,LA Dodgers,Dodger Stadium,56000,81,3974309,49065
2,St. Louis,Cardinals,St. Louis Cardinals,St. Louis,Busch Stadium,45494,81,3480393,42967
3,New York,Yankees,New York Yankees,NY Yankees,Yankee Stadium,47309,79,3304404,41827
4,Chicago,Cubs,Chicago Cubs,Chicago Cubs,Wrigley Field,41649,81,3094865,38208
5,Los Angeles,Angels,Los Angeles Angels,LA Angels,Angel Stadium,45517,81,3023010,37321


In [10]:
# Inspect the new table's index, column dtypes, and non-null counts
capacityattendance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 1 to 30
Data columns (total 9 columns):
team_cityname        30 non-null object
team_shortname       30 non-null object
team_longname        30 non-null object
team_abrv            30 non-null object
stadium_name         30 non-null object
stadium_capacity     30 non-null int64
home_games           30 non-null int64
tot_home_game_att    30 non-null int64
avg_home_game_att    30 non-null int64
dtypes: int64(4), object(5)
memory usage: 2.3+ KB


In [11]:
# Push the merged & cleaned table to the database.
# if_exists='replace' keeps this cell idempotent: with 'append', every re-run
# of the notebook would add the same rows to capacityattendance again.
# NOTE(review): 'replace' drops and recreates the table, so any constraints
# defined on it outside this notebook would be lost; confirm none exist.
# index=True writes the DataFrame index (team_key) as a column.
capacityattendance.to_sql(name='capacityattendance', con=engine, if_exists='replace', index=True)

In [10]:
# Double check that the new table made it into the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list tables.
inspect(engine).get_table_names()

['winslosses',
 'attendance',
 'revenue',
 'payroll',
 'salary',
 'avgticketprice',
 'looktable',
 'capacity',
 'teamsalary',
 'capacityattendance',
 'revenueprice']

In [17]:
# Export file as a CSV with the header row.
# The DataFrame index here is team_key rather than a default row counter, so
# it is written out (index=True); index=False would drop the team key from the
# file entirely.
capacityattendance.to_csv("capacityattendance.csv", index=True, header=True)