### ETL_Project_Team_1
* Salary & Payroll Tables - Clean & Merge Data

In [1]:
# import dependencies
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [2]:
# Establish connection with database
# Credentials come from the environment so the password is never stored in
# (or committed with) the notebook. MLB_DB_PASSWORD is required; the other
# settings fall back to the local development defaults.
# NOTE: a password containing special characters (e.g. "@", "/", ":") must be
# URL-encoded before it is placed in the connection string.
db_user = os.environ.get("MLB_DB_USER", "postgres")
db_password = os.environ["MLB_DB_PASSWORD"]  # raises KeyError if unset
db_host = os.environ.get("MLB_DB_HOST", "localhost:5432")
db_name = os.environ.get("MLB_DB_NAME", "mlb_db")
rds_connection_string = f"{db_user}:{db_password}@{db_host}/{db_name}"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [24]:
# see what tables exist in database
# Engine.table_names() is deprecated (SQLAlchemy 1.4) and removed in 2.0;
# the Inspector API returns the same list of table names.
inspect(engine).get_table_names()

['winslosses',
 'attendance',
 'revenue',
 'payroll',
 'salary',
 'avgticketprice',
 'looktable',
 'capacityattendance',
 'capacity',
 'revenueprice']

In [3]:
# Load every row of the salary table, then key the frame by team_key.
# read_sql_query already returns a DataFrame, so it is indexed directly.
salary = pd.read_sql_query(sql='Select * From salary', con=engine)
salary_df = salary.set_index(keys="team_key")
salary_df.head(5)

Unnamed: 0_level_0,rank,team_shortname,opening_day,current,diff,avg_salary,median
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,6,Angels,177328583,177328583,--,5373593,1400000
9,8,Astros,165653000,165653000,--,6135296,4500000
24,23,Athletics,102545000,102545000,--,3107424,1400000
22,28,Blue Jays,71228671,71228671,--,2225896,1100000
12,17,Braves,117855753,117855753,--,4063991,1250000


In [4]:
# Keep only the team name and salary figures, and give the generic
# "current" / "median" columns salary-specific names.
salary_columns = ["team_shortname", "current", "avg_salary", "median"]
salary_renames = {"current": "current_salary", "median": "median_salary"}
salary_df_clean = salary_df[salary_columns].rename(columns=salary_renames)
salary_df_clean.head(5)

Unnamed: 0_level_0,team_shortname,current_salary,avg_salary,median_salary
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,Angels,177328583,5373593,1400000
9,Astros,165653000,6135296,4500000
24,Athletics,102545000,3107424,1400000
22,Blue Jays,71228671,2225896,1100000
12,Braves,117855753,4063991,1250000


In [5]:
# Load every row of the payroll table, then key the frame by team_key.
# read_sql_query already returns a DataFrame, so it is indexed directly.
payroll = pd.read_sql_query(sql='Select * From payroll', con=engine)
payroll_df = payroll.set_index(keys="team_key")
payroll_df.head(5)

Unnamed: 0_level_0,team_longname,payroll_openingday
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1
17,Arizona Diamondbacks,107.58
12,Atlanta Braves,110.53
28,Baltimore Orioles,67.37
7,Boston Red Sox,213.19
4,Chicago Cubs,208.2


In [6]:
# Inner-join the cleaned salary frame with the payroll frame on team_key
# (the index name both frames were given via set_index).
merged_salary = salary_df_clean.merge(payroll_df, how="inner", on="team_key")
merged_salary.head(5)

Unnamed: 0_level_0,team_shortname,current_salary,avg_salary,median_salary,team_longname,payroll_openingday
team_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,Angels,177328583,5373593,1400000,Los Angeles Angels,167.46
9,Astros,165653000,6135296,4500000,Houston Astros,177.44
24,Athletics,102545000,3107424,1400000,Oakland Athletics,96.83
22,Blue Jays,71228671,2225896,1100000,Toronto Blue Jays,66.63
12,Braves,117855753,4063991,1250000,Atlanta Braves,110.53


In [30]:
# push the new table to the database
# if_exists='replace' rebuilds `teamsalary` on every run so this cell is
# idempotent; with 'append', each re-run (or Restart & Run All) inserted
# another full copy of the rows. index=True keeps team_key as a column.
merged_salary.to_sql(name='teamsalary', con=engine, if_exists='replace', index=True)

In [7]:
# Summarize the merged frame: row count, column dtypes and non-null counts.
merged_salary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 5 to 3
Data columns (total 6 columns):
team_shortname        30 non-null object
current_salary        30 non-null int64
avg_salary            30 non-null int64
median_salary         30 non-null int64
team_longname         30 non-null object
payroll_openingday    30 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 1.6+ KB


In [8]:
# double check that the new table made it into the database
# Engine.table_names() is deprecated (SQLAlchemy 1.4) and removed in 2.0;
# the Inspector API returns the same list of table names.
inspect(engine).get_table_names()

['winslosses',
 'revenue',
 'payroll',
 'salary',
 'avgticketprice',
 'looktable',
 'teamsalary',
 'capacity',
 'attendance',
 'revenueprice',
 'capacityattendance']

In [9]:
# Export file as a CSV, with the header.
# team_key is the DataFrame index here (not a throwaway RangeIndex), so it is
# written out as the first column; with index=False the CSV lost the team key,
# unlike the `teamsalary` database table, which is written with index=True.
merged_salary.to_csv("teamsalary.csv", index=True, header=True)