# ETL-Project-Stadiums

# Extract

In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
# URL of page to be scraped
url = 'https://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums'

tables = pd.read_html(url)
tables

[                                          0
 0        Denotes stadium with a fixed roof.
 1  Denotes stadium with a retractable roof.,
     Image                        Name  Capacity                     Location  \
 0     NaN           Allegiant Stadium     65000             Paradise, Nevada   
 1     NaN           Arrowhead Stadium     76416        Kansas City, Missouri   
 2     NaN                AT&T Stadium     80000             Arlington, Texas   
 3     NaN     Bank of America Stadium     75523    Charlotte, North Carolina   
 4     NaN           CenturyLink Field     69000          Seattle, Washington   
 5     NaN  Empower Field at Mile High     76125             Denver, Colorado   
 6     NaN                  FedExField     82000           Landover, Maryland   
 7     NaN         FirstEnergy Stadium     67895              Cleveland, Ohio   
 8     NaN                  Ford Field     65000            Detroit, Michigan   
 9     NaN            Gillette Stadium     66829    Fo

In [3]:
type(tables)

list

In [4]:
len(tables)

16

In [5]:
tables[1].head()

Unnamed: 0,Image,Name,Capacity,Location,Surface,Roof type,Team(s),Opened,Ref(s)
0,,Allegiant Stadium,65000,"Paradise, Nevada",Grass,Fixed,Las Vegas Raiders,2020,[3]
1,,Arrowhead Stadium,76416,"Kansas City, Missouri",Bermuda grass,Open,Kansas City Chiefs,1972,[4]
2,,AT&T Stadium,80000,"Arlington, Texas",Hellas Matrix Turf,Retractable,Dallas Cowboys,2009,[5][6]
3,,Bank of America Stadium,75523,"Charlotte, North Carolina",Bermuda grass,Open,Carolina Panthers,1996,[7]
4,,CenturyLink Field,69000,"Seattle, Washington",FieldTurf Revolution 360[8],Open,Seattle Seahawks,2002,[9]


# Transform

In [6]:
df=tables[1]

In [7]:
# new data frame with split value columns 
new = df["Location"].str.split(",", n = 1, expand = True) 
  
# making separate first name column from new data frame 
df["City"]= new[0] 
  
# making separate last name column from new data frame 
df["State"]= new[1] 


In [8]:
df.head()

Unnamed: 0,Image,Name,Capacity,Location,Surface,Roof type,Team(s),Opened,Ref(s),City,State
0,,Allegiant Stadium,65000,"Paradise, Nevada",Grass,Fixed,Las Vegas Raiders,2020,[3],Paradise,Nevada
1,,Arrowhead Stadium,76416,"Kansas City, Missouri",Bermuda grass,Open,Kansas City Chiefs,1972,[4],Kansas City,Missouri
2,,AT&T Stadium,80000,"Arlington, Texas",Hellas Matrix Turf,Retractable,Dallas Cowboys,2009,[5][6],Arlington,Texas
3,,Bank of America Stadium,75523,"Charlotte, North Carolina",Bermuda grass,Open,Carolina Panthers,1996,[7],Charlotte,North Carolina
4,,CenturyLink Field,69000,"Seattle, Washington",FieldTurf Revolution 360[8],Open,Seattle Seahawks,2002,[9],Seattle,Washington


In [18]:
df['Teams']=df['Team(s)']
stadiums=df[['Name', 'Capacity','City', 'State','Surface','Roof type','Teams', 'Opened']]
stadiums.head()

Unnamed: 0,Name,Capacity,City,State,Surface,Roof type,Teams,Opened
0,Allegiant Stadium,65000,Paradise,Nevada,Grass,Fixed,Las Vegas Raiders,2020
1,Arrowhead Stadium,76416,Kansas City,Missouri,Bermuda grass,Open,Kansas City Chiefs,1972
2,AT&T Stadium,80000,Arlington,Texas,Hellas Matrix Turf,Retractable,Dallas Cowboys,2009
3,Bank of America Stadium,75523,Charlotte,North Carolina,Bermuda grass,Open,Carolina Panthers,1996
4,CenturyLink Field,69000,Seattle,Washington,FieldTurf Revolution 360[8],Open,Seattle Seahawks,2002


In [14]:
stadiums.dtypes

Name         object
Capacity      int64
City         object
State        object
Surface      object
Roof type    object
Team(s)      object
Opened       object
dtype: object

# Load

In [15]:
#Import additional dependancies
from sqlalchemy import create_engine
from sqlalchemy.types import Integer,BigInteger
import psycopg2

In [16]:
#Session Engine
engine = create_engine('postgresql://postgres:postgres@localhost:5432/nfl_db')
connection = engine.connect()

In [None]:
# #Establishing the connection
# conn = psycopg2.connect(
#    database="nfl_db", user='postgres', password='postgres', host='127.0.0.1', port= '5432'
# )
# #Creating a cursor object using the cursor() method
# cursor = conn.cursor()

# #Doping EMPLOYEE table if already exists.
# cursor.execute("DROP TABLE IF EXISTS salary")

# #Creating table as per requirement
# sql ='''CREATE TABLE salary(
#    Rank INT NOT NULL,
#    First_Name CHAR(30) NOT NULL,
#    Last_Name CHAR(30)NOT NULL,
#    POS CHAR(2) NOT NULL,
#    TM CHAR(3) NOT NULL,
#    Salary INT NOT NULL,
#    Primary Key (Rank)
# )'''
# cursor.execute(sql)
# print("Table created successfully........")

# #Closing the connection
# conn.close()

In [20]:
#Load dataframe to PostgreSQL
stadiums.to_sql('stadiums', engine,
               dtype={"Capacity": Integer()},
                      index=False)
