In [19]:
# importing necessary packages

import datetime
import os
import pprint
import re
import time
from urllib.parse import quote

import pandas as pd
import psycopg2
import pymongo
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from sqlalchemy import create_engine


In [20]:
# defining a universal pause time
# NOTE(review): presumably a delay in seconds for pacing Selenium scrolling
# (suggested by the name and the selenium imports) — it is not referenced in
# the cells visible here; confirm it is still needed.
SCROLL_PAUSE_TIME = 0.5

In [21]:
# Location of the stations CSV.
# NOTE(review): absolute, user-specific path — this cell only runs unchanged
# on a machine that has this exact directory layout.
stations_csv_path = "C:/Users/snkri/OneDrive/Desktop/tfl_project/tube_stations_list.csv"

# Load the station list; the first CSV column is used as the DataFrame index.
stations_df = pd.read_csv(stations_csv_path, index_col=0)

# Preview the first rows.
stations_df.head()

Unnamed: 0,Station,Photograph,Line(s)[a],Local authority,Zone[b],Opened[5],Resited,Main line opened,Other name(s)[c][6][d],"Annual usage (millions, 2022)[7]",Area Served,URL
0,Acton Town,,DistrictPiccadilly,Ealing,3,1 Jul 1879,,,Mill Hill Park: 1879–1910,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,,Met.[e]Circle,City of London,1,18 Nov 1876,,,,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,,H&C[f]District,Tower Hamlets,1,6 Oct 1884,31 Oct 1938,,Commercial Road: proposed before opening,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,,Piccadilly[g],Brent,4,28 Jun 1903,,,Perivale-Alperton: 1903–1910,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,,Met.,Bucking­ham­shire,9,1 Sep 1892,,,Amersham: 1892–1922Amersham & Chesham Bois: 19...,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [22]:
# column names
# (the first CSV column was loaded as the index, so it is not part of this list)
stations_df.columns

Index(['Station', 'Photograph', 'Line(s)[a]', 'Local authority', 'Zone[b]',
       'Opened[5]', 'Resited', 'Main line opened', 'Other name(s)[c][6][d]',
       'Annual usage (millions, 2022)[7]', 'Area Served', 'URL'],
      dtype='object')

In [23]:
# Drop the columns that are not needed downstream.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes a second run of this cell a no-op rather than a
# KeyError for columns that have already been removed.
stations_df = stations_df.drop(
    columns=['Photograph', 'Resited', 'Other name(s)[c][6][d]', 'Main line opened'],
    errors='ignore',
)

stations_df.head()


Unnamed: 0,Station,Line(s)[a],Local authority,Zone[b],Opened[5],"Annual usage (millions, 2022)[7]",Area Served,URL
0,Acton Town,DistrictPiccadilly,Ealing,3,1 Jul 1879,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,Met.[e]Circle,City of London,1,18 Nov 1876,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,H&C[f]District,Tower Hamlets,1,6 Oct 1884,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,Piccadilly[g],Brent,4,28 Jun 1903,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,Met.,Bucking­ham­shire,9,1 Sep 1892,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [24]:
# Replace the column labels with snake_case names.
# The assignment is positional: each name below is applied to the column
# currently sitting at the same left-to-right position.
stations_df.columns = [
    'station',
    'lines',
    'local_authority',
    'fare_zone',
    'opened_on',
    'annual_usage',
    'area_served',
    'url',
]

# Give the index a name.
stations_df.index.name = 'station_id'

stations_df.head()

Unnamed: 0_level_0,station,lines,local_authority,fare_zone,opened_on,annual_usage,area_served,url
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Acton Town,DistrictPiccadilly,Ealing,3,1 Jul 1879,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,Met.[e]Circle,City of London,1,18 Nov 1876,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,H&C[f]District,Tower Hamlets,1,6 Oct 1884,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,Piccadilly[g],Brent,4,28 Jun 1903,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,Met.,Bucking­ham­shire,9,1 Sep 1892,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [25]:
# list of the tube lines
tube_lines = ['bakerloo', 'central', 'circle', 'district', 'h&c', 'jubilee', 'met', 'northern', 'piccadilly', 'victoria', 'waterloo & city']

# dictionary of tube lines and their code
tube_lines_dict = {'bakerloo': 'B', 'central': 'C', 'circle': 'O', 'h&c': 'H', 'jubilee': 'J', 'met': 'M', 'northern': 'N', 'piccadilly': 'P', 'victoria': 'V', 'waterloo & city': 'W', 'district': 'D'}

In [26]:
# replacing the name of the lines with their respective codes
lines = []

for line in stations_df['lines'].to_list():
        temp_s = ""
        for l in tube_lines:
                if l in line.lower():
                        temp_s = temp_s + tube_lines_dict[l]
        lines.append(temp_s)


stations_df['lines'] = lines

In [27]:
# Preview the frame; the 'lines' column now holds the concatenated line codes.
stations_df.head()

Unnamed: 0_level_0,station,lines,local_authority,fare_zone,opened_on,annual_usage,area_served,url
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Acton Town,DP,Ealing,3,1 Jul 1879,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,OM,City of London,1,18 Nov 1876,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,DH,Tower Hamlets,1,6 Oct 1884,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,P,Brent,4,28 Jun 1903,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,M,Bucking­ham­shire,9,1 Sep 1892,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [28]:
# Collect the annual_usage entries that cannot be converted to float as-is,
# so they can be inspected before cleaning.
bad_num = []
for num in stations_df['annual_usage'].to_list():
    try:
        float(num)
    except (TypeError, ValueError):
        # Only a failed conversion marks the value as unclean; any other
        # exception (e.g. KeyboardInterrupt) propagates instead of being
        # silently swallowed by a bare except.
        bad_num.append(num)

In [29]:
# Remove bracketed text (anything matching "[...]") from each annual_usage
# entry and convert the remainder to float.
# - The pattern is a raw string: "\[" in a plain literal is an invalid escape.
# - str(num) lets the cell handle values that are already numeric, so it can
#   be re-run after annual_usage has been replaced with floats.
bracketed_note = re.compile(r"\[.*?\]")
correct_num = [
    float(bracketed_note.sub("", str(num)))
    for num in stations_df['annual_usage'].to_list()
]

In [30]:
# replacing the unclean numbers with clean numbers in annual_usage
# (correct_num holds one float per row, in the same order as the frame's rows)
stations_df['annual_usage'] = correct_num
stations_df.head()

Unnamed: 0_level_0,station,lines,local_authority,fare_zone,opened_on,annual_usage,area_served,url
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Acton Town,DP,Ealing,3,1 Jul 1879,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,OM,City of London,1,18 Nov 1876,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,DH,Tower Hamlets,1,6 Oct 1884,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,P,Brent,4,28 Jun 1903,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,M,Bucking­ham­shire,9,1 Sep 1892,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [37]:
# Pull the opened_on column out as a plain Python list.
dates = stations_df['opened_on'].tolist()

In [38]:
# Convert each opening date from "D Mon YYYY" text to a DD-MM-YYYY string.
# Bracketed footnote text is removed whether or not whitespace precedes it;
# the earlier pattern required a leading space, so a value such as
# "1 Jul 1879[5]" reached strptime unstripped and raised ValueError.
footnote_re = re.compile(r'\s*\[.*?\]')

clean_dates = []
for date in dates:
    dt = footnote_re.sub('', date).strip()
    # %b matches the abbreviated month name (locale-dependent), e.g. "Jul".
    element = datetime.datetime.strptime(dt, "%d %b %Y")
    clean_dates.append(element.strftime('%d-%m-%Y'))

In [43]:
# changing the format of the dates
# (overwrites opened_on with the DD-MM-YYYY strings collected in clean_dates)
stations_df['opened_on'] = clean_dates


                 station lines    local_authority  fare_zone   opened_on  \
station_id                                                                 
0             Acton Town    DP             Ealing          3  01-07-1879   
1                Aldgate    OM     City of London          1  18-11-1876   
2           Aldgate East    DH      Tower Hamlets          1  06-10-1884   
3               Alperton     P              Brent          4  28-06-1903   
4               Amersham     M  Bucking­ham­shire          9  01-09-1892   

            annual_usage  area_served  \
station_id                              
0                   4.93        Acton   
1                   6.90    Portsoken   
2                  10.23  Whitechapel   
3                   2.30     Alperton   
4                   1.66     Amersham   

                                                          url  
station_id                                                     
0           https://www.google.com/maps?q=Acton+Tow

In [45]:
# testing the changes
# (visual check that opened_on now shows the reformatted dates)
stations_df.head()

Unnamed: 0_level_0,station,lines,local_authority,fare_zone,opened_on,annual_usage,area_served,url
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Acton Town,DP,Ealing,3,01-07-1879,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,OM,City of London,1,18-11-1876,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,DH,Tower Hamlets,1,06-10-1884,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,P,Brent,4,28-06-1903,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,M,Bucking­ham­shire,9,01-09-1892,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [46]:
# Per column: True when at least one value is missing, False otherwise.
stations_df.isna().any()

station            False
lines              False
local_authority    False
fare_zone          False
opened_on          False
annual_usage       False
area_served        False
url                False
dtype: bool

In [47]:
# creating a connection with database
# The password is read from the PGPASSWORD environment variable so that no
# credential is stored in the notebook. If the variable is not set, this
# raises KeyError immediately instead of attempting a login.
conn = psycopg2.connect(
    database='tfl_database',
    user='postgres',
    password=os.environ['PGPASSWORD'],
    host='localhost',
    port='5432'
)

In [48]:
# creating a cursor object
cursor = conn.cursor()

# creating an engine
# The password is taken from the PGPASSWORD environment variable instead of
# being written into the source, and is percent-encoded (safe='' also encodes
# '/') so that characters such as '#', '@' or '/' cannot be mistaken for URL
# delimiters when the connection string is parsed.
engine = create_engine(
    f"postgresql+psycopg2://postgres:{quote(os.environ['PGPASSWORD'], safe='')}"
    "@localhost:5432/tfl_database"
)

In [49]:
# Write the frame to PostgreSQL as table 'underground_stations'.
# if_exists='replace' drops an existing table of that name before writing.
stations_df.to_sql(
    name='underground_stations',
    con=engine,
    if_exists='replace',
)

272

In [51]:
# testing if the data is uploaded to the database
# Rows are ordered by station_id so the preview is deterministic; a SELECT
# without ORDER BY returns rows in no guaranteed order.
cursor.execute('SELECT * FROM underground_stations ORDER BY station_id')

# fetching the table
table = cursor.fetchall()

# Column names come from the cursor metadata rather than a hand-written list,
# so the check follows the table's actual schema and an empty result still
# produces a correctly labelled (empty) frame instead of a length mismatch.
column_names = [column[0] for column in cursor.description]
test_df = pd.DataFrame(table, columns=column_names).set_index('station_id')
test_df.head()

Unnamed: 0_level_0,station,lines,local_authority,fare_zone,opened_on,annual_usage,area_served,url
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Acton Town,DP,Ealing,3,01-07-1879,4.93,Acton,https://www.google.com/maps?q=Acton+Town+Under...
1,Aldgate,OM,City of London,1,18-11-1876,6.9,Portsoken,https://www.google.com/maps?q=Aldgate+Undergro...
2,Aldgate East,DH,Tower Hamlets,1,06-10-1884,10.23,Whitechapel,https://www.google.com/maps?q=Aldgate+East+Und...
3,Alperton,P,Brent,4,28-06-1903,2.3,Alperton,https://www.google.com/maps?q=Alperton+Undergr...
4,Amersham,M,Bucking­ham­shire,9,01-09-1892,1.66,Amersham,https://www.google.com/maps?q=Amersham+Undergr...


In [52]:
# committing the changes
conn.commit()

# Closing cursor
cursor.close()

# closing connection
conn.close()

# The SQLAlchemy engine keeps its own pooled connections, separate from
# `conn`; release them too so no database connection is left open.
engine.dispose()