In [1]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [2]:
def date_time(table_cells):
    """
    This function returns the data and time from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    """
    This function returns the booster version from the HTML  table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=''.join([booster_version for i,booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
    return out

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=[i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass


def extract_column_from_header(row):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    colunm_name = ' '.join(row.contents)
    
    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name    


In [3]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

In [4]:
data  = requests.get(static_url).text

In [5]:
soup = BeautifulSoup(data, "html.parser")

In [6]:
soup.title

<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>

In [7]:
html_tables = soup.find_all('table')

In [8]:
first_launch_table = html_tables[2]

In [9]:
column_names = []
cols = first_launch_table.find_all('th')
for row in cols:
    name = extract_column_from_header(row)
    if name is not None and len(name)>0:
        column_names.append(name)
print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [13]:
table_rows=first_launch_table.find_all('tr')
table_rows[0]

<tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date andtime ()
</th>
<th scope="col"> 
</th>
<th scope="col">Launch site
</th>
<th scope="col">Payload
</th>
<th scope="col">Payload mass
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th>
<th scope="col">Launchoutcome
</th>
<th scope="col">
</th></tr>

In [18]:
first_row =table_rows[0]
first_row.th

<th scope="col">Flight No.
</th>

In [16]:
first_row.td

In [19]:
len(html_tables)

24

In [22]:
second_launch_table = html_tables[3]
second_launch_table

<table class="wikitable plainrowheaders collapsible" style="width: 100%;">
<tbody><tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date and<br/>time (<a href="/wiki/Coordinated_Universal_Time" title="Coordinated Universal Time">UTC</a>)
</th>
<th scope="col"><a href="/wiki/List_of_Falcon_9_first-stage_boosters" title="List of Falcon 9 first-stage boosters">Version,<br/>Booster</a><sup class="reference" id="cite_ref-booster_11-1"><a href="#cite_note-booster-11">[b]</a></sup>
</th>
<th scope="col">Launch site
</th>
<th scope="col">Payload<sup class="reference" id="cite_ref-Dragon_12-1"><a href="#cite_note-Dragon-12">[c]</a></sup>
</th>
<th scope="col">Payload mass
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th>
<th scope="col">Launch<br/>outcome
</th>
<th scope="col"><a href="/wiki/Falcon_9_first-stage_landing_tests" title="Falcon 9 first-stage landing tests">Booster<br/>landing</a>
</th></tr>
<tr>
<th rowspan="2" scope="row" style="text-align:center;">8
</th>
<td>6

In [27]:
requiered_tabels = soup.find_all("table",class_="wikitable plainrowheaders collapsible")
len(requiered_tabels)

9

In [28]:
launch_dict= dict.fromkeys(column_names)
launch_dict

{'Flight No.': None,
 'Date and time ( )': None,
 'Launch site': None,
 'Payload': None,
 'Payload mass': None,
 'Orbit': None,
 'Customer': None,
 'Launch outcome': None}

In [29]:
del launch_dict['Date and time ( )']
launch_dict

{'Flight No.': None,
 'Launch site': None,
 'Payload': None,
 'Payload mass': None,
 'Orbit': None,
 'Customer': None,
 'Launch outcome': None}

In [82]:
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []

# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

launch_dict

{'Flight No.': [],
 'Launch site': [],
 'Payload': [],
 'Payload mass': [],
 'Orbit': [],
 'Customer': [],
 'Launch outcome': [],
 'Version Booster': [],
 'Booster landing': [],
 'Date': [],
 'Time': []}

In [33]:
first_required_table = requiered_tabels[0]
first_required_table_rows=first_required_table.find_all('tr')
len(first_required_table_rows)

16

In [38]:
first_required_table_rows[2]

<tr>
<td colspan="9">First flight of Falcon 9 v1.0.<sup class="reference" id="cite_ref-sfn20100604_17-0"><a href="#cite_note-sfn20100604-17">[11]</a></sup> Used a boilerplate version of Dragon capsule which was not designed to separate from the second stage.<small>(<a href="#First_flight_of_Falcon_9">more details below</a>)</small> Attempted to recover the first stage by parachuting it into the ocean, but it burned up on reentry, before the parachutes even deployed.<sup class="reference" id="cite_ref-parachute_18-0"><a href="#cite_note-parachute-18">[12]</a></sup>
</td></tr>

In [39]:
first_required_table_rows[1]

<tr>
<th rowspan="2" scope="row" style="text-align:center;">1
</th>
<td>4 June 2010,<br/>18:45
</td>
<td><a href="/wiki/Falcon_9_v1.0" title="Falcon 9 v1.0">F9 v1.0</a><sup class="reference" id="cite_ref-MuskMay2012_13-0"><a href="#cite_note-MuskMay2012-13">[7]</a></sup><br/>B0003.1<sup class="reference" id="cite_ref-block_numbers_14-0"><a href="#cite_note-block_numbers-14">[8]</a></sup>
</td>
<td><a href="/wiki/Cape_Canaveral_Space_Force_Station" title="Cape Canaveral Space Force Station">CCAFS</a>,<br/><a href="/wiki/Cape_Canaveral_Space_Launch_Complex_40" title="Cape Canaveral Space Launch Complex 40">SLC-40</a>
</td>
<td><a href="/wiki/Dragon_Spacecraft_Qualification_Unit" title="Dragon Spacecraft Qualification Unit">Dragon Spacecraft Qualification Unit</a>
</td>
<td>
</td>
<td><a href="/wiki/Low_Earth_orbit" title="Low Earth orbit">LEO</a>
</td>
<td><a href="/wiki/SpaceX" title="SpaceX">SpaceX</a>
</td>
<td class="table-success" style="background: #9EFF9E; vertical-align: middle; 

In [44]:
first_required_table_rows[0]

<tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date andtime ()
</th>
<th scope="col"> 
</th>
<th scope="col">Launch site
</th>
<th scope="col">Payload
</th>
<th scope="col">Payload mass
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th>
<th scope="col">Launchoutcome
</th>
<th scope="col">
</th></tr>

In [48]:
for i,row in enumerate(first_required_table_rows):
    if i%2 != 0:
        print("row",i,"is",row.td.text)
    

row 1 is 4 June 2010,18:45

row 3 is 8 December 2010,15:43[13]

row 5 is 22 May 2012,07:44[17]

row 7 is 8 October 2012,00:35[21]

row 9 is CRS-1 was successful, but the secondary payload was inserted into an abnormally low orbit and subsequently lost. This was due to one of the nine Merlin engines shutting down during the launch, and NASA declining a second reignition, as per ISS visiting vehicle safety rules, the primary payload owner is contractually allowed to decline a second reignition. NASA stated that this was because SpaceX could not guarantee a high enough likelihood of the second stage completing the second burn successfully which was required to avoid any risk of secondary payload's collision with the ISS.[26][27][28]

row 11 is Last launch of the original Falcon 9 v1.0 launch vehicle, first use of the unpressurized trunk section of Dragon.[29]

row 13 is First commercial mission with a private customer, first launch from Vandenberg, and demonstration flight of Falcon 9 v1.

In [50]:
for rows in first_required_table_rows:
    if rows.th:
        if rows.th.string:
            flight_number=rows.th.string.strip()
            flag=flight_number.isdigit()
        else:
            flag=False
    if flag:
        print("row"," is",flight_number)

row  is 1
row  is 1
row  is 2
row  is 2
row  is 3
row  is 3
row  is 4
row  is 4
row  is 4
row  is 5
row  is 5
row  is 6
row  is 6
row  is 7
row  is 7


In [52]:
second_required_table = requiered_tabels[1]
second_required_table_rows=second_required_table.find_all('tr')
len(second_required_table_rows)

13

In [54]:
for rows in second_required_table_rows:
    if rows.th:
        if rows.th.string:
            flight_number=rows.th.string.strip()
            flag=flight_number.isdigit()
        else:
            flag=False
    if flag:
        print("row"," is",flight_number)

row  is 8
row  is 8
row  is 9
row  is 9
row  is 10
row  is 10
row  is 11
row  is 11
row  is 12
row  is 12
row  is 13
row  is 13


In [None]:
requiered_tabels.find_all('tr')

In [57]:
for rows in requiered_tabels[1].find_all('tr'):
    if rows.th:
        if rows.th.string:
            flight_number=rows.th.string.strip()
            flag=flight_number.isdigit()
        else:
            flag=False
        if flag:
            print("row"," is",flight_number)

row  is 8
row  is 9
row  is 10
row  is 11
row  is 12
row  is 13


In [None]:
for rows in requiered_tabels[i].find_all('tr'):
    if rows.th:
        if rows.th.string:
            flight_number=rows.th.string.strip()
            flag=flight_number.isdigit()
        else:
            flag=False
        if flag:
            print("row"," is",flight_number)

In [59]:
for i in range(len(requiered_tabels)):
    for rows in requiered_tabels[i].find_all('tr'):
        if rows.th:
            if rows.th.string:
                flight_number=rows.th.string.strip()
                flag=flight_number.isdigit()
            else:
                flag=False
            if flag:
                print("row"," is",flight_number)

row  is 1
row  is 2
row  is 3
row  is 4
row  is 5
row  is 6
row  is 7
row  is 8
row  is 9
row  is 10
row  is 11
row  is 12
row  is 13
row  is 14
row  is 15
row  is 16
row  is 17
row  is 18
row  is 19
row  is 20
row  is 21
row  is 22
row  is 23
row  is 24
row  is 25
row  is 26
row  is 27
row  is 28
row  is 29
row  is 30
row  is 31
row  is 32
row  is 33
row  is 34
row  is 35
row  is 36
row  is 37
row  is 38
row  is 39
row  is 40
row  is 41
row  is 42
row  is 43
row  is 44
row  is 45
row  is 46
row  is 47
row  is 48
row  is 49
row  is 50
row  is 51
row  is 52
row  is 53
row  is 54
row  is 55
row  is 56
row  is 57
row  is 58
row  is 59
row  is 60
row  is 61
row  is 62
row  is 63
row  is 64
row  is 65
row  is 66
row  is 67
row  is 68
row  is 69
row  is 70
row  is 71
row  is 72
row  is 73
row  is 74
row  is 75
row  is 76
row  is 77
row  is 78
row  is 79
row  is 80
row  is 81
row  is 82
row  is 83
row  is 84
row  is 85
row  is 86
row  is 87
row  is 88
row  is 89
row  is 90
row  is 91
row  is 

In [149]:
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []

# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

launch_dict

{'Flight No.': [],
 'Launch site': [],
 'Payload': [],
 'Payload mass': [],
 'Orbit': [],
 'Customer': [],
 'Launch outcome': [],
 'Version Booster': [],
 'Booster landing': [],
 'Date': [],
 'Time': []}

In [156]:
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []

# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

#launch_dict

for rows in requiered_tabels[1].find_all('tr'):
    col = rows.find_all("td")
    if rows.th:
        if rows.th.string:
            flight_number=rows.th.string.strip()
            flag=flight_number.isdigit()
        else:
            flag=False
        if flag:
            print()
            print("row"," is",flight_number)
            #if flight_number not in launch_dict['Flight No.']:
            launch_dict['Flight No.'].append(str(flight_number))

            datatimelist = date_time(col[0])
            date = datatimelist[0].strip(',')
            print(date)
            # if date not in launch_dict['Date']:
            launch_dict['Date'].append(date)

            time = datatimelist[1]
            print(time)
            launch_dict['Time'].append(time)
            
            bv=booster_version(col[1])
            if not(bv):
                bv=col[1].a.string
            print("Version Booster",bv)
            launch_dict['Version Booster'].append(bv)
            
            launch_site = col[2].a.string
            print(launch_site)
            launch_dict['Launch site'].append(launch_site)
            
            payload = col[3].a.string
            print(payload)
            launch_dict['Payload'].append(payload)
            
            payload_mass = get_mass(col[4])
            print(payload_mass)
            launch_dict['Payload mass'].append(str(payload_mass))
            
            orbit = col[5].a.string
            print(orbit)
            launch_dict['Orbit'].append(orbit)
            
            customer = list(col[6].text.split())[0]
            print(customer)
            launch_dict['Customer'].append(customer)
            
            launch_outcome = list(col[7].strings)[0]
            print(launch_outcome.strip())
            launch_dict['Launch outcome'].append(launch_outcome.strip())
            
            booster_landing = landing_status(col[8])
            print('booster_landing',booster_landing)
            launch_dict['Booster landing'].append(booster_landing)
            
print()                
print(launch_dict)



row  is 8
6 January 2014
22:06
Version Booster F9 v1.1
CCAFS
Thaicom 6
3,325 kg
GTO
Thaicom
Success
booster_landing No attempt

row  is 9
18 April 2014
19:25
Version Booster F9 v1.1
Cape Canaveral
SpaceX CRS-3
2,296 kg
LEO
NASA
Success
booster_landing Controlled

row  is 10
14 July 2014
15:15
Version Booster F9 v1.1
Cape Canaveral
Orbcomm-OG2
1,316 kg
LEO
Orbcomm
Success
booster_landing Controlled

row  is 11
5 August 2014
08:00
Version Booster F9 v1.1
Cape Canaveral
AsiaSat 8
4,535 kg
GTO
AsiaSat
Success
booster_landing No attempt

row  is 12
7 September 2014
05:00
Version Booster F9 v1.1
Cape Canaveral
AsiaSat 6
4,428 kg
GTO
AsiaSat
Success
booster_landing No attempt


row  is 13
21 September 2014
05:52
Version Booster F9 v1.1
Cape Canaveral
SpaceX CRS-4
2,216 kg
LEO
NASA
Success
booster_landing Uncontrolled

{'Flight No.': ['8', '9', '10', '11', '12', '13'], 'Launch site': ['CCAFS', 'Cape Canaveral', 'Cape Canaveral', 'Cape Canaveral', 'Cape Canaveral', 'Cape Canaveral'], 'Payload'

In [157]:
df=pd.DataFrame(launch_dict)
df

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,8,CCAFS,Thaicom 6,"3,325 kg",GTO,Thaicom,Success,F9 v1.1,No attempt,6 January 2014,22:06
1,9,Cape Canaveral,SpaceX CRS-3,"2,296 kg",LEO,NASA,Success,F9 v1.1,Controlled,18 April 2014,19:25
2,10,Cape Canaveral,Orbcomm-OG2,"1,316 kg",LEO,Orbcomm,Success,F9 v1.1,Controlled,14 July 2014,15:15
3,11,Cape Canaveral,AsiaSat 8,"4,535 kg",GTO,AsiaSat,Success,F9 v1.1,No attempt,5 August 2014,08:00
4,12,Cape Canaveral,AsiaSat 6,"4,428 kg",GTO,AsiaSat,Success,F9 v1.1,No attempt\n,7 September 2014,05:00
5,13,Cape Canaveral,SpaceX CRS-4,"2,216 kg",LEO,NASA,Success,F9 v1.1,Uncontrolled,21 September 2014,05:52


In [161]:
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []

# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

requiered_tabels = soup.find_all("table",class_="wikitable plainrowheaders collapsible")

for i in range(len(requiered_tabels)):
    for rows in requiered_tabels[i].find_all('tr'):
        col = rows.find_all("td")
        if rows.th:
            if rows.th.string:
                flight_number=rows.th.string.strip()
                flag=flight_number.isdigit()
            else:
                flag=False
            if flag:
                # print()
                # print("row"," is",flight_number)
                launch_dict['Flight No.'].append(str(flight_number))

                datatimelist = date_time(col[0])
                date = datatimelist[0].strip(',')
                # print(date)
                launch_dict['Date'].append(date)

                time = datatimelist[1]
                # print(time)
                launch_dict['Time'].append(time)

                bv=booster_version(col[1])
                if not(bv):
                    bv=col[1].a.string
                # print("Version Booster",bv)
                launch_dict['Version Booster'].append(bv)

                launch_site = col[2].a.string
                # print(launch_site)
                launch_dict['Launch site'].append(launch_site)

                payload = col[3].a.string
                # print(payload)
                launch_dict['Payload'].append(payload)

                payload_mass = get_mass(col[4])
                # print(payload_mass)
                launch_dict['Payload mass'].append(str(payload_mass))

                orbit = col[5].a.string
                # print(orbit)
                launch_dict['Orbit'].append(orbit)

                customer = list(col[6].text.split())[0]
                # print(customer)
                launch_dict['Customer'].append(customer)

                launch_outcome = list(col[7].strings)[0]
                # print(launch_outcome.strip())
                launch_dict['Launch outcome'].append(launch_outcome.strip())

                booster_landing = landing_status(col[8])
                # print('booster_landing',booster_landing)
                launch_dict['Booster landing'].append(booster_landing)
              

In [162]:
df=pd.DataFrame(launch_dict)
df

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success,F9 v1.0B0003.1,Failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,Success,F9 v1.0B0004.1,Failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.0B0005.1,No attempt\n,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success,F9 v1.0B0006.1,No attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success,F9 v1.0B0007.1,No attempt\n,1 March 2013,15:10
...,...,...,...,...,...,...,...,...,...,...,...
116,117,CCSFS,Starlink,"15,600 kg",LEO,SpaceX,Success,F9 B5B1051.10,Success,9 May 2021,06:42
117,118,KSC,Starlink,"~14,000 kg",LEO,SpaceX,Success,F9 B5B1058.8,Success,15 May 2021,22:56
118,119,CCSFS,Starlink,"15,600 kg",LEO,SpaceX,Success,F9 B5B1063.2,Success,26 May 2021,18:59
119,120,KSC,SpaceX CRS-22,"3,328 kg",LEO,NASA,Success,F9 B5B1067.1,Success,3 June 2021,17:29


In [None]:
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []

# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

launch_dict

In [None]:
for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        rank = col[0].text
        country = col[1].text
        population = col[2].text.strip()
        area = col[3].text.strip()
        density = col[4].text.strip()
        population_data = population_data.append({"Rank":rank, "Country":country, "Population":population, "Area":area, "Density":density}, ignore_index=True)


In [None]:
for row in first_required_table.find_all("tr")