## Scraping the registration and coordinates from Wikipedia, where they exist.
https://en.wikipedia.org/wiki/List_of_accidents_and_incidents_involving_commercial_aircraft

In [61]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [48]:
# Scrape the registration and crash-site coordinates of every accident article
# linked in bold from the Wikipedia list page, keeping only articles whose
# infobox provides all three values.
url = "https://en.wikipedia.org/wiki/List_of_accidents_and_incidents_involving_commercial_aircraft"
base_url = "https://en.wikipedia.org"

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
rows = []

# A single session lets the many article requests below reuse connections.
with requests.Session() as session:
    res = session.get(url)
    soup = BeautifulSoup(res.text, "html.parser")

    # starting section "1910s and 1920s"
    anchor = soup.find("span", id="1910s_and_1920s")
    if anchor is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(
            "Section anchor '1910s_and_1920s' not found; "
            "the page layout may have changed."
        )
    # Two levels up from the heading anchor is the element searched for links.
    section = anchor.parent.parent

    # Iterate all links that are in bold
    for bold_link in section.find_all("b"):
        link = bold_link.find("a")
        if link is None:
            continue
        href = link.get("href")
        if not href:
            # Anchor without a target: nothing to fetch.
            continue

        sub_res = session.get(base_url + href)
        sub_soup = BeautifulSoup(sub_res.text, "html.parser")

        # Find the infobox
        infobox = sub_soup.find("table", class_="infobox")
        if infobox is None:
            continue

        # Need to check if the values wanted exist
        # (`string=` is the current name of the deprecated `text=` argument).
        reg_col = infobox.find("th", string="Registration")
        lat_col = infobox.find("span", class_="latitude")
        lon_col = infobox.find("span", class_="longitude")
        if not (reg_col and lat_col and lon_col):
            continue

        # The registration value sits in the <td> following the header cell.
        reg_cell = reg_col.find_next_sibling("td")
        if reg_cell is None:
            continue

        # Extracting them
        rows.append(
            {
                "Registration": reg_cell.text.strip(),
                "Latitude": lat_col.text.strip(),
                "Longitude": lon_col.text.strip(),
            }
        )

# Build the dataframe in one step from the collected rows
df = pd.DataFrame(rows, columns=["Registration", "Latitude", "Longitude"])
df

Unnamed: 0,Registration,Latitude,Longitude
0,G-EAMA,51°34′13.5″N,0°12′11.07″W
1,F-GEAD,49°38′00″N,01°56′49″E
2,F-AEBY,49°42′36″N,2°10′19″E
3,F-AECB,51°16′16″N,0°25′41″E
4,G-EBBS,51°50′11″N,0°37′34″W
...,...,...,...
872,UR-CIC,40°58′N,24°12′E
873,HL7525,10°17′42″N,123°58′00″E
874,5H-PWF[1],01°20′07″S,31°49′33″E
875,CC-BHB[1],12°01′19″S,77°06′52″W


### Some registration values contain unwanted text (footnote markers such as `[1]` and parenthetical notes)

In [59]:
# Drop everything from the first "[" (footnote marker such as "[1]") or "("
# (parenthetical note) onwards, keeping only the leading registration text.
registration = df['Registration']
for marker in ("[", "("):
    # n=1: only the text before the first marker is needed.
    registration = registration.str.split(marker, n=1).str.get(0)

# Truncating before a marker can leave trailing whitespace (e.g. "ABC (note)"
# becomes "ABC "); strip it so the values compare equal to clean ones.
df['Registration'] = registration.str.strip()
df

Unnamed: 0,Registration,Latitude,Longitude
0,G-EAMA,51°34′13.5″N,0°12′11.07″W
1,F-GEAD,49°38′00″N,01°56′49″E
2,F-AEBY,49°42′36″N,2°10′19″E
3,F-AECB,51°16′16″N,0°25′41″E
4,G-EBBS,51°50′11″N,0°37′34″W
...,...,...,...
872,UR-CIC,40°58′N,24°12′E
873,HL7525,10°17′42″N,123°58′00″E
874,5H-PWF,01°20′07″S,31°49′33″E
875,CC-BHB,12°01′19″S,77°06′52″W


### This file will be concatenated with the master file

In [60]:
# Write the cleaned table to an Excel workbook, leaving out the row index.
df.to_excel(excel_writer='wiki.xlsx', index=False)