# Scrape the InfoBox for each link
### https://en.wikipedia.org/wiki/List_of_accidents_and_incidents_involving_commercial_aircraft
### Store the InfoBoxes into a Dafaframe

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#wikpedia list with all accidents and incidents
url = "https://en.wikipedia.org/wiki/List_of_accidents_and_incidents_involving_commercial_aircraft"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

# Find the sections "1910s and 1920s" and "External Links" to minimize any uncessary links
start_section = soup.find("span", {"id": "1910s_and_1920s"})
end_section = soup.find("span", {"id": "External_links"})

# Empty list to store the URLs
urls = []

# Find all the UL tags before the "External links" section
ul_tags = start_section.find_next("ul")

while ul_tags and ul_tags.find_next("span") != end_section:
    # Find all the links inside the UL tag
    links = ul_tags.find_all("a")
    for link in links:
        # Get the URL of the link
        link_url = link.get("href")
        if link_url and link_url.startswith("/wiki/") and ":" not in link_url:
            # Append the URL to the list
            urls.append("https://en.wikipedia.org" + link_url)
    ul_tags = ul_tags.find_next("ul")

# empty dataframe
combined_df = pd.DataFrame()

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    infoboxes = soup.find_all('table', {'class': 'infobox'})
    if infoboxes:
        for infobox in infoboxes:
            title = soup.find("h1", class_="firstHeading").text
            for df in pd.read_html(str(infobox)):
                df['title'] = title
                combined_df = pd.concat([combined_df, df], ignore_index=True)
                #combined_df will be our dataframe to use
    else:
        print(f"No table in {url}")

No table in https://en.wikipedia.org/wiki/Martin_and_Osa_Johnson
No table in https://en.wikipedia.org/wiki/Aeroelasticity#Flutter
No table in https://en.wikipedia.org/wiki/Helen_E._Hokinson
No table in https://en.wikipedia.org/wiki/Decoto,_California
No table in https://en.wikipedia.org/wiki/Mid-air_collision
No table in https://en.wikipedia.org/wiki/Mount_Takamagahara
No table in https://en.wikipedia.org/wiki/Rahman_Dadman
No table in https://en.wikipedia.org/wiki/2002_Africa_One_Antonov_An-26_crash
No table in https://en.wikipedia.org/wiki/Polaris_Award
No table in https://en.wikipedia.org/wiki/Takeoff


### !!!need to fix this, need to include the title of the table and exclude the infobox headers!!!

In [3]:
combined_df.head()

Unnamed: 0,0,1,title,Accident,Accident.1,R101,R101.1,Bronson M. Cutting,Bronson M. Cutting.1,Queensland,...,Osh International AirportОш эл аралык аэропорту,Osh International AirportОш эл аралык аэропорту.1,Video of the crash,Video of the crash.1,Moments of the crash,Moments of the crash.1,Mid-air collision,Mid-air collision.1,Raman Pratasevich,Raman Pratasevich.1
0,Wingfoot Air Express,Wingfoot Air Express,Wingfoot Air Express crash,,,,,,,,...,,,,,,,,,,
1,Accident,Accident,Wingfoot Air Express crash,,,,,,,,...,,,,,,,,,,
2,Date,"July 21, 1919",Wingfoot Air Express crash,,,,,,,,...,,,,,,,,,,
3,Summary,In-flight fire,Wingfoot Air Express crash,,,,,,,,...,,,,,,,,,,
4,Site,"Chicago, Illinois, United States .mw-parser-ou...",Wingfoot Air Express crash,,,,,,,,...,,,,,,,,,,


### Drop the extra columns

In [4]:
df2 = combined_df.drop(combined_df.columns[3:],axis = 1)
df2.head()

Unnamed: 0,0,1,title
0,Wingfoot Air Express,Wingfoot Air Express,Wingfoot Air Express crash
1,Accident,Accident,Wingfoot Air Express crash
2,Date,"July 21, 1919",Wingfoot Air Express crash
3,Summary,In-flight fire,Wingfoot Air Express crash
4,Site,"Chicago, Illinois, United States .mw-parser-ou...",Wingfoot Air Express crash


### reorder columns

In [5]:
df2 = df2[['title', 0, 1]]
df2.head()

Unnamed: 0,title,0,1
0,Wingfoot Air Express crash,Wingfoot Air Express,Wingfoot Air Express
1,Wingfoot Air Express crash,Accident,Accident
2,Wingfoot Air Express crash,Date,"July 21, 1919"
3,Wingfoot Air Express crash,Summary,In-flight fire
4,Wingfoot Air Express crash,Site,"Chicago, Illinois, United States .mw-parser-ou..."


### changing the columns names

In [6]:
df2.rename(columns = {0:'Headers', 1:'Values'}, inplace = True)
df2

Unnamed: 0,title,Headers,Values
0,Wingfoot Air Express crash,Wingfoot Air Express,Wingfoot Air Express
1,Wingfoot Air Express crash,Accident,Accident
2,Wingfoot Air Express crash,Date,"July 21, 1919"
3,Wingfoot Air Express crash,Summary,In-flight fire
4,Wingfoot Air Express crash,Site,"Chicago, Illinois, United States .mw-parser-ou..."
...,...,...,...
31427,LATAM Perú Flight 2213,Injuries,40
31428,LATAM Perú Flight 2213,Survivors,108
31429,LATAM Perú Flight 2213,Ground casualties,Ground casualties
31430,LATAM Perú Flight 2213,Ground fatalities,2


### Store the Dataframe into an excel file

In [9]:
df2.to_excel('Data\scraped_data.xlsx', index=False)