<div align="center">

| <h1> **Aviation Accident Network Analysis** </h1> |
---

## *Nediljka Kujundžić*

## Web scraping

In [None]:
!pip install beautifulsoup4 requests



In [49]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from datetime import datetime
import urllib.parse

# HTTP settings shared by every request the scraper makes.
_REQUEST_TIMEOUT_S = 30        # never hang forever on a stalled connection
_DETAIL_REQUEST_PAUSE_S = 0.5  # politeness delay after each detail-page request
_REQUEST_HEADERS = {
    # Wikimedia's User-Agent policy asks automated clients to identify themselves.
    'User-Agent': 'AviationAccidentNetworkAnalysis/1.0 (educational notebook; python-requests)'
}

# Oldest year heading that is accepted as a real "year" section.
_EARLIEST_YEAR = 1900

# Columns of the resulting table, in output order.
_ACCIDENT_FIELDS = (
    'year', 'date', 'location', 'aircraft_type', 'operator', 'flight_origin',
    'registration', 'destination', 'passengers', 'crew', 'occupants',
    'injuries', 'fatalities', 'survivors', 'missing', 'summary',
)

# Manufacturers looked for (in this order) in the first paragraph of a detail page.
_AIRCRAFT_MANUFACTURERS = (
    'Boeing', 'Airbus', 'McDonnell Douglas', 'Embraer',
    'Fokker', 'Cessna', 'Bombardier', 'ATR', 'Tupolev',
    'Ilyushin', 'Antonov', 'Sukhoi', 'Lockheed', 'BAe',
    'De Havilland', 'Concorde', 'Saab', 'Yakovlev', 'Dornier',
)

# Infobox header -> output field. Rules are tried top to bottom and the first
# rule with a keyword contained in the header text wins.
_INFOBOX_FIELD_RULES = (
    (('Date',), 'date'),
    (('Site',), 'location'),
    (('Aircraft type', 'Aircraft', 'Type'), 'aircraft_type'),
    (('Operator',), 'operator'),
    # Only the "Flight origin" row feeds this field; flight-number rows
    # ("IATA flight No.", ...) must not be stored as an origin.
    (('Flight origin',), 'flight_origin'),
    (('Registration',), 'registration'),
    (('Destination',), 'destination'),
    (('Passengers',), 'passengers'),
    (('Crew',), 'crew'),
    (('Occupants',), 'occupants'),
    (('Injuries',), 'injuries'),
    (('Fatalities',), 'fatalities'),
    (('Survivors',), 'survivors'),
    (('Missing',), 'missing'),
    (('Summary',), 'summary'),
)

# Fallback regular expressions applied to the one-line list entry.
_DATE_PATTERN = r'(\d{1,2}\s+\w+|\w+\s+\d{1,2}),?\s+\d{4}'
_LOCATION_PATTERNS = (
    r'near ([^\.]+)',
    r'at ([^\.]+) Airport',
    r'in ([^\.]+?)(?: when| after| while)',
)
_OPERATOR_PATTERNS = (
    r'([A-Za-z\s]+) Flight',
    r'operated by ([A-Za-z\s]+)',
    r'([A-Za-z\s]+) Airlines',
)
_MODEL_PATTERNS = (
    # Mainstream manufacturers
    r'(Boeing \d{3}(?:-\d+)?)',
    r'(Airbus A\d{3}(?:-\d+)?)',
    r'(McDonnell Douglas [A-Z0-9\-]+)',
    r'(Douglas [A-Z0-9\-]+)',
    r'(Embraer [A-Z0-9\-]+)',
    r'(Bombardier [A-Z0-9\-]+)',
    r'(Canadair [A-Z0-9\-]+)',
    r'(ATR (?:\d{2}(?:-\d+)?))',
    r'(Fokker [A-Z0-9\-]+)',
    r'(Cessna [A-Z0-9\-]+)',
    # Russian / Soviet manufacturers
    r'(Tupolev Tu-\d+)',
    r'(Ilyushin Il-\d+)',
    r'(Antonov An-\d+)',
    r'(Sukhoi [A-Z0-9\-]+)',
    # Other notable manufacturers
    r'(Lockheed [A-Z0-9\-]+)',
    r'(BAe [A-Z0-9\-]+)',
    r'(British Aerospace [A-Z0-9\-]+)',
    r'(De Havilland [A-Z0-9\-]+)',
    r'(Saab [A-Z0-9\-]+)',
    r'(Yakovlev [A-Z0-9\-]+)',
    r'(Dornier [A-Z0-9\-]+)',
    r'(Concorde)',
    # Generic phrasings for models not listed explicitly above
    r'aircraft (?:type|was)? (?:a|an) ([A-Za-z\-]+ [A-Za-z0-9\-]+)',
    r'([A-Za-z]+ [A-Za-z0-9\-]+ aircraft)',
    r'flight was (?:operated by )?(?:a|an) ([A-Za-z\-]+ [A-Za-z0-9\-]+)',
)


def _is_scrapable_year(text, latest_year):
    """Return True when `text` is a digit string between _EARLIEST_YEAR and `latest_year`."""
    return text.isdigit() and _EARLIEST_YEAR <= int(text) <= latest_year


def _collect_year_links(main_soup, latest_year):
    """Return [{'year', 'href'}, ...] for every year section of the index page, newest first."""
    year_links = []
    toc = main_soup.find('div', {'id': 'toc'})

    if not toc:
        # No table of contents in the markup: fall back to headings whose id is a year.
        print("Traženje godina u tekstu...")
        year_headings = main_soup.find_all(['h2', 'h3', 'h4'], id=re.compile(r'^\d{4}$'))
        for heading in year_headings:
            heading_id = heading.get('id')
            if heading_id and _is_scrapable_year(heading_id, latest_year):
                year_links.append({'year': heading_id, 'href': f"#{heading_id}"})
    else:
        # Table of contents present: keep the in-page links whose text is a year.
        for link in toc.find_all('a'):
            href = link.get('href', '')
            text = link.get_text().strip()
            if href and '#' in href and _is_scrapable_year(text, latest_year):
                year_links.append({'year': text, 'href': href})

    year_links.sort(key=lambda x: int(x['year']), reverse=True)
    return year_links


def _find_accident_list(year_section):
    """Return the first <ul> after `year_section`, or None if an h2/h3 is reached first."""
    current = year_section.find_next()
    while current and current.name != 'h2' and current.name != 'h3':
        if current.name == 'ul':
            return current
        current = current.find_next()
    return None


def _guess_aircraft_type_from_intro(detail_soup):
    """Return a 'Manufacturer model' string found in the page's first <p>, or None."""
    first_paragraph = detail_soup.find('p')
    if not first_paragraph:
        return None

    para_text = first_paragraph.get_text()
    for manufacturer in _AIRCRAFT_MANUFACTURERS:
        if manufacturer not in para_text:
            continue
        # Manufacturer name plus up to 30 following model characters (lazy match).
        model_regex = f"({manufacturer}\\s+[\\w\\d\\-\\.\\s]{{1,30}}?)(?:\\s|,|\\.|\\(|was)"
        model_match = re.search(model_regex, para_text)
        if model_match:
            # Cap the result at 50 characters.
            return model_match.group(1).strip()[:50]
    return None


def _read_infobox(info_table, accident_data):
    """Copy recognised infobox rows (<th>/<td> pairs) into `accident_data` in place."""
    for row in info_table.find_all('tr'):
        header = row.find('th')
        value = row.find('td')
        if header is None or value is None:
            continue

        header_text = header.get_text().strip()
        value_text = value.get_text().strip()
        for keywords, field in _INFOBOX_FIELD_RULES:
            if any(keyword in header_text for keyword in keywords):
                accident_data[field] = value_text
                break


def _scrape_detail_page(session, detail_url, accident_data):
    """Fetch one accident page and fill `accident_data` from it (best effort, never raises)."""
    try:
        detail_response = session.get(detail_url, timeout=_REQUEST_TIMEOUT_S)
        # Short pause between requests
        time.sleep(_DETAIL_REQUEST_PAUSE_S)

        if detail_response.status_code != 200:
            print(f"    Greška pri dohvatu detalja: {detail_response.status_code}")
            return

        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

        # First guess from the intro paragraph; an infobox value, if any, overrides it below.
        intro_guess = _guess_aircraft_type_from_intro(detail_soup)
        if intro_guess is not None:
            accident_data['aircraft_type'] = intro_guess

        info_table = detail_soup.find('table', {'class': ['infobox', 'vcard']})
        if info_table:
            print("    Pronađena info tablica!")
            _read_infobox(info_table, accident_data)
        else:
            print("    Info tablica nije pronađena, koristim regularne izraze iz teksta.")

    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole scrape.
        print(f"    Greška pri obradi detaljne stranice: {e}")


def _fill_missing_from_list_text(accident_data, accident_text):
    """Fill still-empty date/location/operator/aircraft_type from the list entry text."""
    if not accident_data['date']:
        date_match = re.search(_DATE_PATTERN, accident_text)
        if date_match:
            accident_data['date'] = date_match.group(0)

    if not accident_data['location']:
        for pattern in _LOCATION_PATTERNS:
            location_match = re.search(pattern, accident_text, re.IGNORECASE)
            if location_match:
                accident_data['location'] = location_match.group(1).strip()
                break

    if not accident_data['operator']:
        for pattern in _OPERATOR_PATTERNS:
            airline_match = re.search(pattern, accident_text, re.IGNORECASE)
            if airline_match:
                operator = airline_match.group(1).strip()
                # Strip filler words captured along with the name
                operator = re.sub(r'\b(flight|operated|by)\b', '', operator, flags=re.IGNORECASE).strip()
                accident_data['operator'] = operator
                break

    if not accident_data['aircraft_type']:
        for pattern in _MODEL_PATTERNS:
            model_match = re.search(pattern, accident_text, re.IGNORECASE)
            if model_match:
                model = model_match.group(1).strip()
                # Strip the trailing word "aircraft" captured by the generic patterns
                model = re.sub(r'\baircraft\b', '', model, flags=re.IGNORECASE).strip()
                accident_data['aircraft_type'] = model
                break


def scrape_aircraft_accidents(years_back=25):
    """
    Scrape commercial-aircraft accident data from Wikipedia and save it to disk.

    The index page "List of accidents and incidents involving commercial aircraft"
    is read once; for every accident listed under one of the most recent
    `years_back` year sections, the first linked page is fetched and its infobox
    is mapped onto the output columns. Fields that are still empty afterwards are
    filled, where possible, by regular expressions over the list entry itself.

    Side effects: writes 'aircraft_accidents.xlsx' and 'aircraft_accidents.csv'
    to the working directory, prints progress, and triggers a browser download
    of both files when `google.colab` is importable.

    Args:
        years_back: Number of most recent calendar years to process (default 25).

    Returns:
        pandas.DataFrame with one row per accident and the columns listed in
        `_ACCIDENT_FIELDS` (all columns are present even when no rows were found).

    Raises:
        requests.HTTPError: if the index page responds with an error status.
        requests.RequestException: if the index page cannot be fetched at all.
    """
    main_url = "https://en.wikipedia.org/wiki/List_of_accidents_and_incidents_involving_commercial_aircraft"
    base_url = "https://en.wikipedia.org"
    # Used both as the upper bound for valid year headings and for the window
    # filter, so sections for years after 2025 are picked up automatically.
    current_year = datetime.now().year

    all_accidents = []

    # One session for the whole run: shared headers and connection reuse.
    with requests.Session() as session:
        session.headers.update(_REQUEST_HEADERS)

        print("Dohvaćam glavnu stranicu...")
        response = session.get(main_url, timeout=_REQUEST_TIMEOUT_S)
        # Fail fast instead of parsing an error page into an empty result.
        response.raise_for_status()
        main_soup = BeautifulSoup(response.text, 'html.parser')

        year_links = _collect_year_links(main_soup, current_year)
        recent_years = [y for y in year_links if int(y['year']) > current_year - years_back]

        print(f"Pronađeno {len(year_links)} godina ukupno.")
        print(f"Obrađujem samo zadnjih {years_back} godina ({len(recent_years)} godina)...")

        for i, year_link in enumerate(recent_years):
            year = year_link['year']
            print(f"\nObrađujem godinu {year} ({i+1}/{len(recent_years)})...")

            # Locate the heading for this year
            year_id = year_link['href'].replace('#', '')
            year_section = main_soup.find(id=year_id)
            if not year_section:
                print(f"Ne mogu pronaći sekciju za godinu {year}")
                continue

            accident_list = _find_accident_list(year_section)
            if not accident_list:
                print(f"Ne mogu pronaći listu nesreća za godinu {year}")
                continue

            accident_items = accident_list.find_all('li')
            print(f"Pronađeno {len(accident_items)} nesreća za godinu {year}")

            for j, item in enumerate(accident_items):
                print(f"  Obrađujem nesreću {j+1}/{len(accident_items)}")

                accident_text = item.get_text()
                accident_data = dict.fromkeys(_ACCIDENT_FIELDS)
                accident_data['year'] = year

                # The first link of the entry is taken to be the accident's own page.
                accident_link = item.find('a')
                if accident_link and accident_link.get('href'):
                    detail_url = urllib.parse.urljoin(base_url, accident_link.get('href'))
                    print(f"    Dohvaćam detalje sa: {detail_url}")
                    _scrape_detail_page(session, detail_url, accident_data)

                # Whatever the detail page did not provide is guessed from the list text.
                _fill_missing_from_list_text(accident_data, accident_text)
                all_accidents.append(accident_data)

    # Passing the columns explicitly keeps the schema stable when nothing was scraped
    # (otherwise df['aircraft_type'] below would raise KeyError on an empty frame).
    df = pd.DataFrame(all_accidents, columns=list(_ACCIDENT_FIELDS))

    # Fill-rate report for the aircraft_type column
    total_rows = len(df)
    aircraft_filled = df['aircraft_type'].notna().sum()
    percentage = (aircraft_filled / total_rows * 100) if total_rows > 0 else 0
    print(f"\nStatistika popunjenosti aircraft_type: {aircraft_filled}/{total_rows} ({percentage:.1f}%)")

    # Save as Excel
    excel_filename = 'aircraft_accidents.xlsx'
    df.to_excel(excel_filename, index=False)
    print(f"\nPodaci uspješno spremljeni u {excel_filename}")

    # Save as CSV
    csv_filename = 'aircraft_accidents.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"Podaci uspješno spremljeni u {csv_filename}")

    # On Google Colab, offer both files for download
    try:
        from google.colab import files
        print(f"Preuzimam Excel datoteku {excel_filename}...")
        files.download(excel_filename)
        print(f"Preuzimam CSV datoteku {csv_filename}...")
        files.download(csv_filename)
    except ImportError:
        print("Google Colab nije detektiran. Preuzimanje datoteke neće biti dostupno.")

    return df

if __name__ == "__main__":
    import importlib
    import importlib.util
    import subprocess
    import sys

    # pip distribution name -> importable module name.
    # openpyxl is never imported directly, but pandas needs it for `to_excel`;
    # probing for it here avoids failing only after the whole scrape has run.
    required_packages = {
        "pandas": "pandas",
        "beautifulsoup4": "bs4",
        "requests": "requests",
        "openpyxl": "openpyxl",
    }
    missing_packages = [
        package
        for package, module_name in required_packages.items()
        if importlib.util.find_spec(module_name) is None
    ]

    # Install only what is actually missing
    if missing_packages:
        print("Instaliram potrebne pakete...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing_packages])
        # Make freshly installed packages visible to the running interpreter.
        importlib.invalidate_caches()

    # Run the scraper
    df = scrape_aircraft_accidents()

    # Show the first 10 rows of the collected data
    print("\nPrvih 10 redova prikupljenih podataka:")
    print(df.head(10))

Dohvaćam glavnu stranicu...
Traženje godina u tekstu...
Pronađeno 105 godina ukupno.
Obrađujem samo zadnjih 25 godina (25 godina)...

Obrađujem godinu 2025 (1/25)...
Pronađeno 7 nesreća za godinu 2025
  Obrađujem nesreću 1/7
    Dohvaćam detalje sa: https://en.wikipedia.org/wiki/Air_Busan_Flight_391
    Pronađena info tablica!
  Obrađujem nesreću 2/7
    Dohvaćam detalje sa: https://en.wikipedia.org/wiki/2025_Light_Air_Services_Beechcraft_1900_crash
    Pronađena info tablica!
  Obrađujem nesreću 3/7
    Dohvaćam detalje sa: https://en.wikipedia.org/wiki/2025_Potomac_River_mid-air_collision
    Pronađena info tablica!
  Obrađujem nesreću 4/7
    Dohvaćam detalje sa: https://en.wikipedia.org/wiki/Bering_Air_Flight_445
    Pronađena info tablica!
  Obrađujem nesreću 5/7
    Dohvaćam detalje sa: https://en.wikipedia.org/wiki/Delta_Connection_Flight_4819
    Pronađena info tablica!
  Obrađujem nesreću 6/7
    Dohvaćam detalje sa: https://en.wikipedia.org/wiki/Aerol%C3%ADnea_Lanhsa_Flight_0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Preuzimam CSV datoteku aircraft_accidents.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Prvih 10 redova prikupljenih podataka:
   year                           date  \
0  2025    28 January 2025 (2025-1-28)   
1  2025    29 January 2025 (2025-1-29)   
2  2025  January 29, 2025 (2025-01-29)   
3  2025  February 6, 2025 (2025-02-06)   
4  2025  February 17, 2025 (2025-2-17)   
5  2025     17 March 2025 (2025-03-17)   
6  2025     22 March 2025 (2025-03-22)   
7  2024      2 January 2024 (2024-1-2)   
8  2024   January 5, 2024 (2024-01-05)   
9  2024   23 January 2024 (2024-01-23)   

                                            location  \
0  Gimhae International Airport, Busan, South Kor...   
1  Near GPOC Unity Airstrip, Rubkona County, Unit...   
2  Potomac River, Washington, D.C., U.S. 38°50′33...   
3  Over the Norton Sound, Bering Sea 64°21′0″N 16...   
4  Toronto Pearson International Airport, Mississ...   
5  near Juan Manuel Gálvez International Airport,...   
6  Ceel Xabaaloow, 24 km (15 mi) southwest of Mog...   
7  Runway 34R, Haneda Airport, Tokyo, Japan 35°32

## Cleaning Data

In [50]:
# Mount Google Drive at /content/drive (requires the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
# Load a copy of the accident table from the mounted Drive folder.
# NOTE(review): a later cell reassigns `df` from the local 'aircraft_accidents.csv',
# so this frame is overwritten before it is used — confirm which file is the intended input.
df = pd.read_csv('/content/drive/My Drive/Analiza_mreza/aircraft_accidents-2.csv')

In [55]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

In [57]:
# Read the scraped accident table from the working directory.
df = pd.read_csv('aircraft_accidents.csv', encoding='utf-8')

# First look at the raw data: shape, missing values per column, column dtypes.
missing_per_column = df.isna().sum()
column_types = df.dtypes

print("Dimenzije izvornog skupa podataka:", df.shape)
print("Broj praznih vrijednosti po stupcima:", missing_per_column, sep="\n")
print("\nPregled tipova podataka:", column_types, sep="\n")

Dimenzije izvornog skupa podataka: (321, 16)
Broj praznih vrijednosti po stupcima:
year               0
date               2
location           2
aircraft_type      0
operator           0
flight_origin      8
registration       5
destination        9
passengers        24
crew               5
occupants         29
injuries         158
fatalities         6
survivors          6
missing          319
summary            4
dtype: int64

Pregled tipova podataka:
year               int64
date              object
location          object
aircraft_type     object
operator          object
flight_origin     object
registration      object
destination       object
passengers        object
crew              object
occupants         object
injuries          object
fatalities        object
survivors         object
missing          float64
summary           object
dtype: object


In [63]:
# Preview the first five rows of the loaded table.
df.head()

Unnamed: 0,year,date,location,aircraft_type,operator,flight_origin,registration,destination,passengers,crew,occupants,injuries,fatalities,survivors,missing,summary
0,2025,28 January 2025 (2025-1-28),"Gimhae International Airport, Busan, South Kor...",Airbus A321-231,Air Busan,"Gimhae International Airport, Busan, South Korea",HL7763,"Hong Kong International Airport, Hong Kong SAR",169,7,176,7.0,0,176,,"Ground fire before takeoff, under investigation"
1,2025,29 January 2025 (2025-1-29),"Near GPOC Unity Airstrip, Rubkona County, Unit...",Beechcraft 1900D,Eagle Air on behalf of Light Air Services,"GPOC Unity Airstrip, Rubkona County, Unity Sta...",5X-RHB,"Juba International Airport, Juba, South Sudan",19,2,21,1.0,20,1,,"Crashed after takeoff, under investigation"
2,2025,"January 29, 2025 (2025-01-29)","Potomac River, Washington, D.C., U.S. 38°50′33...",Sikorsky UH-60L Black Hawk,"12th Aviation Battalion, United States Army","Davison Army Airfield, Fort Belvoir, Virginia,...",00-26860[5],"Davison Army Airfield, Fort Belvoir, Virginia,...",60,3,3,,3,0,,"Mid-air collision, under investigation"
3,2025,"February 6, 2025 (2025-02-06)","Over the Norton Sound, Bering Sea 64°21′0″N 16...",Cessna 208B Grand Caravan EX,Bering Air,"Unalakleet Airport, Alaska, United States",N321BA,"Nome Airport, Alaska, United States",9,1,10,,10,0,,"Crashed into ice, under investigation"
4,2025,"February 17, 2025 (2025-2-17)","Toronto Pearson International Airport, Mississ...",Bombardier CRJ900LR,Endeavor Air as Delta Connection[a],"Minneapolis–Saint Paul International Airport, ...",N932XJ,"Toronto Pearson International Airport, Mississ...",76,4,80,21.0,0,80,,"Crashed upon landing, overturned on the runway..."


In [None]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/312.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


In [42]:
import os
from getpass import getpass

from neo4j import GraphDatabase

# Connection settings are read from the environment so that no secret is stored
# in the notebook. Set NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD before running;
# if the password is not set, it is asked for interactively.
# NOTE(review): a password was previously committed in this cell — rotate it.
URI = os.environ.get("NEO4J_URI", "bolt://44.203.174.63:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: "),
)

# Smoke test: open a session and run a trivial query.
# `driver` is intentionally left open at module level after the check.
try:
    driver = GraphDatabase.driver(URI, auth=AUTH)
    with driver.session() as session:
        result = session.run("RETURN 'Connection Successful!' AS message")
        print(result.single()["message"])
except Exception as e:
    print(f"Error connecting to Neo4j: {e}")


Connection Successful!


In [None]:
# List the column names of the loaded table.
df.columns

Index(['year', 'date', 'location', 'aircraft_type', 'operator',
       'flight_origin', 'registration', 'destination', 'passengers', 'crew',
       'occupants', 'injuries', 'fatalities', 'survivors', 'missing',
       'summary'],
      dtype='object')