In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def clean_text(text):
    """Clean extracted text by removing extra whitespace and newlines."""
    if text:
        return ' '.join(text.strip().split())
    return None

def extract_metadata_from_tag(tag):
    """Extract all metadata fields from a tag that contains metadata in <strong> format."""
    if not tag:
        return None, None, None, None
    
    # Initialize metadata values
    attack_type = location = cost = people = None

    # Extract metadata fields by iterating over the contents of the tag
    for element in tag.descendants:
        if element.name == 'strong':
            text = clean_text(element.get_text()).lower()
            if 'type of attack' in text:
                attack_type = clean_text(element.next_sibling)
            elif 'location' in text:
                location = clean_text(element.next_sibling)
            elif 'cost' in text:
                cost = clean_text(element.next_sibling)
            elif 'people affected' in text:
                # Handle cases where the value is inside a link
                people = clean_text(element.find_next('a').get_text() if element.find_next('a') else element.next_sibling)
    
    return attack_type, location, cost, people

def extract_entry_data(tags):
    """Extract data for all entries by iterating through tags."""
    organizations = []
    descriptions = []
    attack_types = []
    locations = []
    people_affected = []
    costs = []
    
    org_name = None
    desc = ""
    attack_type = location = cost = people = None

    for tag in tags:
        text_content = tag.get_text(' ', strip=True)
        
        # Detect the start of a new entry based on a numbered pattern like "9. Advocate Medical Group"
        match = re.match(r'^\d+\.\s*(.+)', text_content)
        
        if match:
            # If there is an ongoing entry, save it before moving to the next one
            if org_name:
                organizations.append(org_name)
                descriptions.append(desc.strip())
                attack_types.append(attack_type)
                locations.append(location)
                people_affected.append(people)
                costs.append(cost)
                
            # Reset variables for the new entry
            org_name = clean_text(match.group(1))
            desc = ""
            attack_type = location = cost = people = None

        # Skip introductory sections that don’t match entry pattern
        elif not org_name:
            continue

        # If not a new entry, assume it's part of the description or metadata
        elif org_name:
            # Check if the tag contains metadata
            if 'Type of Attack:' in text_content or 'Location:' in text_content or 'Cost:' in text_content or 'People affected:' in text_content:
                attack_type, location, cost, people = extract_metadata_from_tag(tag)
            else:
                # Otherwise, treat it as part of the description
                desc += text_content + " "

    # Add the last entry after finishing the loop
    if org_name:
        organizations.append(org_name)
        descriptions.append(desc.strip())
        attack_types.append(attack_type)
        locations.append(location)
        people_affected.append(people)
        costs.append(cost)

    # Create DataFrame
    df = pd.DataFrame({
        'Organization': organizations,
        'Description': descriptions,
        'Type of Attack': attack_types,
        'Location': locations,
        'People Affected': people_affected,
        'Cost': costs
    })

    return df

# URL of the webpage
url = 'https://arcticwolf.com/resources/blog/top-healthcare-industry-cyberattacks/'

# Fetch the webpage content
response = requests.get(url)
response.raise_for_status()

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all relevant tags that could contain entry data (both <h3> and <p>)
tags = soup.find_all(['h3', 'p'])

# Process entries and create DataFrame
df = extract_entry_data(tags)

# Clean up any None values
df = df.fillna('')

# Display the DataFrame
df


Unnamed: 0,Organization,Description,Type of Attack,Location,People Affected,Cost
0,93 million dollars USD.,That’s the average cost of a healthcare breach...,,,,
1,HCA Healthcare,During this July 2023 breach of a Tennessee-ba...,Third-party storage breach,"Nashville, Tennessee, with nationwide impact",11 million patients,
2,Medibank,Russian-based hackers believed to have ties to...,Ransomware,"Australia, with global impact",9.7 million patients,
3,Regal Medical Group,This Southern California-based medical group w...,Ransomware,California,3.3 million patients,
4,Cerebral,Telehealth organization Cerebral made headline...,Data breach; accidental insider threat,National,3.1 million patients,
5,Shields Health Care Group,"In May of 2022, this Massachusetts-based medic...",Not disclosed,Massachusetts,2 million patients,
6,Advocate Aurora Health,With 26 hospitals across Wisconsin and Illinoi...,Third-party vendor,"Wisconsin, Illinois",3 million patients,
7,Banner Health,"In 2016, hackers used malware to breach the pa...",Malware,Arizona,3.7 million patients,$6 million
8,Medical Informatics Engineering,"In 2015, Medical Informatics Engineering (MIE)...",Brute force attack/SQL injection/Malware,Indiana,3.9 million patients,$1 million
9,Advocate Medical Group,"Between July and November 2013, Advocate Medic...",Physical theft,Illinois,4.5 million individuals,


In [39]:
import pandas as pd

# Cybersecurity Attacks Dataset
data = {
    "Organization Name": [
        "Boston Children's Hospital", "Lukas Hospital", "Hancock Regional Hospital",
        "Hollywood Presbyterian Medical Center", "Champaign-Urbana Public Health District",
        "Brno University Hospital", "Hammersmith Medicines Study"
    ],
    "Location / Region": [
        "Boston", "Germany", "United States", "Los Angeles", "United States",
        "Czech Republic", "London"
    ],
    "Cyber Attack Category": [
        "DDoS", "Social Engineering & Malware", "Malware (SamSam)", 
        "Malware (Locky) & Phishing", "Malware (NetWalker)", 
        "Ransomware", "Ransomware"
    ],
    "Attack Method": [
        "Phishing and exposed ports/services", "Social engineering", "Microsoft RDP exploit",
        "N/A", "N/A", "Windows XP vulnerability", "Ransomware-as-a-service"
    ],
     "Financial Cost": [
        "$300,000", "N/A", "$50,000", "$17,000", "$350,000", "N/A", "No ransom paid"
    ],
    "Number of People Affected": [
        "5000+", "200+", "400+", "3000+", "1500+", "10000+", "1200+"
    ],
    "Number of Products Affected": [
        None, 5, 8, 12, 3, 20, 7
    ],
    "Duration of Attack": [
        "More than 24 hours", "More than 24 hours", "More than 24 hours", 
        "Less than 24 hours", "More than 24 hours", "More than 24 hours", "More than 24 hours"
    ],
    "Year": [2014, 2016, 2018, 2016, 2020, 2020, 2020],
    "Result": [
        "Network inactive for two weeks, disrupting operations and closing the fundraising site.",
        "High-risk surgeries postponed; systems sanitized.",
        "Backup files permanently destroyed.",
        "Staff unable to access patient info or backups.",
        "Website blocked; updates moved to Facebook.",
        "Complete IT network shutdown, affecting operations and patient safety.",
        "Private patient data stolen."
    ]
   
   
}

# Creating DataFrame for the dataset
df_dataset = pd.DataFrame(data)

# Displaying the dataset
print("Cybersecurity Attacks Dataset:")
df_dataset.to_excel('data.xlsx')

# If needed, save to CSV for database-like storage
# df_dataset.to_csv("cybersecurity_attacks_dataset.csv", index=False)


Cybersecurity Attacks Dataset:
