In [2]:
import pandas as pd
import re
from collections import Counter

In [5]:
# Read the semicolon-delimited hacktivist_messages.csv file into a DataFrame
data = pd.read_csv("hacktivist_messages.csv", sep=";")

## Question A

In [12]:
# Country names searched for in the messages (EU + Nordic).
# The order is significant: countries with equal counts are reported in the
# order they appear here, so it is kept exactly as originally written.
eu_nordic_countries = [
    "Denmark", "Sweden", "Finland", "Norway", "Iceland",
    "Poland", "Germany", "France", "Italy", "Spain",
    "Netherlands", "Belgium", "Lithuania", "Latvia", "Estonia",
    "Ireland", "Portugal", "Greece", "Luxembourg", "Czech Republic",
    "Slovakia", "Slovenia", "Hungary", "Austria", "Romania",
    "Bulgaria", "Croatia", "Cyprus", "Malta",
]


In [15]:
def clean_and_concatenate_text(data, column="Text"):
    """Return every value of ``data[column]`` as one lowercase string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the message text.
    column : str, default "Text"
        Name of the column to read.

    Returns
    -------
    str
        The column's values, lowercased and joined with single spaces.
        Missing values contribute an empty string.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``data``.
    """
    messages = data[column].fillna('')
    # Cast to str before lowercasing: on an object column ``.str.lower()``
    # returns NaN for non-string cells (e.g. a purely numeric message), and
    # ``str.join`` would then raise TypeError on that float.
    return ' '.join(messages.astype(str).str.lower())

In [17]:
def count_mentions(text, items):
    """Count whole-word, case-insensitive occurrences of each item in ``text``.

    Parameters
    ----------
    text : str
        Text to search. It does not need to be lowercased beforehand.
    items : iterable of str
        Words or phrases to look for.

    Returns
    -------
    collections.Counter
        Maps each item (in its original spelling) to its number of matches.
        Items with no match are left out.
    """
    # Lowercase the text once, up front, so the match is case-insensitive even
    # when the caller passes raw text; each item is lowercased below to match.
    haystack = text.lower()
    mentions = Counter()

    for item in items:
        # re.escape treats the item as literal text; the \b anchors restrict
        # matches to whole words so "poland" does not match inside "polandia".
        pattern = r"\b{}\b".format(re.escape(item.lower()))
        item_mentions = len(re.findall(pattern, haystack))
        if item_mentions > 0:
            mentions[item] = item_mentions

    return mentions

# Build the lowercase corpus from the loaded messages, then tally countries
cleaned_text = clean_and_concatenate_text(data)
country_mentions = count_mentions(cleaned_text, eu_nordic_countries)

# most_common() walks the counter from the highest count to the lowest
for country, count in country_mentions.most_common():
    print(f"{country}: {count} mentions")


Poland: 176 mentions
Spain: 159 mentions
Czech Republic: 146 mentions
Finland: 115 mentions
Germany: 102 mentions
Italy: 102 mentions
Lithuania: 99 mentions
Latvia: 96 mentions
Denmark: 76 mentions
France: 76 mentions
Netherlands: 73 mentions
Sweden: 61 mentions
Estonia: 54 mentions
Luxembourg: 44 mentions
Norway: 42 mentions
Belgium: 42 mentions
Romania: 40 mentions
Slovakia: 29 mentions
Bulgaria: 27 mentions
Slovenia: 23 mentions
Greece: 20 mentions
Austria: 17 mentions
Iceland: 8 mentions
Hungary: 7 mentions
Croatia: 7 mentions
Cyprus: 6 mentions
Portugal: 5 mentions
Ireland: 3 mentions
Malta: 3 mentions


## Question B

In [18]:
# Critical-infrastructure sector names searched for in the messages.
# Order is preserved: sectors with equal counts are reported in this order.
critical_infrastructure_sectors = [
    "Energy", "Transport", "Banking", "Financial Market",
    "Health", "Drinking Water", "Waste Water",
    "Digital Infrastructure", "Public Administration",
    "Space", "Food Production",
]

In [19]:
def count_sector_mentions(text, sectors):
    """Count whole-word, case-insensitive occurrences of each sector name.

    Parameters
    ----------
    text : str
        Text to search. It does not need to be lowercased beforehand.
    sectors : iterable of str
        Sector names (single words or phrases) to look for.

    Returns
    -------
    collections.Counter
        Maps each sector (in its original spelling) to its number of matches.
        Sectors with no match are left out.
    """
    # Lowercase once so matching is case-insensitive for any caller.
    haystack = text.lower()
    sector_mentions = Counter()

    for sector in sectors:
        pattern = re.compile(r"\b{}\b".format(re.escape(sector.lower())))
        # A single findall pass gives both "is it present?" and "how often?",
        # so the text is not scanned twice per sector.
        hits = len(pattern.findall(haystack))
        if hits:
            sector_mentions[sector] = hits

    return sector_mentions

# Lowercase corpus built from the loaded messages
all_text = clean_and_concatenate_text(data)

# Tally how often each critical-infrastructure sector is named
sector_mentions = count_sector_mentions(all_text, critical_infrastructure_sectors)

# Report sectors from most to least mentioned
for sector, count in sector_mentions.most_common():
    print(f"{sector}: {count} mentions")

Transport: 423 mentions
Energy: 117 mentions
Banking: 72 mentions
Public Administration: 20 mentions
Space: 8 mentions
Digital Infrastructure: 6 mentions
Health: 4 mentions
Drinking Water: 1 mentions


## Question C

In [27]:
# Keywords taken as evidence for each security property
# (confidentiality, integrity, availability).
# Every term is lowercase and listed once: count_security_mentions lowercases
# each term before searching, so listing a case variant such as
# 'Confidentiality' next to 'confidentiality' counts every occurrence twice.
security_terms = {
    'Confidentiality': ['confidentiality', 'leak', 'data breach', 'exposed', 'stolen', 'accessed', 'unauthorized'],
    'Integrity': ['integrity', 'tampered', 'altered', 'defaced', 'manipulated', 'changed', 'corrupted'],
    'Availability': ['availability', 'ddos', 'down', 'unavailable', 'crash', 'offline', 'service disruption']
}


In [28]:
def count_security_mentions(text, security_terms):
    """
    Count keyword hits for each security property.

    Parameters
    ----------
    text : str
        Text to search. It does not need to be lowercased beforehand.
    security_terms : dict[str, list[str]]
        Maps a property name (e.g. 'Confidentiality') to the keywords or
        phrases that count as a mention of it.

    Returns
    -------
    collections.Counter
        Maps each property name to the total number of whole-word,
        case-insensitive keyword matches. A property whose keywords never
        match is present with a count of 0.
    """
    # Lowercase once so matching is case-insensitive for any caller.
    haystack = text.lower()
    mentions = Counter()

    for property_name, terms in security_terms.items():
        # Terms are matched in lowercase, so case variants of one word
        # ('Integrity', 'integrity') are the same pattern. Dropping the
        # repeats (order kept) stops each occurrence being counted twice.
        unique_terms = dict.fromkeys(term.lower() for term in terms)
        for term in unique_terms:
            pattern = r"\b{}\b".format(re.escape(term))
            mentions[property_name] += len(re.findall(pattern, haystack))

    return mentions
# Lowercase corpus built from the loaded messages
cleaned_text = clean_and_concatenate_text(data)

# Total keyword hits per security property
security_mentions = count_security_mentions(cleaned_text, security_terms)

# Print one line per property, in the order the properties appear in the counter
print("Security Property Mentions:")
for property_name, count in security_mentions.items():
    print(f"{property_name}: {count} mentions")

Security Property Mentions:
Confidentiality: 2 mentions
Integrity: 11 mentions
Availability: 3052 mentions
