In [None]:
import pandas as pd

# Load the VCDB incident export into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference on wide CSVs at the cost of more memory.
df = pd.read_csv("vcdb.csv", low_memory=False)

In [None]:
# Latest and earliest incident year present in the dataset.
incident_years = df['timeline.incident.year']
max_year, min_year = incident_years.max(), incident_years.min()
max_year, min_year

In [9]:
# Keep incidents whose year lies in 2001-2024, both endpoints included.
filtered_df = df[df['timeline.incident.year'].between(2001, 2024)]

# Number of rows that survived the year filter.
entry_count = filtered_df.shape[0]
entry_count

10277

In [11]:

# Collect every column whose name begins with 'victim.country'.
country_columns = df.columns[df.columns.str.startswith('victim.country')].tolist()

# NOTE(review): this selects from the full `df`, not the year-filtered
# `filtered_df` built earlier — confirm that is intended.
attacks_by_victim_country = df.loc[:, country_columns]

# Persist the selected columns (row index included) for the next stage.
attacks_by_victim_country.to_csv("attacks_by_victim_country.csv")

In [83]:
import pandas as pd
import plotly.express as px
import pycountry
import requests

# Read the per-incident victim-country columns from CSV; the first CSV column
# is used as the row index.
# NOTE(review): this rebinds `df`, shadowing the full VCDB frame loaded at the
# top of the notebook — earlier cells will not work if re-run after this one.
df = pd.read_csv("attacks_by_victim_country.csv", index_col=0)

# Sum each country column to get one total per country
# (presumably the columns are boolean / 0-1 flags — verify against the export).
attack_counts = df.sum().reset_index()
attack_counts.columns = ['country_code', 'attack_count']

# Strip the literal column prefix so only the country code remains.
# regex=False is explicit on purpose: the default for `regex` differs between
# pandas versions, and when the pattern is treated as a regular expression
# every '.' matches any character instead of a literal dot.
attack_counts['country_code'] = attack_counts['country_code'].str.replace(
    'victim.country.', '', regex=False
)

# Fetch per-country population figures.
def get_population_data():
    """Download country populations from the REST Countries API.

    Returns:
        dict: maps each entry's two-letter ``cca2`` code to its ``population``
        value, as reported by the API response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    url = "https://restcountries.com/v3.1/all?fields=cca2,population"
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error payload.
    response.raise_for_status()
    return {country['cca2']: country['population'] for country in response.json()}

population_data = get_population_data()

# Attach each country's population by looking its code up in the fetched mapping;
# codes missing from the mapping get NaN.
attack_counts['population'] = attack_counts['country_code'].map(population_data)

# Attack density = attacks per one million inhabitants.
attack_counts['attack_density'] = (
    attack_counts['attack_count']
    .div(attack_counts['population'])
    .mul(1_000_000)
)

# Convert a two-letter country code to its ISO3 (alpha-3) form.
def code_to_iso3(code):
    """Translate an alpha-2 country code into the matching alpha-3 code.

    Args:
        code: value passed to ``pycountry.countries.get(alpha_2=...)``.

    Returns:
        The ``alpha_3`` attribute of the matching country, or ``None`` when
        the lookup finds nothing or rejects the input.
    """
    try:
        country = pycountry.countries.get(alpha_2=code)
    except (AttributeError, LookupError):
        # Only lookup-style failures are treated as "no match"; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        return None
    return None if country is None else country.alpha_3

attack_counts['iso3'] = attack_counts['country_code'].map(code_to_iso3)

# Keep only rows that have both an ISO3 code and a population value.
has_iso3 = attack_counts['iso3'].notna()
has_population = attack_counts['population'].notna()
valid_data = attack_counts[has_iso3 & has_population]

# Write the surviving rows to CSV.
valid_data.to_csv("valid_data_v.csv")

In [71]:

from scipy.stats import zscore

# Work on an explicit copy so the new columns are not written into a slice
# of another DataFrame.
valid_data = valid_data.copy()

# Fill missing densities with 0 via plain assignment. Calling
# fillna(..., inplace=True) on a column selection is chained assignment: it is
# deprecated in pandas 2.x and does not update the parent frame under
# Copy-on-Write. fillna is a no-op when nothing is missing, so no guard is needed.
valid_data['attack_density'] = valid_data['attack_density'].fillna(0)

# Z-score of the attack density across all rows.
valid_data['attack_density_zscore'] = zscore(valid_data['attack_density'])

# Clip the z-scores to the range [-3, 3].
valid_data['attack_density_zscore_clipped'] = valid_data['attack_density_zscore'].clip(-3, 3)
valid_data['attack_density_zscore'].head()

0   -0.065112
1   -0.064212
2   -0.065082
3   -0.059057
4   -0.065112
Name: attack_density_zscore, dtype: float64

In [82]:
# Build the choropleth: countries are located by ISO3 code and coloured by the
# clipped z-score of attack density.
# NOTE(review): range_color is a hard-coded window, presumably tuned to this
# dataset's z-score spread — confirm it still fits if the data changes.
fig = px.choropleth(
    data_frame=valid_data,
    locations="iso3",
    color="attack_density_zscore_clipped",
    hover_name="country_code",
    hover_data=dict(attack_count=True, population=True, attack_density=":.2f"),
    color_continuous_scale=px.colors.sequential.Plasma,
    range_color=[-0.066, -0.05],
    labels=dict(attack_density_zscore_clipped="Z-Score of Attack Density"),
    title="Cyber Attack Density by Country (Z-Score Normalization)",
)

# Layout: tight margins, white backgrounds everywhere, black text.
fig.update_layout(
    font={"color": "black"},
    plot_bgcolor="white",
    paper_bgcolor="white",
    margin=dict(r=0, t=40, l=0, b=0),
    coloraxis_colorbar={"title": "Z-Score", "tickprefix": "≈"},
    geo={
        "projection_type": "equirectangular",
        "showcoastlines": True,
        "coastlinecolor": "Black",
        "bgcolor": "white",
    },
)

fig.show()




In [80]:

import plotly.express as px

# Assumes valid_data has already been prepared by the earlier cells; rebind the
# name to a copy before the normalized column is added further down.
valid_data = valid_data.copy()

# Quantile (rank-based) normalization.
def quantile_normalize(series):
    """Scale a Series by rank: average-tie rank divided by the series length.

    Args:
        series: pandas Series of values to rank.

    Returns:
        A Series on the same index holding ``rank / len(series)`` for each
        entry, with tied values sharing the mean of their ranks.
    """
    n_obs = len(series)
    return series.rank(method='average') / n_obs

# Add the rank-normalized attack density as a new column.
valid_data['attack_density_normalized'] = quantile_normalize(valid_data['attack_density'])

# Build the choropleth: countries are located by ISO3 code and coloured by the
# normalized density, with the colour axis fixed to [0, 1].
fig = px.choropleth(
    data_frame=valid_data,
    locations="iso3",
    color="attack_density_normalized",
    hover_name="country_code",
    hover_data=dict(attack_count=True, population=True, attack_density=":.2f"),
    color_continuous_scale=px.colors.sequential.Plasma,
    range_color=[0, 1],
    labels=dict(attack_density_normalized="Normalized Attack Density"),
    title="Cyber Attack Density by Country (Quantile Normalization)",
)

# Layout: tight margins, percentage tick labels on the colour bar, white
# backgrounds everywhere, black text.
fig.update_layout(
    font={"color": "black"},
    plot_bgcolor="white",
    paper_bgcolor="white",
    margin=dict(r=0, t=40, l=0, b=0),
    coloraxis_colorbar={
        "title": "Normalized Density",
        "tickvals": [0, 0.25, 0.5, 0.75, 1],
        "ticktext": ["0%", "25%", "50%", "75%", "100%"],
    },
    geo={
        "projection_type": "equirectangular",
        "showcoastlines": True,
        "coastlinecolor": "Black",
        "bgcolor": "white",
    },
)

fig.show()