In [6]:
import matplotlib.pyplot as plt
import pandas as pd
# Location of the raw CSV export (relative path).
path = "Global_Cybersecurity_Threats_2015-2024.csv"
# Parse the CSV into a DataFrame, then print a plain-text preview of its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
print(df.head(n=5))

   Country  Year        Attack Type     Target Industry  \
0    China  2019           Phishing           Education   
1    China  2019         Ransomware              Retail   
2    India  2017  Man-in-the-Middle                  IT   
3       UK  2024         Ransomware  Telecommunications   
4  Germany  2018  Man-in-the-Middle                  IT   

   Financial Loss (in Million $)  Number of Affected Users Attack Source  \
0                          80.53                    773169  Hacker Group   
1                          62.19                    295961  Hacker Group   
2                          38.65                    605895  Hacker Group   
3                          41.44                    659320  Nation-state   
4                          74.41                    810682       Insider   

  Security Vulnerability Type Defense Mechanism Used  \
0          Unpatched Software                    VPN   
1          Unpatched Software               Firewall   
2              Weak 

# Data cleaning

In [None]:
import numpy as np


# Initial overview of the dataset, printed before any cleaning step runs.
print("="*50)
print("INFORMACIÓN INICIAL DEL DATASET")
print("="*50)
print(f"Shape inicial: {df.shape}")
print(f"\nColumnas: {df.columns.tolist()}")
print(f"\nPrimeras filas:\n{df.head()}")
print("\nInfo del dataset:")
# DataFrame.info() writes its report to stdout on its own and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

# ============================================
# 1. NULL VALUES AND DUPLICATES
# ============================================
print("\n" + "=" * 50, "LIMPIEZA: VALORES NULOS Y DUPLICADOS", "=" * 50, sep="\n")

# Per-column count of missing cells.
null_counts = df.isnull().sum()
print("\nValores nulos por columna:")
print(null_counts)

# Count fully repeated rows once; the same figure feeds the report and the check.
n_duplicate_rows = df.duplicated().sum()
print(f"\nFilas duplicadas: {n_duplicate_rows}")

# Drop the repeated rows only when at least one exists.
if n_duplicate_rows:
    df = df.drop_duplicates()
    print(f"Duplicados eliminados. Nuevo shape: {df.shape}")


# ============================================
# 2. WHITESPACE CLEANUP
# ============================================
print("\n" + "="*50)
print("LIMPIEZA: ESPACIOS EN BLANCO")
print("="*50)

# Strip leading/trailing whitespace from every text column.
# NOTE(review): rows that differed only by surrounding whitespace become
# identical after this step, but the duplicate check runs earlier in this
# cell; consider re-checking duplicates afterwards.
for col in df.columns:
    column = df[col]
    if pd.api.types.is_object_dtype(column):
        # Object columns may mix strings with other values (numbers, None, NaN).
        # `.str.strip()` would turn every non-string entry into NaN, or raise
        # when the column holds no strings at all, so only real str instances
        # are stripped and every other value is kept untouched.
        df[col] = pd.Series(
            [value.strip() if isinstance(value, str) else value for value in column],
            index=column.index,
            dtype=object,
        )
    elif pd.api.types.is_string_dtype(column):
        # Dedicated string dtypes do not compare equal to 'object'; they are
        # still text and support the vectorised strip directly.
        df[col] = column.str.strip()
    else:
        # Non-text column: nothing to clean, nothing to report.
        continue
    print(f"Espacios eliminados en: {col}")

# ============================================
# 3. DATA TYPE VALIDATION
# ============================================
print("\n" + "="*50)
print("VALIDACIÓN: TIPOS DE DATOS")
print("="*50)

print("\nTipos de datos actuales:")
print(df.dtypes)

# Columns that are expected to hold numeric values.
numeric_cols = ['Year', 'Financial Loss (in Million $)',
                'Number of Affected Users', 'Incident Resolution Time (in Hours)']

for col in numeric_cols:
    if col not in df.columns:
        continue
    # Coerce once: entries that cannot be parsed as numbers become NaN.
    converted = pd.to_numeric(df[col], errors='coerce')
    # NaNs introduced by the coercion (beyond those already present) are
    # exactly the entries that were not numeric.
    non_numeric = converted.isna().sum() - df[col].isna().sum()
    if non_numeric > 0:
        print(f"{col} tiene {non_numeric} valores no numéricos")
    # Store the numeric version whenever the column is not numeric yet, even
    # if every entry parsed cleanly (e.g. digits stored as text). Otherwise
    # the numeric comparisons in the range validation would fail on strings.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = converted

# ============================================
# 4. RANGE VALIDATION
# ============================================
print("\n" + "=" * 50, "VALIDACIÓN: RANGOS DE VALORES", "=" * 50, sep="\n")

# Year: report the observed span and count the rows outside 2015-2024.
if 'Year' in df.columns:
    years = df['Year']
    print(f"\nRango de años: {years.min()} - {years.max()}")
    # Strictly below the lower bound or strictly above the upper bound.
    out_of_range = years.lt(2015) | years.gt(2024)
    invalid_years = df[out_of_range]
    if len(invalid_years) > 0:
        print(f"{len(invalid_years)} filas con años fuera del rango 2015-2024")

# Every other expected-numeric column: report how many entries are below zero.
for col in (name for name in numeric_cols if name != 'Year' and name in df.columns):
    negative_count = df[col].lt(0).sum()
    if negative_count > 0:
        print(f"{col} tiene {negative_count} valores negativos")

# ============================================
# 5. DESCRIPTIVE STATISTICS
# ============================================
print("\n" + "=" * 50, "ESTADÍSTICAS DESCRIPTIVAS (POST-LIMPIEZA)", "=" * 50, sep="\n")

# Summary table produced by describe() with its default arguments.
print("\nEstadísticas de columnas numéricas:")
print(df.describe())

print("\nValores únicos en columnas categóricas:")
# Columns treated as categorical for the cardinality report below.
categorical_cols = [
    'Country',
    'Attack Type',
    'Target Industry',
    'Attack Source',
    'Security Vulnerability Type',
    'Defense Mechanism Used',
]

# For each listed column that exists: distinct-value count, then the five most frequent values.
for col in categorical_cols:
    if col not in df.columns:
        continue
    values = df[col]
    print(f"\n{col}: {values.nunique()} valores únicos")
    top_five = values.value_counts().head().to_dict()
    print(f"Top 5: {top_five}")

# ============================================
# 6. FINAL SUMMARY
# ============================================
print("\n" + "=" * 50, "RESUMEN FINAL DEL DATA CLEANING", "=" * 50, sep="\n")

print(f"\nShape final: {df.shape}")

# Count the missing cells once and reuse the total in both lines below.
total_nulls = df.isnull().sum().sum()
print(f"Valores nulos totales: {total_nulls}")

# Share of non-missing cells over all cells (rows x columns), as a percentage.
n_rows, n_cols = df.shape
complete_pct = (1 - total_nulls / (n_rows * n_cols)) * 100
print(f"Porcentaje de datos completos: {complete_pct:.2f}%")


INFORMACIÓN INICIAL DEL DATASET
Shape inicial: (3000, 10)

Columnas: ['Country', 'Year', 'Attack Type', 'Target Industry', 'Financial Loss (in Million $)', 'Number of Affected Users', 'Attack Source', 'Security Vulnerability Type', 'Defense Mechanism Used', 'Incident Resolution Time (in Hours)']

Primeras filas:
   Country  Year        Attack Type     Target Industry  \
0    China  2019           Phishing           Education   
1    China  2019         Ransomware              Retail   
2    India  2017  Man-in-the-Middle                  IT   
3       UK  2024         Ransomware  Telecommunications   
4  Germany  2018  Man-in-the-Middle                  IT   

   Financial Loss (in Million $)  Number of Affected Users Attack Source  \
0                          80.53                    773169  Hacker Group   
1                          62.19                    295961  Hacker Group   
2                          38.65                    605895  Hacker Group   
3                          