In [9]:
import pandas as pd
import os
from dotenv import load_dotenv
from verispy import VERIS
from mlxtend.frequent_patterns import association_rules, fpgrowth

# Environment variables
load_dotenv() 

JSON_DIR = os.environ.get('JSON_DIR')

CSV_DIR = os.path.join("..", os.environ.get('CSV_DIR'))

VERIS_DF = os.environ.get('BOOLEAN_CSV_NAME')

VERIS_DF_URL = os.environ.get('BOOLEAN_CSV_URL')



In [11]:
def load_dataset(csv_dir=CSV_DIR,
                 veris_df_csv=VERIS_DF, 
                 nrows=None):
    """ 
    Loads veris_df type csv (boolean VCDB dataset) from disk as a 
    Pandas DataFrame.

    Parameters
    ---------- 
        csv_dir: str / path
            The path to read the csv from
        
        veris_df_csv: str
            The name of the csv file

    Returns
    ---------- 
    DataFrame
        The loaded veris_df dataset as Pandas DataFrame  

    """    
    veris_df = pd.read_csv(os.path.join(csv_dir, veris_df_csv),
                           index_col=0,
                           low_memory=False,
                           nrows=nrows)
    
    return veris_df

enemies = load_dataset()
enemies.head()

Unnamed: 0,action.Environmental,action.Error,action.Hacking,action.Malware,action.Misuse,action.Physical,action.Social,action.Unknown,action.environmental.notes,action.environmental.variety.Deterioration,...,victim.revenue.iso_currency_code.YER,victim.revenue.iso_currency_code.ZAR,victim.revenue.iso_currency_code.ZEC,victim.revenue.iso_currency_code.ZMK,victim.revenue.iso_currency_code.ZWD,victim.secondary.amount,victim.secondary.notes,victim.secondary.victim_id,victim.state,victim.victim_id
0,False,False,True,False,False,False,False,False,,False,...,False,False,False,False,False,,,,PA,Centerville Clinics Inc.
1,False,False,False,False,True,False,False,False,,False,...,False,False,False,False,False,,,,US-CA,Taco Bell
2,False,True,False,False,False,False,False,False,,False,...,False,False,False,False,False,,,,VA,American Diabetes Association
3,False,False,False,False,False,True,False,False,,False,...,False,False,False,False,False,,,,,South Walkerville Medical Centre
4,False,False,True,False,False,False,False,False,,False,...,False,False,False,False,False,,,,,Netfleet Domain Names


In [17]:
enemies = enemies[enemies['timeline.incident.year']>=2000]
ext = [col for col in enemies.columns if 'actor.external.country.' in col]
vict = [col for col in enemies.columns if 'victim.country.' in col]
enemies =  enemies[ext+vict]

# Building the model 
frq_items = fpgrowth(enemies, min_support = 0.002, use_colnames = True) 
# Collecting the inferred rules inside the enemies dataframe 
rules_countries = association_rules(frq_items, metric ="lift",\
                                    min_threshold = 1) 
rules_countries = rules_countries.sort_values(['confidence', 'lift'],\
                                              ascending =[False, False])

## Mainly US <-> US

KeyError: 'timeline.incident.year'

In [18]:
## Korea south -> north = 0
korea_south_north = enemies[enemies['actor.external.country.KR']==True]
korea_south_north = korea_south_north[korea_south_north['victim.country.KP']\
                                      ==True]

## Korea north to south = 18 incidents 
korea_north_south = enemies[enemies['actor.external.country.KP']==True]
korea_north_south = korea_north_south[korea_north_south['victim.country.KR']==True]

rules_countries.to_csv("usual_suspects.csv", index=False)

In [20]:
korea_soy

Unnamed: 0,actor.external.country.AD,actor.external.country.AE,actor.external.country.AF,actor.external.country.AG,actor.external.country.AI,actor.external.country.AL,actor.external.country.AM,actor.external.country.AO,actor.external.country.AQ,actor.external.country.AR,...,victim.country.VN,victim.country.VU,victim.country.WF,victim.country.WS,victim.country.XK,victim.country.YE,victim.country.YT,victim.country.ZA,victim.country.ZM,victim.country.ZW
