In [None]:
# Installation for IOC defanging - https://github.com/ioc-fang/ioc-fanger
!pip install -q ioc-fanger

In [None]:
import ioc_fanger
import pandas as pd
import re
import json
from tqdm import tqdm
import ipaddress

## Merger

In [None]:
# Gather the records of every prediction chunk (REP_1.json ... REP_393.json)
# into one flat list.
total_data = []
for i in tqdm(range(1, 394), desc="Merging"):
    file_path = f'/home/anon/input/chunks-predictions/REP_chunks_predictions/REP_{i}.json'
    with open(file_path, 'r', encoding='utf-8') as file:
        # Each chunk is iterated element by element and appended in order.
        total_data.extend(json.load(file))

In [None]:
# Display how many records ended up in the merged list.
len(total_data)

## Extractor

In [None]:
# Regular expressions to match different patterns.
# Every pattern is anchored with ^...$ so it has to describe the whole keyword.

# local-part@domain.tld
email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
# H:M or H:M:S, one or two digits per field (values are not range-checked)
time_regex = r'^\d{1,2}:\d{1,2}(:\d{1,2})?$'
# Four dot-separated groups of 1-3 digits (octets are not range-checked)
ip_regex = r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$'
# A single label followed by a TLD of at least two letters (no sub-domains)
web_domain_regex = r'^([a-zA-Z0-9-]+\.[a-zA-Z]{2,})$'
# http:// or https:// followed by at least two characters, none of which may be
# whitespace, '/', '?' or '#' (so no path, query or fragment)
url_regex = r'^https?://[^\s/$.?#][^\s/?#]*[^\s/.?#]$'
# name.ext with exactly one dot; letters, digits and hyphens in the name
file_with_extension_regex = r'^[a-zA-Z0-9-]+\.[a-zA-Z0-9]+$'
# 32 / 40 / 64 hexadecimal characters
md5_regex = r'^[a-fA-F0-9]{32}$'
sha1_regex = r'^[a-fA-F0-9]{40}$'
sha2_regex = r'^[a-fA-F0-9]{64}$'
# Two or more letter/digit/hyphen segments separated by '/'
malware_name_regex = r'^[a-zA-Z0-9-]+(/[a-zA-Z0-9-]+)+$'
# CVE-YYYY-NNNN with at least four digits in the last field
cve_regex = r'^CVE-[0-9]{4}-[0-9]{4,}$'
# Any non-empty run of letters, digits and hyphens.
# NOTE: this also accepts every string matched by the hash and CVE patterns.
vulnerability_id_regex = r'^[a-zA-Z0-9-]+$'
# %NAME% prefix followed by anything
windows_environment_variable_regex = r'^%(WINDOWS|[a-zA-Z0-9_]+)%.*?(\\)?$'
# ${NAME} prefix followed by anything
unix_like_environment_variable_regex = r'^\$\{[a-zA-Z_][a-zA-Z0-9_]*\}.*?(\\)?$'

def clean_keyword(word):
    """Strip surrounding punctuation from a keyword unless it has a special format.

    Args:
        word: The keyword to clean (one or more whitespace-separated tokens).

    Returns:
        ``word`` unchanged when it fully matches one of the special-format
        patterns defined above; otherwise ``word`` with every leading and
        trailing run of non-word characters (``\\W``) removed. Underscores count
        as word characters and are therefore kept.
    """
    # Built at call time so that a pattern redefined in a later cell is picked up.
    special_formats = (
        email_regex, time_regex, ip_regex, web_domain_regex, url_regex,
        file_with_extension_regex, md5_regex, sha1_regex, sha2_regex, malware_name_regex,
        cve_regex, vulnerability_id_regex, windows_environment_variable_regex, unix_like_environment_variable_regex
    )
    if any(re.match(regex_pattern, word) for regex_pattern in special_formats):
        return word

    # The former single-word branch used r'^\W+|(^\W+)|\W+$' with replacement
    # r'\1'. Its capture group duplicated the first alternative and so could
    # never participate in a match, which made the replacement always empty
    # (and an error on Python < 3.5). One substitution covers both the
    # single-word and the multi-word case.
    return re.sub(r'^\W+|\W+$', '', word)

In [None]:
# Entity labels, in the order they appear in the returned dictionary.
_ENTITY_LABELS = (
    "ACT", "APT", "DOM", "EMAIL", "ENCR", "FILE", "IDTY", "IP", "LOC", "MAL", "MD5",
    "OS", "PROT", "SECTEAM", "SHA1", "SHA2", "TIME", "TOOL", "URL", "VULID", "VULNAME",
)

# Leading/trailing runs of non-word characters, hyphens and underscores.
_EDGE_NOISE_RE = re.compile(r'^[\W\-_]+|[\W\-_]+$')


def _store_entity(entity_dict, entity, keyword):
    """Clean ``keyword`` and add it to the set stored under ``entity``."""
    entity_dict[entity].add(_EDGE_NOISE_RE.sub('', clean_keyword(keyword)))


def extract_entities(row):
    """Collect the tagged entities of one record, grouped by entity label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``row['ID']`` (convertible to ``int``), ``row['content']`` (an
            iterable of sentence strings, split on whitespace) and
            ``row['tags']`` (an iterable of comma-separated tag strings, one
            per sentence). Tags are read as ``B-<LABEL>`` (start of an entity),
            ``I-<LABEL>`` (continuation) and ``O`` (no entity); any other tag,
            including an empty one, is ignored.

    Returns:
        dict mapping every entity label to a sorted list of the distinct
        cleaned keywords found for it (possibly empty).

    Raises:
        KeyError: If a tag carries a label that is not one of the known labels.
        IndexError: If a ``B``/``I`` tag has no ``-<LABEL>`` part.
    """
    entity_dict = {label: set() for label in _ENTITY_LABELS}

    # Progress message every 1000 IDs; 75405 is a hard-coded denominator for the
    # percentage (presumably the total number of rows - TODO confirm).
    if int(row['ID']) % 1000 == 0:
        print('\n-------------------------ROW {}\\75405 - {:.2%}\n'
              .format(row['ID'], int(row['ID'])/75405))

    sentences_list = [string.split() for string in row['content']]
    tags_list = [string.split(',') for string in row['tags']]

    # Loop over sentences and tags simultaneously
    for sentence, sentence_tags in zip(sentences_list, tags_list):

        if len(sentence) != len(sentence_tags):
            # Drop the last tag first (a trailing comma produces an empty one).
            sentence_tags = sentence_tags[:-1]
            if len(sentence) != len(sentence_tags):
                print("ERROR DIFFERENT LENGTH - at ID "+str(row['ID'])+" with:")
                print(len(sentence), '-', sentence)
                print(len(sentence_tags), '-', sentence_tags)

        # Reset vars for each sentence
        entity = ''
        keyword = ''

        # zip() stops at the shorter sequence, so surplus tags can no longer
        # index past the end of the sentence.
        for token, tag in zip(sentence, sentence_tags):
            # tag[:1] is '' for an empty tag, where tag[0] raised IndexError.
            kind = tag[:1]

            # If it starts a new entity
            if kind == 'B':
                # If the previous token is an entity itself, store it first
                if entity != '' and keyword != '':
                    _store_entity(entity_dict, entity, keyword)
                entity = tag.split('-')[1]
                keyword = token

            # If it continues an existing entity
            elif kind == 'I':
                # The label of the continuation tag always wins: it starts an
                # entity when there is none and relabels the current one when it
                # differs, while the accumulated keyword is kept.
                entity = tag.split('-')[1]
                keyword += ' ' + token

            # If it's not an entity
            elif kind == 'O':
                # If the previous token is an entity, store it and reset
                if entity != '' and keyword != '':
                    _store_entity(entity_dict, entity, keyword)
                    entity = ''
                    keyword = ''

        # If the last token is an entity
        if entity != '' and keyword != '':
            _store_entity(entity_dict, entity, keyword)

    # Sorted so that the output does not depend on set iteration order.
    return {key: sorted(value) for key, value in entity_dict.items()}

In [None]:
# Turn the merged records into a DataFrame.
df = pd.DataFrame(total_data)

# Parse the 'date' column and store it back as MM-DD-YYYY strings.
df['date'] = pd.to_datetime(df['date']).dt.strftime('%m-%d-%Y')

print('Starting extraction')

# Run the extractor row by row; the result of each call is stored in 'entities'.
df['entities'] = df.apply(extract_entities, axis=1)

print('Saving results')

# Persist the frame as an indented JSON array of records.
df.to_json('REP_extracted_entities.json', orient='records', indent=4)

In [None]:
def defang_iocs_in_text(text):
    """Return the result of ``ioc_fanger.defang`` applied to ``text``."""
    return ioc_fanger.defang(text)

def fang_iocs_in_text(text):
    """Return the result of ``ioc_fanger.fang`` applied to ``text``."""
    return ioc_fanger.fang(text)

In [None]:
def is_valid_ip(ip_str):
    """Tell whether ``ipaddress.ip_address`` accepts ``ip_str``.

    Returns:
        True when the value parses as an IPv4 or IPv6 address, False when
        parsing raises ``ValueError``.
    """
    try:
        ipaddress.ip_address(ip_str)
    except ValueError:
        return False
    else:
        return True
    
def is_numeric(input_str):
    """Tell whether ``input_str`` consists only of digits, dots and commas."""
    # At least one character is required, so the empty string is not numeric.
    return re.match(r'^[\d.,]+$', input_str) is not None

In [None]:
# One set of distinct, normalised values per entity label.
entity_dict = {
    label: set()
    for label in (
        "ACT", "APT", "DOM", "EMAIL", "ENCR", "FILE", "IDTY", "IP", "LOC", "MAL", "MD5",
        "OS", "PROT", "SECTEAM", "SHA1", "SHA2", "TIME", "TOOL", "URL", "VULID", "VULNAME",
    )
}

# Every kept value, regardless of its label.
complete_list = set()

for row_entities in df['entities']:
    for label, values in row_entities.items():
        for value in values:
            if value == '':
                continue
            # Re-fang the value and lower-case it before deduplication.
            normalized = fang_iocs_in_text(value).lower()
            # Avoid adding pure numbers, but keep valid IP addresses.
            if is_numeric(normalized) and not is_valid_ip(normalized):
                continue
            entity_dict[label].add(normalized)
            complete_list.add(normalized)

In [None]:
tags = ["APT", "SECTEAM", "IDTY", "OS", "EMAIL", "LOC", "TIME", "IP", "DOM", "URL", "PROT",
        "FILE", "TOOL", "MD5", "SHA1", "SHA2", "MAL", "ENCR", "VULNAME", "VULID", "ACT"]

# Output file name -> values to write; the per-label files come first and the
# combined list last.
outputs = {t + "_list.txt": entity_dict[t] for t in tags}
outputs["complete_list.txt"] = complete_list

for file_path, values in outputs.items():
    # Each file holds the sorted values, one per line, without a trailing newline.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(sorted(values)))