In [None]:
# Installation for IOC defanging - https://github.com/ioc-fang/ioc-fanger
!pip install -q ioc-fanger

In [None]:
import datetime
import pandas as pd
import ioc_fanger
import json
import time
import re
import os
import zipfile
from collections import defaultdict
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

## Generate complete list of HF keywords

In [None]:
# Entity tags whose keyword lists are merged below, in processing order.
# "LOC" and "TIME" are not part of the list (they were commented out).
tags = [
    "MAL", "FILE", "OS", "PROT",
    "MD5", "SHA1", "SHA2", "ENCR",
    "TOOL", "VULID", "VULNAME",
    "ACT", "APT", "SECTEAM", "IDTY",
    "EMAIL", "IP", "DOM", "URL",
]

# Maximum number of keywords written to one chunk file.
chunk_size = 500

def save_chunk(output_directory, count, chunk):
    """Write one chunk of keywords to ``chunk_<count>.txt`` in ``output_directory``.

    Args:
        output_directory: Folder that receives the chunk file.
        count: Value embedded in the file name (``chunk_<count>.txt``).
        chunk: Iterable of strings; they are written newline-separated with no
            trailing newline.
    """
    chunk_path = os.path.join(output_directory, f"chunk_{count}.txt")
    with open(chunk_path, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(chunk))

# Output folders (relative to the working directory): the per-tag deduplicated
# lists and the fixed-size keyword chunks.
new_lists_directory = 'new_lists_folder'
output_directory = 'chunks_complete_entity_list'
for directory in (new_lists_directory, output_directory):
    # exist_ok makes the cell safe to re-run and avoids a check-then-create race.
    os.makedirs(directory, exist_ok=True)

# Folder holding one "<TAG>_list.txt" file per entity tag.
entity_lists_directory = '/home/anon/input/datasets-extracted-entities/HF_extracted_entities/entity_lists'

complete_list = set()  # every keyword already claimed by an earlier tag
idx_file = 1           # index of the next chunk file to write
chunk = []             # keywords collected for the chunk currently being filled
tag_list = []          # tag_list[i] is the tag of chunk file i + 1

for t in tags:
    print(f'Tag {t} first file index {idx_file}')

    file_path = f'{entity_lists_directory}/{t}_list.txt'
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_lines = [line.strip() for line in file]
    countl = len(raw_lines)  # "Before" count: every line of the input file

    new_list = []
    # Longest keywords first, ties broken alphabetically.
    for line in sorted(raw_lines, key=lambda x: (-len(x), x)):
        # Skip blank lines (an empty keyword would match everywhere later on)
        # and keywords that an earlier tag already owns.
        if not line or line in complete_list:
            continue
        complete_list.add(line)
        new_list.append(line)

        chunk.append(line)
        if len(chunk) == chunk_size:
            save_chunk(output_directory, idx_file, chunk)
            tag_list.append(t)
            chunk = []
            idx_file += 1

    # Flush the partially filled chunk so a chunk file never mixes two tags.
    if chunk:
        save_chunk(output_directory, idx_file, chunk)
        tag_list.append(t)
        chunk = []
        idx_file += 1

    print(f'{t} - Before: {countl} - After: {len(new_list)}\n')

    output_file = os.path.join(new_lists_directory, f"{t}_list.txt")
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(new_list))

# idx_file is the next free index, so the number of chunks written is one less.
print(f'Chunks: {idx_file - 1}')

In [None]:
# Number of chunk files written: tag_list receives one entry per saved chunk.
len(tag_list)

## Search HF keywords in HF posts

In [None]:
# Folder with the keyword chunk files (presumably the chunks written by the
# previous section, assuming the notebook runs from /home/anon/working),
# the HF posts dump, and the folder that receives one result file per chunk.
input_directory_keywords = '/home/anon/working/chunks_complete_entity_list'
input_path_HF = '/home/anon/input/datasets-extracted-entities/HF_extracted_entities/HF_extracted_entities.json'
output_directory = '/home/anon/working/chunks_HF_keywords_in_HF_posts/'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(output_directory, exist_ok=True)

### Load HF dataframe

In [None]:
# Read the HF posts dump into a DataFrame.
df_HF = pd.read_json(input_path_HF)

# Lowercase the title and body text so that later matching is case-insensitive.
for text_column in ('threadTitle', 'flatContent'):
    df_HF[text_column] = df_HF[text_column].str.lower()

print('Number of relevant posts: ', len(df_HF))

In [None]:
def defang_iocs_in_text(text):
    """Return ``text`` after passing it through ``ioc_fanger.defang``."""
    return ioc_fanger.defang(text)

def fang_iocs_in_text(text):
    """Return ``text`` after passing it through ``ioc_fanger.fang``."""
    return ioc_fanger.fang(text)

In [None]:
# Substrings treated as links: anything starting with http, https, www, h**p or h**ps,
# up to the next whitespace character.
url_pattern = r'http\S+|www\S+|https\S+|h\*\*p\S+|h\*\*ps\S+'  # Starting with http , https , www, h**p, h**ps
# Separators for non-link text: whitespace runs or runs of : , ; ? ! { } [ ] = |
delimiter_pattern = r'\s+|[:,;?!{}\[\]=|]+'  # To split

def process_content(row):
    """Tokenize a post while keeping link-like substrings in one piece.

    ``row['content']`` is joined with spaces and lowercased (assumes it is an
    iterable of strings - TODO confirm against the dataset). The text is cut
    around every ``url_pattern`` match; the pieces in between are further split
    on ``delimiter_pattern``. The original comment states this reproduces the
    splitting used when the HF keywords were extracted.

    Returns:
        The unique tokens joined by single spaces. Tokens pass through a
        ``set``, so their order is arbitrary; an empty token produced by
        adjacent delimiters is kept.
    """
    text = ' '.join(row['content']).lower()

    collected = []
    # The capturing group makes re.split return the links as separate items.
    for raw_piece in re.split(f"({url_pattern})", text):
        piece = raw_piece.strip()
        if not piece:
            continue
        if re.match(url_pattern, piece):
            # Link: keep it intact.
            collected.append(piece)
        else:
            collected.extend(re.split(delimiter_pattern, piece))

    return ' '.join(set(collected))

def merge_entity_lists(entities):
    """Flatten a per-type entity mapping into one list of fanged, lowercase strings.

    Args:
        entities: Mapping whose values are iterables of entity strings; the
            keys (entity types) are not used.

    Returns:
        A list with ``fang_iocs_in_text(entity).lower()`` for every entity, in
        the mapping's iteration order.
    """
    return [
        fang_iocs_in_text(entity).lower()
        for entity_list in entities.values()
        for entity in entity_list
    ]

def apply_process_content(row):
    """Tokenize one row with ``process_content`` and advance the module-level ``pbar``."""
    pbar.update(1)  # Update tqdm progress bar
    return process_content(row)

# One tick per post. The context manager closes the progress bar even when
# tokenization raises, so a failed run does not leave a dangling bar behind.
with tqdm(total=len(df_HF)) as pbar:
    df_HF['tokenizedContent'] = df_HF.apply(apply_process_content, axis=1)

# Parse the dates and sort chronologically BEFORE formatting them: the
# '%m-%d-%Y' strings start with the month, so sorting the formatted column
# would order posts by month and day instead of by time.
df_HF['date'] = pd.to_datetime(df_HF['date'])
# mergesort is stable, so posts sharing a date keep their original relative order.
df_HF = df_HF.sort_values(by='date', kind='mergesort')
df_HF['date'] = df_HF['date'].dt.strftime('%m-%d-%Y')

# Apply the merge_entity_lists function to each row and create a new column
df_HF['merged_entities'] = df_HF['entities'].apply(merge_entity_lists)

# Filter the DataFrame to only keep the selected columns
df_HF = df_HF[['ID', 'date', 'merged_entities', 'tokenizedContent']]

# Convert the filtered DataFrame to a list of dictionaries (one per post)
existing_json_data = df_HF.to_dict(orient='records')

### Search with entities

In [None]:
def process_keyword(keyword):
    """Find the posts whose extracted entities contain ``keyword``.

    Scans the module-level ``existing_json_data`` records and tests membership
    of ``keyword`` in each record's ``'merged_entities'``.

    Returns:
        A ``(keyword, posts)`` tuple where ``posts`` is a list of
        ``{'ID': ..., 'date': ...}`` dicts in record order.
    """
    hits = [
        {'ID': record['ID'], 'date': record['date']}
        for record in existing_json_data
        if keyword in record['merged_entities']
    ]
    return keyword, hits

In [None]:
# Define regular expression patterns
# ip_pattern: the string is four dot-separated runs of digits (octet ranges are not checked).
ip_pattern = re.compile(r'^(\d+\.\d+\.\d+\.\d+)$')
# pattern_to_remove: the string consists only of digits and the listed punctuation characters.
pattern_to_remove = re.compile(r'^[-\'\+,/\\:=_\|\.\^~0-9]+$')
# English stop words from the NLTK corpus, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def get_HF_keywords_in_HF(keywords_path):
    """Load one keyword chunk file and keep only the keywords worth searching.

    Each line is stripped. A keyword is kept when it matches ``ip_pattern``, or
    when it is none of: a match of ``pattern_to_remove``, an entry of
    ``stop_words``, or a single character. Blank lines are always dropped, so
    an empty string can never become a keyword.

    Args:
        keywords_path: Path of a UTF-8 text file with one keyword per line.

    Returns:
        A ``(mapping, keywords_list)`` tuple: ``mapping`` associates every kept
        keyword with a fresh empty list, ``keywords_list`` holds the kept
        keywords in file order.
    """
    print(f'Working on {keywords_path}')
    with open(keywords_path, 'r', encoding='utf-8') as file:
        candidates = [line.strip() for line in file]

    def is_searchable(keyword):
        if not keyword:
            return False
        # IP-like keywords are kept even though pattern_to_remove matches them too.
        if ip_pattern.match(keyword):
            return True
        return not (pattern_to_remove.match(keyword) or keyword in stop_words or len(keyword) == 1)

    keywords_list = [keyword for keyword in candidates if is_searchable(keyword)]

    # Create a dictionary to hold the reports by keyword
    return {keyword: [] for keyword in keywords_list}, keywords_list

In [None]:
def start_searching(HF_keywords_in_HF_dict, tag, keywords_list):
    """Look every keyword up in the extracted entities, in parallel.

    ``process_keyword`` is submitted to a thread pool once per keyword. As the
    futures complete, the list of matching posts is stored in
    ``HF_keywords_in_HF_dict`` under the keyword (the dict is updated in place).

    Args:
        HF_keywords_in_HF_dict: Mapping keyword -> list of matching posts,
            normally pre-filled with empty lists.
        tag: Tag recorded on every entry of the result.
        keywords_list: Keywords to search for.

    Returns:
        A list with one ``{'keyword', 'tag', 'matching_posts'}`` dict per key
        of ``HF_keywords_in_HF_dict``. Keys that were not searched keep their
        existing list instead of raising an IndexError.
    """
    # Use ThreadPoolExecutor to parallelize keyword processing
    with concurrent.futures.ThreadPoolExecutor() as executor:
        keyword_futures = {executor.submit(process_keyword, keyword): keyword for keyword in keywords_list}

        # Use tqdm to track progress
        with tqdm(total=len(keywords_list), desc='Searching in entities') as pbar:
            for future in concurrent.futures.as_completed(keyword_futures):
                keyword = keyword_futures[future]
                # process_keyword returns (keyword, posts); only the posts are stored.
                _, matching_posts = future.result()
                HF_keywords_in_HF_dict[keyword] = matching_posts
                pbar.update(1)

    return [
        {'keyword': keyword, 'tag': tag, 'matching_posts': matching_posts}
        for keyword, matching_posts in HF_keywords_in_HF_dict.items()
    ]

### Search with posts

In [None]:
def read_lines_from_file(file_path):
    """Lazily yield every line of a UTF-8 text file with surrounding whitespace stripped."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        yield from map(str.strip, handle)

def create_set_from_file(file_path):
    """Return the distinct stripped lines of ``file_path`` as a set."""
    unique_lines = set()
    unique_lines.update(read_lines_from_file(file_path))
    return unique_lines

def create_bulk_search_pattern(keywords_list):
    """Compile one regex that finds any of the keywords as a whole token.

    A keyword matches when it is surrounded only by non-alphanumeric characters
    inside a whitespace-delimited token. The token boundaries are checked with
    lookarounds rather than consumed, so two keywords separated by a single
    space are both found. Consuming the separator would make ``findall`` skip
    the second one.

    The pattern keeps three capturing groups; group 2 (index 1 of each
    ``findall`` tuple) is the matched keyword, groups 1 and 3 are always empty.

    Args:
        keywords_list: Iterable of keyword strings. They are lowercased;
            empty strings are ignored.

    Returns:
        A compiled pattern. When no usable keyword is given, the pattern never
        matches, instead of matching everywhere.
    """
    # Longest keywords first so the alternation prefers the most specific one.
    ordered = sorted({word.lower() for word in keywords_list if word}, key=lambda x: (-len(x), x))
    if not ordered:
        return re.compile(r"(?!)")

    alternation = "|".join(re.escape(word) for word in ordered)
    keyword_regex = r"(^|(?<=\s))[^a-zA-Z0-9]*(" + alternation + r")[^a-zA-Z0-9]*((?=\s)|$)"
    return re.compile(keyword_regex)

def search_matches_in_chunk(chunk, bulk_search_pattern):
    """Collect, per matched keyword, the posts of ``chunk`` that contain it.

    Args:
        chunk: DataFrame slice providing ``ID``, ``date`` and
            ``tokenizedContent`` for each post.
        bulk_search_pattern: Compiled regex whose ``findall`` yields tuples with
            the keyword at index 1.

    Returns:
        A ``defaultdict(list)`` mapping the lowercased keyword to a list of
        ``{'ID': ..., 'date': ...}`` dicts, one per match (a post is listed
        once for every occurrence).
    """
    word_matching = defaultdict(list)
    for post in chunk.itertuples(index=False):
        hits = bulk_search_pattern.findall(post.tokenizedContent.lower())
        for groups in hits:
            word_matching[groups[1].lower()].append({'ID': post.ID, 'date': post.date})
    return word_matching

def search_matches(input_file_path, output_file_path, df_HF):
    """Search every keyword of one chunk file in the tokenized posts.

    The keywords are read with ``create_set_from_file``, lowercased and turned
    into one bulk pattern. The posts are processed in slices of 1000 rows on a
    thread pool and the per-slice results are merged in submission order.

    Args:
        input_file_path: Keyword chunk file.
        output_file_path: Not used by this function.
        df_HF: DataFrame of posts handed to ``search_matches_in_chunk``.

    Returns:
        A list of ``{'keyword': ..., 'matching posts': [...]}`` dicts.
    """
    lowered_keywords = {word.lower() for word in create_set_from_file(input_file_path)}
    bulk_search_pattern = create_bulk_search_pattern(lowered_keywords)

    chunk_size = 1000
    slice_starts = range(0, df_HF.shape[0], chunk_size)

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(search_matches_in_chunk, df_HF[begin:begin + chunk_size], bulk_search_pattern)
            for begin in slice_starts
        ]
        word_matching_dict = {}
        # Futures are consumed in submission order, so posts keep the frame's order.
        for future in tqdm(futures, total=len(futures), desc='Searching in posts'):
            for word, matches in future.result().items():
                word_matching_dict.setdefault(word, []).extend(matches)

    return [
        {'keyword': word, 'matching posts': matches}
        for word, matches in word_matching_dict.items()
    ]

### Compare

In [None]:
def compare_struct_and_save(struct_from_entity, struct_from_posts, output_path):
    """Merge the post-search hits into the entity-search results and save them.

    For every entry of ``struct_from_posts`` whose keyword also appears in
    ``struct_from_entity``, its ``'matching posts'`` are appended to the
    ``'matching_posts'`` of the first entity entry with that keyword. Keywords
    found only in the posts are ignored. Each keyword's posts are then
    deduplicated by ``'ID'`` (first occurrence wins) and the result is written
    to ``output_path`` as indented JSON.

    Args:
        struct_from_entity: List of ``{'keyword', 'tag', 'matching_posts'}``
            dicts; modified in place.
        struct_from_posts: List of ``{'keyword', 'matching posts'}`` dicts.
        output_path: Destination JSON file.
    """
    # Index the first entry of each keyword once instead of rescanning the
    # whole entity list for every post-search keyword.
    first_entry_by_keyword = {}
    for item in struct_from_entity:
        first_entry_by_keyword.setdefault(item['keyword'], item)

    for entry in struct_from_posts:
        kw = entry['keyword']
        mp = entry['matching posts']
        target = first_entry_by_keyword.get(kw)
        if target is not None:
            target['matching_posts'].extend(mp)

    # Drop repeated post IDs per keyword while keeping first-seen order.
    for item in tqdm(struct_from_entity, 'Removing duplicates'):
        seen_ids = set()
        unique_posts = []
        for post in item['matching_posts']:
            post_id = post['ID']
            if post_id not in seen_ids:
                seen_ids.add(post_id)
                unique_posts.append(post)
        item['matching_posts'] = unique_posts

    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(struct_from_entity, outfile, indent=4)

### Execute

In [None]:
# First chunk index of this batch and number of chunks processed per batch.
start = 1
batch_size = 29
# Batches used for the original run (start, end):
#(1, 30) - (30, 59) - (59, 88) - (88, 117) - (117, 146) - (146, 175) - (175, 204) - (204, 233) - (233, 262) - (262, 281)

# tag_list holds one tag per chunk file, so valid chunk indices are
# 1..len(tag_list); derive the exclusive upper bound from it instead of
# hard-coding the chunk count.
end = min(start + batch_size, len(tag_list) + 1)

print('Searching from {0} to {1}'.format(start, end-1))

for idx_file in range(start, end):
    # Keywords of this chunk, filtered for searching
    input_path = os.path.join(input_directory_keywords, 'chunk_{0}.txt'.format(idx_file))
    HF_keywords_in_HF_dict, keywords_list = get_HF_keywords_in_HF(input_path)

    output_path = os.path.join(output_directory, 'HF_keywords_in_HF_posts_{0}.json'.format(idx_file))
    tag = tag_list[idx_file-1]

    # Search in the extracted entities and in the tokenized posts, then merge and save
    struct_from_entity = start_searching(HF_keywords_in_HF_dict, tag, keywords_list)
    struct_from_posts = search_matches(input_path, output_path, df_HF)

    compare_struct_and_save(struct_from_entity, struct_from_posts, output_path)

    print()

## Save as ZIP

In [None]:
# Folder whose files are archived, and the archive to create.
source_directory = output_directory
zip_file_path = '/home/anon/working/HF_keywords_in_HF_posts.zip'

# Walk the folder recursively and store every file under its path relative to
# source_directory, using DEFLATE compression.
with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as archive:
    for folder, _subfolders, filenames in os.walk(source_directory):
        for filename in filenames:
            absolute_path = os.path.join(folder, filename)
            archive.write(absolute_path, os.path.relpath(absolute_path, source_directory))