In [None]:
import datetime
import pandas as pd
import json
import time
import re
import os
import zipfile
from collections import defaultdict
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Aliases material
import csv

def read_alias_from_csv(filename):
    """Load alias rows from a CSV file, upper-casing the first field of each row.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(data, first_elem)`` tuple. ``data`` is the list of non-empty rows
        (each a list of strings) whose element 0 has been upper-cased;
        ``first_elem`` is the list of those upper-cased first fields, in file
        order.
    """
    data, first_elem = [], []

    # newline="" lets the csv module handle line endings itself
    with open(filename, "r", newline="") as file:
        for row in csv.reader(file):
            # csv.reader yields [] for blank lines; indexing row[0] on such a
            # row would raise IndexError, so skip them.
            if not row:
                continue
            row[0] = row[0].upper()
            data.append(row)
            first_elem.append(row[0])
    return data, first_elem

# Threat-actor aliases: full rows plus the upper-cased first field of each row.
actor_groups_aliases, actor_first_elem_list = read_alias_from_csv(
    '/home/anon/input/threat-aliases/threat_actors_aliases.csv'
)
# Malware-family aliases, loaded the same way.
malware_families_aliases, malware_first_elem_list = read_alias_from_csv(
    '/home/anon/input/threat-aliases/threat_names_aliases.csv'
)

## Generate complete list of REP keywords

In [None]:
# Entity tags whose keyword lists are merged ("LOC" and "TIME" are left out).
tags = ["APT", "SECTEAM", "IDTY", "OS", "EMAIL", "IP", "DOM", "URL", "PROT",
        "FILE", "TOOL", "MD5", "SHA1", "SHA2", "MAL", "ENCR", "VULNAME", "VULID", "ACT"]

# Collect the stripped lines of every per-tag list into one de-duplicated set.
complete_list = set()
for t in tags:
    file_path = f'/home/anon/input/datasets-extracted-entities/REP_extracted_entities/entity_lists/{t}_list.txt'
    with open(file_path, 'r', encoding='utf-8') as file:
        complete_list.update(line.strip() for line in file)

# Longest keywords first; ties are broken alphabetically.
complete_list = sorted(complete_list, key=lambda keyword: (-len(keyword), keyword))
len(complete_list)

In [None]:
chunk_size = 500  # number of keywords written to each chunk file


def save_chunk(output_directory, count, chunk):
    """Write ``chunk`` to ``chunk_<count>.txt`` inside ``output_directory``.

    The items of ``chunk`` are joined with newlines (no trailing newline) and
    written as UTF-8, replacing any existing file of the same name.
    """
    destination = os.path.join(output_directory, f"chunk_{count}.txt")
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(chunk))
        
output_directory = 'chunks_complete_entity_list'
# exist_ok avoids the separate existence check before creating the directory.
os.makedirs(output_directory, exist_ok=True)

# Write complete_list as consecutive files of at most chunk_size keywords,
# numbered from 1. idx_file is incremented only when a file is actually
# written, so the reported total is exact even when len(complete_list) is a
# multiple of chunk_size (or zero).
idx_file = 0
for offset in tqdm(range(0, len(complete_list), chunk_size)):
    idx_file += 1
    save_chunk(output_directory, idx_file, complete_list[offset:offset + chunk_size])
print('Total files:', idx_file)

# Also keep the whole sorted list in a single file next to the chunks.
output_file = os.path.join(output_directory, "complete_list.txt")
with open(output_file, 'w', encoding='utf-8') as file:
    file.write('\n'.join(complete_list))

## Search REP keywords in HF posts

In [None]:
def read_lines_from_file(file_path):
    """Lazily yield each line of a UTF-8 text file with surrounding whitespace stripped.

    Blank lines are yielded as empty strings. The file is opened when the
    generator is first advanced.
    """
    with open(file_path, encoding='utf-8') as handle:
        yield from (raw_line.strip() for raw_line in handle)

def create_set_from_file(file_path):
    """Return the distinct stripped lines of ``file_path`` as a set."""
    return {entry for entry in read_lines_from_file(file_path)}

def create_bulk_search_pattern(keywords_list):
    """Compile one regex that finds any of the given keywords as a delimited token.

    A keyword matches when it is preceded by start-of-text or whitespace and
    followed by whitespace or end-of-text; non-alphanumeric characters may
    surround it. Keywords are lower-cased, so the text being searched is
    expected to be lower-cased as well.

    Args:
        keywords_list: Iterable of keyword strings. Empty strings are ignored.

    Returns:
        A compiled pattern with three groups; group 2 (index 1 of each
        ``findall`` tuple) is the matched keyword. If no usable keyword is
        given, the pattern never matches.
    """
    # An empty keyword would add an empty alternation branch that matches
    # almost anywhere, so drop it. Lower-case before sorting so the
    # longest-first order applies to the strings that are actually compiled.
    keywords = {word.lower() for word in keywords_list if word}
    ordered = sorted(keywords, key=lambda x: (-len(x), x))

    # "(?!)" can never match: used when there is nothing to search for.
    alternation = "|".join(re.escape(word) for word in ordered) if ordered else "(?!)"

    # The trailing boundary is a lookahead so the separator is not consumed;
    # otherwise a keyword directly following another match would lack its
    # leading whitespace and be missed. The group inside the lookahead keeps
    # the three-group layout.
    keyword_regex = (
        r"(^|\s)[^a-zA-Z0-9]*(" + alternation + r")[^a-zA-Z0-9]*(?=(\s|$))"
    )
    return re.compile(keyword_regex)

def search_matches_in_chunk(chunk, bulk_search_pattern):
    """Find keyword occurrences in the tokenized content of each post of a chunk.

    Args:
        chunk: DataFrame slice with at least the columns ``ID``, ``date`` and
            ``tokenizedContent``.
        bulk_search_pattern: Compiled pattern whose ``findall`` results are
            tuples with the matched keyword at index 1.

    Returns:
        A ``defaultdict(list)`` mapping each matched keyword to a list of
        ``{'ID': ..., 'date': ...}`` dicts, one per occurrence (a post appears
        several times if the keyword occurs several times in it).
    """
    word_matching = defaultdict(list)
    for post in chunk.itertuples(index=False):
        # Only the tokenized content is searched. The thread title is not
        # read here: it was never used, and lower-casing it failed for posts
        # without a string title.
        content = post.tokenizedContent.lower()

        for groups in bulk_search_pattern.findall(content):
            # content is already lower-cased, so the captured keyword is too
            word_matching[groups[1]].append({
                'ID': post.ID,
                'date': post.date
            })
    return word_matching

def _dedupe_posts_by_id(posts):
    """Return ``posts`` keeping only the first entry for each distinct ``'ID'``."""
    seen_ids = set()
    unique_posts = []
    for post in posts:
        post_id = post["ID"]
        if post_id not in seen_ids:
            seen_ids.add(post_id)
            unique_posts.append(post)
    return unique_posts


def search_matches(input_file_path, output_file_path, df_HF):
    """Search the keywords of one keyword file in all posts and save the hits as JSON.

    Args:
        input_file_path: Text file with one keyword per line.
        output_file_path: Destination of the JSON result.
        df_HF: DataFrame of posts; it is split into slices of 1000 rows that
            are passed to ``search_matches_in_chunk``.

    Side effects:
        Prints progress and statistics, and writes to ``output_file_path`` a
        JSON list of ``{'keyword': ..., 'matching posts': [...]}`` entries,
        where each post appears at most once per keyword.
    """
    # Load keywords of reports
    keywords_list = create_set_from_file(input_file_path)
    print('\nNumber of report keywords: ', len(keywords_list))

    print(datetime.datetime.now())
    start = time.time()
    print("Start searching for matches in file", input_file_path)

    # Lower-case the keywords; matching is done on lower-cased text
    keywords_list_lower = {word.lower() for word in keywords_list}

    # Compile the bulk search pattern
    print('Create regex pattern')
    bulk_search_pattern = create_bulk_search_pattern(keywords_list_lower)

    # Divide the dataframe into smaller chunks, one task per chunk
    posts_per_chunk = 1000
    df_chunks = [
        df_HF[i:i + posts_per_chunk]
        for i in range(0, df_HF.shape[0], posts_per_chunk)
    ]

    # Submit every chunk, then merge the per-chunk results in submission order
    word_matching_dict = defaultdict(list)
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(search_matches_in_chunk, chunk, bulk_search_pattern)
            for chunk in df_chunks
        ]
        for future in tqdm(futures, total=len(futures), desc='Processing Chunks'):
            for word, matches in future.result().items():
                word_matching_dict[word].extend(matches)

    # Convert word_matching_dict to a list of dictionaries
    word_matching = [
        {'keyword': word, 'matching posts': matches}
        for word, matches in word_matching_dict.items()
    ]

    end = time.time()
    print(datetime.datetime.now())
    print("Time elapsed: {} minutes".format((end - start) / 60))

    # Share of keywords that matched at least one post
    total_keywords = len(keywords_list)
    print(f"The total number of keywords is: {total_keywords}")
    non_empty_lists_count = sum(1 for entry in word_matching if entry['matching posts'])
    # An empty keyword file would otherwise raise ZeroDivisionError here,
    # before the (empty) result is saved.
    percentage = (non_empty_lists_count / total_keywords) * 100 if total_keywords else 0.0
    print(f"The percentage of non-empty lists associated with keywords is: {percentage}%")

    # Keep each post only once per keyword (first occurrence wins)
    for item in word_matching:
        item["matching posts"] = _dedupe_posts_by_id(item["matching posts"])

    # Save the result
    print("Saving json to %s" % output_file_path)
    with open(output_file_path, 'w') as file:
        json.dump(word_matching, file, indent=4)

In [None]:
# Directory that receives one JSON result file per keyword chunk.
output_directory = '/home/anon/working/chunks_REP_keywords_in_HF_posts/'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Inputs: the keyword chunk files and the JSON file with the HF posts.
input_directory_keywords = '/home/anon/working/chunks_complete_entity_list'
input_path_HF = '/home/anon/input/datasets-extracted-entities/HF_extracted_entities/HF_extracted_entities.json'

In [None]:
# Load HF posts
df_HF = pd.read_json(input_path_HF)

# Sort chronologically while 'date' is still a datetime. Sorting after the
# conversion to 'MM-DD-YYYY' strings would order the posts lexicographically
# (by month first), not by time.
df_HF['date'] = pd.to_datetime(df_HF['date'])
df_HF = df_HF.sort_values(by='date')
df_HF['date'] = df_HF['date'].dt.strftime('%m-%d-%Y')

# Convert title and content columns to lowercase for efficient matching
df_HF['threadTitle'] = df_HF['threadTitle'].str.lower()
df_HF['flatContent'] = df_HF['flatContent'].str.lower()
print('Number of relevant posts: ', df_HF.shape[0])

In [None]:
# Tokens treated as URLs: anything starting with http, https, www, h**p or h**ps.
url_pattern = r'http\S+|www\S+|https\S+|h\*\*p\S+|h\*\*ps\S+'  # Starting with http , https , www, h**p, h**ps
# Separators used to split non-URL text: whitespace runs or runs of the listed punctuation.
delimiter_pattern = r'\s+|[:,;?!{}\[\]=|]+'  # To split


def process_content(row):
    """Tokenize a post's content while keeping URLs intact.

    The items of ``row['content']`` are joined with spaces and lower-cased
    (assumes ``row['content']`` is an iterable of strings; confirm against the
    dataset). URL-like spans are kept whole; all other text is split on
    ``delimiter_pattern``. The resulting tokens are joined with single spaces,
    so adjacent delimiters leave empty tokens and therefore repeated spaces.
    This mirrors the splitting used when keywords were extracted on the HF side.
    """
    lowered = ' '.join(row['content']).lower()

    # The capturing group makes re.split return the URLs along with the text between them
    segments = (segment.strip() for segment in re.split(f"({url_pattern})", lowered))

    output_tokens = []
    for segment in segments:
        if not segment:
            continue  # drop segments that are empty after trimming
        if re.match(url_pattern, segment):
            output_tokens.append(segment)
        else:
            output_tokens += re.split(delimiter_pattern, segment)

    return ' '.join(output_tokens)

# Progress bar with one tick per post
pbar = tqdm(total=len(df_HF))


def apply_process_content(row):
    """Advance the progress bar by one, then return ``process_content(row)``."""
    pbar.update(1)
    return process_content(row)


# Tokenize every post row by row and store the result in a new column
df_HF['tokenizedContent'] = df_HF.apply(apply_process_content, axis=1)

# Finish the progress bar
pbar.close()

In [None]:
# Batches of chunk indices processed per run (end is exclusive):
# (1, 154) - (154, 307) - (307, 460) - (460, 613) - (613, 691)
start = 1
end = min(start + 153, 691)  # never go past 691

print('Searching from {0} to {1}'.format(start, end-1))

for idx_file in range(start, end):
    # Keyword chunk to read and result file to write for this index
    iFilename = 'chunk_{0}.txt'.format(idx_file)
    oFilename = 'REP_keywords_in_HF_posts_{0}.json'.format(idx_file)
    input_path = os.path.join(input_directory_keywords, iFilename)
    output_path = os.path.join(output_directory, oFilename)

    search_matches(input_path, output_path, df_HF)

    print()

## Save as ZIP

In [None]:
# Folder whose files are archived, and the archive to create
source_directory = output_directory
zip_file_path = '/home/anon/working/REP_keywords_in_HF_posts.zip'

# Walk the folder recursively and store every file under its path relative to the folder
with zipfile.ZipFile(zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for current_dir, _subdirs, filenames in os.walk(source_directory):
        for filename in filenames:
            absolute_path = os.path.join(current_dir, filename)
            archive_name = os.path.relpath(absolute_path, source_directory)
            zipf.write(absolute_path, archive_name)