 # Llama 3 Model Tokenizer Estimate

In [13]:
import pandas as pd
import tiktoken
import os
import glob
import logging
from IPython.display import display # For better DataFrame display in Jupyter

# --- Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Directory containing the anomaly CSV files.
# The path is relative to the working directory: it assumes this notebook runs
# from 'notebooks/' and the logs live in '../detector/anomaly_logs/'.
anomaly_logs_dir = '../detector/anomaly_logs/'

# --- Specify the CSV file to analyze ---
# Set to None to automatically use the most recently modified 'anomalies_*.csv' file.
# Set to a file name inside anomaly_logs_dir (e.g. 'anomalies_mock_connections.csv')
# to analyze that file specifically.
# specific_csv_filename = None
specific_csv_filename = 'anomalies_mock_connections.csv'  # Using the mock file

# --- Path Validation ---
analysis_dir_abs = os.path.abspath(anomaly_logs_dir)  # Absolute path, for unambiguous log messages
csv_file_path = None  # Stays None when no usable file is found; the analysis step checks it

# A missing directory is fatal: nothing below can work without it.
if not os.path.isdir(analysis_dir_abs):
    logging.error(f"Error: The anomaly logs directory does not exist: {analysis_dir_abs}")
    raise FileNotFoundError(f"Directory not found: {analysis_dir_abs}")

logging.info(f"Looking for anomaly CSV files in: {analysis_dir_abs}")

# --- Select CSV File ---
if specific_csv_filename:
    potential_path = os.path.join(analysis_dir_abs, specific_csv_filename)
    # isfile rather than exists: a directory with this name must not be accepted as the CSV.
    if os.path.isfile(potential_path):
        csv_file_path = potential_path
        logging.info(f"Using specified file: {csv_file_path}")
    else:
        logging.error(f"Specified file '{specific_csv_filename}' not found in {analysis_dir_abs}")
else:
    # No file specified: pick the most recently modified 'anomalies_*.csv'.
    try:
        csv_files = glob.glob(os.path.join(analysis_dir_abs, "anomalies_*.csv"))
        if csv_files:
            csv_file_path = max(csv_files, key=os.path.getmtime)
            logging.info(f"Found latest anomaly file: {csv_file_path}")
        else:
            logging.error(f"No 'anomalies_*.csv' files found in {analysis_dir_abs}")
    except OSError as e:
        # getmtime fails if a file vanishes or becomes unreadable between glob and stat.
        logging.error(f"Error finding latest anomaly file: {e}")
# --- Proceed only if a CSV file was found/specified ---
# csv_file_path is still None (falsy) when the selection step above found no usable file.
if csv_file_path:

    logging.info(f"Analyzing token count for: {os.path.basename(csv_file_path)}")

    # --- Set up the instruction prompt ---
    # This literal is passed to the token counter exactly as written, so the leading and
    # trailing newlines and the four-space indentation of each line are part of the string
    # and contribute to the instruction token count.
    instruction_prompt = """
    You are a highly skilled virtual cybersecurity analyst specializing in identifying
    and reporting anomalous connections within an ICS (Industrial Control System) or
    enterprise network environment.

    Your task is to analyze the following network connection data that has been flagged
    as anomalous by our baseline detection system and provide detailed insights.
    Your analysis will be included in security reports and reviewed by human experts.

    Task Overview:
    Analyze the following connection data from our network environment. Each group of data represents
    a connection conversation that was flagged as anomalous because:
    1. It contains an unknown device (OUI not in our baseline), OR
    2. It uses a protocol not in our allowed protocol baseline, OR
    3. Both of the above reasons

    Port notes: EPH is an ephemeral port (port > 1024) used by clients. Pay special attention to the protocols,
    outgoing and incoming ports, and manufacturers (MFGs).

    Response Requirements:
    For each connection group:
    1. Device Identification: Identify and describe the devices involved based on their manufacturer (MFG) names and MAC addresses.
    2. Communication Details: Specify the protocols used, IP addresses, and ports (both source and destination).
       Provide information about the purpose of the ports if known (e.g., 443 for HTTPS).
    3. Traffic Volume: Analyze the CNT field, which represents packet counts for each connection.
    4. Risk Assessment: Evaluate the risk level (Low, Medium, High, Critical) of these anomalous connections.
       Explain your reasoning based on the protocols, devices, and communication patterns.
    5. Recommendations: Suggest specific actions for security personnel (block, monitor, investigate, or allow).

    Format your response using clear headings and bullet points for readability. Security personnel will use
    your analysis to make decisions about these anomalous connections.
    """

    # --- Token Counting Setup ---
    # NOTE(review): the notebook is titled as a Llama 3 estimate, while this loads
    # tiktoken's "cl100k_base" encoding, so the counts are presumably an approximation
    # rather than exact Llama 3 token counts - confirm.
    try:
        enc = tiktoken.get_encoding("cl100k_base")
    except Exception as e:
        # Broad on purpose: any failure to load the encoding degrades to the
        # character-count fallback instead of aborting the analysis.
        logging.error(f"Failed to get tiktoken encoding: {e}")
        # Logged once here rather than on every count_tokens() call.
        logging.warning("Tiktoken encoding not available, returning character count as estimate.")
        enc = None

    def count_tokens(text):
        """Return the estimated number of tokens in ``text``.

        Args:
            text: The value to measure. Anything that is not a ``str`` is
                converted with ``str()`` first.

        Returns:
            The number of tokens produced by ``enc``. When no encoding is
            available, or encoding fails, the character count of the text is
            returned instead as a rough upper-bound estimate.
        """
        # Coerce before any fallback so len() never sees a non-string value.
        if not isinstance(text, str):
            text = str(text)
        if enc is None:
            return len(text)
        try:
            # disallowed_special=() makes text that happens to contain a special-token
            # string (e.g. "<|endoftext|>") encode as ordinary text instead of raising.
            return len(enc.encode(text, disallowed_special=()))
        except Exception as e:
            # Returning 0 here would silently shrink the totals; use the same
            # character-count fallback as the "no encoding" path.
            logging.error(f"Error encoding text for token count, falling back to character count: {e}")
            return len(text)

    # --- Calculate and Print Tokens ---

    # 1. Instruction Prompt Tokens
    # Counted once here; the full-CSV and per-group totals below add this value
    # to their own data token counts.
    instruction_tokens = count_tokens(instruction_prompt)
    prompt_report = "\n".join([
        "-" * 60,
        f"\033[1;34mInstruction Prompt:\033[0m (Tokens: {instruction_tokens})",
        "-" * 60,
    ])
    print(prompt_report)

    # 2. Prepare CSV Data and Groups
    groups = []  # One DataFrame per conversation group
    df_cleaned = pd.DataFrame()  # All non-separator rows; stays empty if reading fails

    try:
        # Rows whose fields are all empty (e.g. comma-only lines) are parsed as all-NaN
        # rows and act as separators between conversation groups. on_bad_lines='skip'
        # only drops malformed lines (too many fields); it does not remove separators.
        df = pd.read_csv(csv_file_path, on_bad_lines='skip')

        # --- Grouping Logic ---
        # Collect consecutive data rows; an all-NaN row closes the current group.
        current_group_rows = []
        for _, row in df.iterrows():
            if row.isnull().all():
                if current_group_rows:
                    groups.append(pd.DataFrame(current_group_rows))
                    current_group_rows = []
            else:
                current_group_rows.append(row.to_dict())
        # Flush the last group when the file does not end with a separator row.
        if current_group_rows:
            groups.append(pd.DataFrame(current_group_rows))
        # --- End Grouping Logic ---

        # Full cleaned DataFrame (needed for the full CSV token count).
        # dropna returns a new frame and df was not modified above, so the file
        # does not have to be read a second time.
        df_cleaned = df.dropna(how='all')

        if df_cleaned.empty:
            logging.warning(f"CSV file '{os.path.basename(csv_file_path)}' contains no data after removing bad/blank rows.")
        elif not groups:
            logging.warning(f"CSV file '{os.path.basename(csv_file_path)}' resulted in zero conversation groups after parsing, though data might exist.")
        else:
            logging.info(f"Parsed {len(groups)} conversation groups from the CSV.")

            # 3. Calculate Tokens for Full CSV Data
            # The text that gets tokenized is the to_string() rendering without the index.
            df_cleaned_string = df_cleaned.to_string(index=False)
            df_cleaned_tokens = count_tokens(df_cleaned_string)
            total_full_csv_tokens = instruction_tokens + df_cleaned_tokens

            print("\033[1;34mAnalysis of Sending ENTIRE CSV Content:\033[0m")

            # --- Display the entire DataFrame ---
            print("\nDataFrame used for ENTIRE CSV token count (all valid rows):")
            # Temporarily lift the display limits so the whole frame is rendered.
            with pd.option_context(
                'display.max_rows', None,     # Show all rows
                'display.max_columns', None,  # Show all columns
                'display.width', 1000,        # Wide enough for wide tables
            ):
                display(df_cleaned)  # display() renders better than print() in a notebook
            # --- End display block ---

            print(f"\nTokens in full cleaned CSV data: {df_cleaned_tokens}")
            print(f"\033[1;31mTotal estimated tokens (instruction + ENTIRE cleaned data): {total_full_csv_tokens}\033[0m")
            print("(This represents sending the whole file content at once)")
            print("-" * 60)

            # 4. Calculate Tokens for Each Group + Prompt (Streaming Simulation)
            print("\033[1;34mAnalysis of Sending Data per Conversation Group (Streaming Simulation):\033[0m")
            all_group_tokens = []  # instruction + group data tokens, one entry per group

            for group_number, group_df in enumerate(groups, start=1):
                group_string = group_df.to_string(index=False)
                group_data_tokens = count_tokens(group_string)
                total_group_prompt_tokens = instruction_tokens + group_data_tokens
                all_group_tokens.append(total_group_prompt_tokens)

                print(f"\n--- Group {group_number} ---")
                print("DataFrame for this group:")
                display(group_df)  # Show the group's rows for context
                print(f"\nTokens in group data: {group_data_tokens}")
                print(f"\033[1;32mTotal estimated tokens (instruction + Group {group_number} data): {total_group_prompt_tokens}\033[0m")

            # 5. Summary Statistics for Group Token Counts
            if all_group_tokens:
                min_group_token_count = min(all_group_tokens)
                max_group_token_count = max(all_group_tokens)
                avg_group_token_count = sum(all_group_tokens) / len(all_group_tokens)
                print("-" * 60)
                print("\033[1;34mSummary for Per-Group Token Counts:\033[0m")
                print(f"Number of Groups Analyzed: {len(all_group_tokens)}")
                print(f"Minimum Tokens (Prompt + 1 Group): {min_group_token_count}")
                print(f"Maximum Tokens (Prompt + 1 Group): {max_group_token_count}")
                print(f"Average Tokens (Prompt + 1 Group): {avg_group_token_count:.0f}")
            print("-" * 60)

    except pd.errors.EmptyDataError:
        # Raised by read_csv when the file has no columns to parse.
        logging.error(f"Error: CSV file '{os.path.basename(csv_file_path)}' is completely empty or unreadable after skipping bad lines.")
    except Exception as e:
        # Notebook-level boundary: report the failure instead of leaving a raw traceback.
        logging.error(f"Error processing CSV file '{os.path.basename(csv_file_path)}': {e}")

else:
    # This block executes if csv_file_path remained None after trying to find a file
    logging.error("No valid CSV file selected or found. Token counting cannot proceed.")

2025-04-09 17:12:27,434 - INFO - Looking for anomaly CSV files in: /Users/ptr/Documents/Projects/AnomaLLMy/detector/anomaly_logs
2025-04-09 17:12:27,435 - INFO - Using specified file: /Users/ptr/Documents/Projects/AnomaLLMy/detector/anomaly_logs/anomalies_mock_connections.csv
2025-04-09 17:12:27,437 - INFO - Analyzing token count for: anomalies_mock_connections.csv
2025-04-09 17:12:27,448 - INFO - Parsed 6 conversation groups from the CSV.


------------------------------------------------------------
[1;34mInstruction Prompt:[0m (Tokens: 399)
------------------------------------------------------------
[1;34mAnalysis of Sending ENTIRE CSV Content:[0m

DataFrame used for ENTIRE CSV token count (all valid rows):


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,TCP,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,443,112.0
1,TCP,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,443,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,98.0
2,TCP,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,80,3.0
3,TCP,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,80,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,1.0
5,UDP,f8:75:a4:dd:ee:ff,Dell Inc,192.168.1.55,EPH,aa:bb:cc:00:11:22,UNKNOWN,192.168.1.20,12345,5.0
6,UDP,aa:bb:cc:00:11:22,UNKNOWN,192.168.1.20,EPH,f8:75:a4:dd:ee:ff,Dell Inc,192.168.1.55,54321,2.0
8,UDP,b8:27:eb:11:22:33,Raspberry Pi Foundation,192.168.1.30,5353,01:00:5e:00:00:fb,MULTICAST,224.0.0.251,5353,15.0
9,UDP,c2:30:6b:44:55:66,"Hangzhou H3C Technologies Co., Limited",192.168.1.88,5353,01:00:5e:00:00:fb,MULTICAST,224.0.0.251,5353,8.0
11,ARP,00:15:17:77:88:99,Intel Corporate,192.168.1.1,,12:34:56:78:9a:bc,UNKNOWN,192.168.1.150,,2.0
12,ARP,12:34:56:78:9a:bc,UNKNOWN,192.168.1.150,,00:15:17:77:88:99,Intel Corporate,192.168.1.1,,1.0



Tokens in full cleaned CSV data: 903
[1;31mTotal estimated tokens (instruction + ENTIRE cleaned data): 1302[0m
(This represents sending the whole file content at once)
------------------------------------------------------------
[1;34mAnalysis of Sending Data per Conversation Group (Streaming Simulation):[0m

--- Group 1 ---
DataFrame for this group:


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,TCP,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,443,112.0
1,TCP,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,443,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,98.0
2,TCP,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,80,3.0
3,TCP,de:ad:be:ef:00:01,UNKNOWN,104.20.15.1,80,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,EPH,1.0



Tokens in group data: 268
[1;32mTotal estimated tokens (instruction + Group 1 data): 667[0m

--- Group 2 ---
DataFrame for this group:


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,UDP,f8:75:a4:dd:ee:ff,Dell Inc,192.168.1.55,EPH,aa:bb:cc:00:11:22,UNKNOWN,192.168.1.20,12345,5.0
1,UDP,aa:bb:cc:00:11:22,UNKNOWN,192.168.1.20,EPH,f8:75:a4:dd:ee:ff,Dell Inc,192.168.1.55,54321,2.0



Tokens in group data: 145
[1;32mTotal estimated tokens (instruction + Group 2 data): 544[0m

--- Group 3 ---
DataFrame for this group:


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,UDP,b8:27:eb:11:22:33,Raspberry Pi Foundation,192.168.1.30,5353,01:00:5e:00:00:fb,MULTICAST,224.0.0.251,5353,15.0
1,UDP,c2:30:6b:44:55:66,"Hangzhou H3C Technologies Co., Limited",192.168.1.88,5353,01:00:5e:00:00:fb,MULTICAST,224.0.0.251,5353,8.0



Tokens in group data: 160
[1;32mTotal estimated tokens (instruction + Group 3 data): 559[0m

--- Group 4 ---
DataFrame for this group:


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,ARP,00:15:17:77:88:99,Intel Corporate,192.168.1.1,,12:34:56:78:9a:bc,UNKNOWN,192.168.1.150,,2.0
1,ARP,12:34:56:78:9a:bc,UNKNOWN,192.168.1.150,,00:15:17:77:88:99,Intel Corporate,192.168.1.1,,1.0



Tokens in group data: 147
[1;32mTotal estimated tokens (instruction + Group 4 data): 546[0m

--- Group 5 ---
DataFrame for this group:


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,ICMP,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,,00:15:17:77:88:99,Intel Corporate,192.168.1.1,,4.0
1,ICMP,00:15:17:77:88:99,Intel Corporate,192.168.1.1,,a0:d3:c1:aa:bb:cc,"Apple, Inc.",192.168.1.150,,4.0



Tokens in group data: 151
[1;32mTotal estimated tokens (instruction + Group 5 data): 550[0m

--- Group 6 ---
DataFrame for this group:


Unnamed: 0,PROTOCOL,SRCMAC,SRCMFG,SRCIP,SRCPORT,DSTMAC,DSTMFG,DSTIP,DSTPORT,CNT
0,TCP,00:50:56:ab:cd:ef,VMware Inc,10.0.0.5,EPH,f0:09:0d:12:34:56,Arista Networks Inc,10.0.0.10,9999,25.0
1,TCP,f0:09:0d:12:34:56,Arista Networks Inc,10.0.0.10,9999,00:50:56:ab:cd:ef,VMware Inc,10.0.0.5,EPH,18.0



Tokens in group data: 155
[1;32mTotal estimated tokens (instruction + Group 6 data): 554[0m
------------------------------------------------------------
[1;34mSummary for Per-Group Token Counts:[0m
Number of Groups Analyzed: 6
Minimum Tokens (Prompt + 1 Group): 544
Maximum Tokens (Prompt + 1 Group): 667
Average Tokens (Prompt + 1 Group): 570
------------------------------------------------------------
