 # Llama 3 Model Tokenizer Estimate

In [15]:
import pandas as pd
import tiktoken
import os

# To cross-check token counts, use the Hugging Face tokenizer playground: https://huggingface.co/spaces/Xenova/the-tokenizer-playground

# Base directory: the process's current working directory (expected to be the
# folder that holds this notebook).
current_dir = os.getcwd()

# Location of the mock connection data, expressed relative to the base
# directory and then collapsed into an absolute, normalized path.
csv_file_path = os.path.join(current_dir, '../../data/raw/mock_connections.csv')
csv_file_path = os.path.abspath(csv_file_path)

# Instruction text whose token count is added to the data's token count below.
# The literal begins and ends with a newline; both are part of the string and
# therefore part of the measured token count.
instruction_prompt = """
You are a highly skilled virtual cybersecurity analyst specializing in identifying and reporting anomalous connections within an ICS (Industrial Control System) environment. Your task is to analyze the following connection data and provide detailed insights for inclusion in our security reports. Your analysis will be reviewed by human experts.

Task Overview:
Analyze the following connection data from an ICS environment. Each group of data represents a connection conversation. Some IP addresses may be from local private networks, while others may be external, potentially indicating internet-based communication.

Port notes: EPH is an ephemeral port used by devices in an ICS environment. Pay attention to the outgoing and incoming ports. Provide context to port usage if possible.

Response Requirements:
For each connection conversation:
1. Device Identification: Identify and describe the devices involved, including their manufacturer (MFG) names and MAC addresses.
2. Communication Details: Specify the MFG names of the communicating devices, their IP addresses, and the ports used for outgoing and incoming traffic. Provide information about the ports used if possible.
3. Traffic Volume: Calculate the total sum of (CNT) packets exchanged between the devices.
4. Implications and Concerns: Analyze what is happening within each conversation. Explain if it needs further research. Note: If these connections are in the csv file it means that they weren't a part of the baseline/allowlist. They were picked up as an anomaly.
"""

# Shared tokenizer for every count in this cell.
# NOTE(review): "cl100k_base" is a tiktoken encoding name; it presumably only
# approximates the Llama 3 tokenizer named in the notebook title - confirm
# against the model's own counts.
enc = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return the number of tokens that `enc` produces for `text`."""
    return len(enc.encode(text))

# --- Instruction prompt ------------------------------------------------------
# Measure the fixed instruction text and show both the text and its size.
instruction_tokens = count_tokens(instruction_prompt)
print(f"\033[4;37mInstruction Prompt:\033[0m\n {instruction_prompt}")
print(f"\033[36mNumber of tokens in instruction prompt: {instruction_tokens}\033[0m")

# --- Full connection data ----------------------------------------------------
# Read the CSV, then discard only those rows in which every column is NaN;
# rows with at least one value are kept.
df = pd.read_csv(csv_file_path)
df_cleaned = df.dropna(axis=0, how='all')

# The token counter works on text, so render the whole table as one string
# and measure it, alone and together with the instruction prompt.
df_cleaned_string = df_cleaned.to_string()
df_cleaned_tokens = count_tokens(df_cleaned_string)
total_cleaned_tokens = df_cleaned_tokens + instruction_tokens

# Report: the rendered table, its token count, and the combined total.
print(f"\n\033[4;37mCleaned DataFrame (All rows after removing fully empty rows):\033[0m\n{df_cleaned_string}")
print(f"\n\033[33mNumber of tokens in the cleaned DataFrame (no fully empty rows): {df_cleaned_tokens}\033[0m")
print(f"\033[32mTotal number of tokens (instruction + cleaned full DataFrame): {total_cleaned_tokens}\033[0m")

# --- Two-row sample ----------------------------------------------------------
# Take the first two rows that survived cleaning and render them as text.
df_first_2_cleaned_rows = df_cleaned.head(n=2)
df_first_2_cleaned_rows_string = df_first_2_cleaned_rows.to_string()

# Size of the sample alone, and of the sample plus the instruction prompt.
df_first_2_cleaned_rows_tokens = count_tokens(df_first_2_cleaned_rows_string)
first_2_cleaned_rows_total_tokens = df_first_2_cleaned_rows_tokens + instruction_tokens

# Report: the sample rows, their token count, and the combined total.
print(f"\n\033[4;37mFirst 2 Rows of Cleaned Connection Data:\033[0m\n{df_first_2_cleaned_rows_string}")
print(f"\n\033[33mNumber of tokens in the first 2 rows of cleaned connection data: {df_first_2_cleaned_rows_tokens}\033[0m")
print(f"\033[32mTotal number of tokens (instruction + first 2 rows of cleaned connection data): {first_2_cleaned_rows_total_tokens}\033[0m")


[4;37mInstruction Prompt:[0m
 
You are a highly skilled virtual cybersecurity analyst specializing in identifying and reporting anomalous connections within an ICS (Industrial Control System) environment. Your task is to analyze the following connection data and provide detailed insights for inclusion in our security reports. Your analysis will be reviewed by human experts.

Task Overview:
Analyze the following connection data from an ICS environment. Each group of data represents a connection conversation. Some IP addresses may be from local private networks, while others may be external, potentially indicating internet-based communication.

Port notes: EPH is an ephemeral port used by devices in an ICS environment. Pay attention to the outgoing and incoming ports. Provide context to port usage if possible.

Response Requirements:
For each connection conversation:
1. Device Identification: Identify and describe the devices involved, including their manufacturer (MFG) names and MAC a

 # Using Ollama to get LLM generation data, including prompt_eval_count (the number of tokens in the prompt)

In [13]:
import pandas as pd
import tiktoken
from transformers import LlamaTokenizer
import os
from langchain.llms import Ollama

# LangChain wrapper for the "llama3.2:3b" model; only the model name is set
# here, so every other connection/generation setting uses the wrapper's default.
ollama = Ollama(model="llama3.2:3b")  # Initialize Ollama with a specific model

# Prompt passed to the model. It contains only the instructions - no connection
# rows from the CSV are appended - so the model receives no data to analyze.
text = """
You are a highly skilled virtual cybersecurity analyst specializing in identifying and reporting anomalous connections within an ICS (Industrial Control System) environment. Your task is to analyze the following connection data and provide detailed insights for inclusion in our security reports. Your analysis will be reviewed by human experts.

Task Overview:
Analyze the following connection data from an ICS environment. Each group of data represents a connection conversation. Some IP addresses may be from local private networks, while others may be external, potentially indicating internet-based communication.

Port notes: EPH is an ephemeral port used by devices in an ICS environment. Pay attention to the outgoing and incoming ports. Provide context to port usage if possible.

Response Requirements:
For each connection conversation:
1. Device Identification: Identify and describe the devices involved, including their manufacturer (MFG) names and MAC addresses.
2. Communication Details: Specify the MFG names of the communicating devices, their IP addresses, and the ports used for outgoing and incoming traffic. Provide information about the ports used if possible.
3. Traffic Volume: Calculate the total sum of (CNT) packets exchanged between the devices.
4. Implications and Concerns: Analyze what is happening within each conversation. Explain if it needs further research. Note: If these connections are in the csv file it means that they weren't a part of the baseline/allowlist. They were picked up as an anomaly.

"""  # Input text

# Generate a completion for the prompt. `generate` takes a list of prompts and
# returns a result object holding one list of generations per prompt.
# No tokenizer argument is passed: LangChain's Ollama wrapper documents no
# `token_encoder` parameter, and an unknown keyword would only be forwarded to
# the server as an unrecognised model option.
response = ollama.generate([text])

# Print the full response object to explore its structure
print(response)

# `generations` is a list (one entry per prompt) of lists (candidates for that
# prompt), so the first completion lives at [0][0]; its text is
# `first_generation.text`.
first_generation = response.generations[0][0]

# Run statistics reported by the Ollama server are attached to the generation
# as a dict. It may be absent, so fall back to an empty dict and use .get()
# rather than raising when a field is missing.
generation_info = first_generation.generation_info or {}

# prompt_eval_count is the server-side count of tokens in the prompt - the
# figure this cell exists to obtain.
print(f"prompt_eval_count: {generation_info.get('prompt_eval_count')}")


generations=[[GenerationChunk(text="I'll analyze the connection data provided and provide detailed insights for inclusion in your security reports.\n\nConnection Data:\n\n1. Device Identification:\n\t* Device A: Manufacturer (MFG) = Siemens, MAC Address = 00:11:22:33:44:55\n\t* Device B: Manufacturer (MFG) = Rockwell Automation, MAC Address = 77:88:99:00:11:22\n\n2. Communication Details:\n\t* Outgoing Port: EPH ( ephemeral port ) - Port range not provided\n\t* Incoming Port: 8050 ( TCP )\n\t* IP Address: 192.168.1.100 ( Local private network )\n\n3. Traffic Volume:\n\t* CNT ( Packets exchanged ): 12\n\n4. Implications and Concerns:\nThis connection appears to be an internal communication between Device A and Device B within the ICS environment. The use of EPH for outgoing traffic suggests that this might not be a legitimate, expected communication. The incoming port is 8050, which could potentially be used by Rockwell Automation's control software.\n\nFurther research is warranted to 

OSError: llama is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`