In [1]:
import pandas as pd
df = pd.read_csv('dataset/synthetic_logs.csv')
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [3]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [35]:
# Import required libraries
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm  # for better progress tracking

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define batch size
BATCH_SIZE = 32  # You can adjust this based on your memory constraints

def batch_encode(texts, model, batch_size=32):
    """
    Encode texts in batches to prevent memory issues
    """
    embeddings = []
    # Process in batches with progress bar
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        try:
            batch_embeddings = model.encode(batch, show_progress_bar=False)
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"Error processing batch {i}: {e}")
            continue
    return np.array(embeddings)

# Generate embeddings for log messages in batches
try:
    embeddings = batch_encode(df['log_message'].tolist(), model, BATCH_SIZE)
    print(f"Generated embeddings shape: {embeddings.shape}")
    print("First two embeddings:")
    print(embeddings[:2])
except Exception as e:
    print(f"An error occurred: {e}")

  7%|█████▍                                                                             | 5/76 [00:52<12:22, 10.45s/it]


KeyboardInterrupt: 

In [41]:
# Apply DBSCAN clustering on embeddings
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Add cluster labels to the dataframe
df['cluster'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [21]:
pip install sentence-transformers

Collecting sentence-transformersNote: you may need to restart the kernel to use updated packages.

  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Downloading huggingface_hub-0.29.2-py3-none-any.whl (468 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
   -------------------------------------

In [43]:
df[df.cluster==1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,bert,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,bert,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,bert,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,bert,1


In [51]:
# Import required libraries
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.cluster import DBSCAN

# Assuming you have your embeddings and df with log_messages ready
# If you don't have embeddings, you'll need to create them first
# embeddings = ... # Your embedding creation code here

# Run DBSCAN to generate labels
clustering = DBSCAN(eps=0.5, min_samples=5).fit(embeddings)
labels = clustering.labels_

# Create a dictionary of cluster sizes
cluster_sizes = Counter(labels)

# Sort clusters by size (excluding noise points labeled as -1)
sorted_clusters = {k: v for k, v in sorted(cluster_sizes.items(), 
                                         key=lambda x: x[1], 
                                         reverse=True) 
                  if k != -1}

# Print cluster sizes and sample messages
print("Clusters sorted by size:")
for cluster_id, size in sorted_clusters.items():
    if size > 10:  # Only show clusters with more than 10 records
        print(f"\nCluster {cluster_id}: {size} records")
        print("Sample messages from this cluster:")
        
        # Get indices of messages in this cluster
        cluster_indices = np.where(labels == cluster_id)[0]
        
        # Print 5 sample messages
        print("\n".join(f"{i+1}. {msg}" for i, msg in 
              enumerate(df['log_message'].iloc[cluster_indices[:5]])))
        print("-" * 80)

Clusters sorted by size:

Cluster 0: 809 records
Sample messages from this cluster:
1. nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
2. nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
3. nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
4. nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54

In [79]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return None

In [59]:
classify_with_regex('Account with iD 5351 created by User634.')

'User Action'

In [81]:
log_exaample = 'User User395 Logged In'
print(classify_with_regex(log_exaample))

None


In [89]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)
df[df.regex_label.notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


In [97]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 7)

In [101]:
# Group by target_labels and count occurrences
label_counts = df_non_regex['target_label'].value_counts()

# Filter labels with 5 or fewer rows
small_labels = label_counts[label_counts <= 5]

# Print the results
print("Target labels with 5 or fewer rows:")
print(small_labels)

# Optional: To see the actual rows for each of these labels
for label in small_labels.index:
    print(f"\nRows for label '{label}':")
    print(df_non_regex[df_non_regex['target_label'] == label])

Target labels with 5 or fewer rows:
target_label
Workflow Error         4
Name: count, dtype: int64

Rows for label 'Workflow Error':
                timestamp     source  \
60    2025-10-06 16:55:23  LegacyCRM   
377   2025-06-24 12:16:29  LegacyCRM   
1325  2025-04-17 07:33:44  LegacyCRM   
2217  2025-05-12 09:46:54  LegacyCRM   

                                            log_message    target_label  \
60    Lead conversion failed for prospect ID 7842 du...  Workflow Error   
377   Customer follow-up process for lead ID 5621 fa...  Workflow Error   
1325  Escalation rule execution failed for ticket ID...  Workflow Error   
2217  Task assignment for TeamID 3425 could not comp...  Workflow Error   

     complexity  cluster regex_label  
60          llm       24        None  
377         llm       62        None  
1325        llm      105        None  
2217        llm      133        None  

                timestamp     source  \
255   2025-05-03 16:55:35  LegacyCRM   
1734  2025-04

In [110]:
df_non_legacy = df[df_non_regex!='LegacyCRM']
df_non_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem', nan,
       'ThirdPartyAPI'], dtype=object)

In [112]:
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())
embeddings[:2]

KeyboardInterrupt: 

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Prepare the data
X = embeddings_filtered  # Your embeddings
y = df_non_legacy['target_label'].values  # Your target labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, multi_class='auto')
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

# Print the results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'embeddings_filtered' is not defined