In [1]:
%pip install -r requirements.txt

Collecting streamlit==1.1.0 (from -r requirements.txt (line 3))
  Using cached streamlit-1.1.0-py2.py3-none-any.whl.metadata (1.1 kB)
Using cached streamlit-1.1.0-py2.py3-none-any.whl (8.3 MB)
Installing collected packages: streamlit
  Attempting uninstall: streamlit
    Found existing installation: streamlit 1.37.1
    Uninstalling streamlit-1.37.1:
      Successfully uninstalled streamlit-1.37.1
Successfully installed streamlit-1.1.0
Note: you may need to restart the kernel to use updated packages.


# # Web Traffic Log-Based Q&A System
# 
# This project uses web traffic logs to create a question-answering system powered by a Retrieval-Augmented Generation (RAG) model. The system processes logs, stores them in a vector database, and uses two different language models (LLaMA 3 and Google T5) to generate answers to user queries.


Imports and Setup

In [2]:
import re
"""
Import necessary libraries and modules for the Q&A bot notebook.
This code imports the following libraries and modules:
- re: Regular expression operations.
- pandas: Data manipulation and analysis.
- numpy: Numerical computing.
- datetime: Date and time manipulation.
- faiss: Efficient similarity search and clustering of dense vectors.
- tqdm: Progress bar for loops and tasks.
- langchain.vectorstores: Vector stores for language embeddings.
- langchain_huggingface: Hugging Face embeddings for language models.
- langchain.docstore: Document store for storing and retrieving documents.
- langchain.schema: Schema for defining document structure.
- transformers: State-of-the-art natural language processing models.
- langchain_huggingface: Hugging Face pipeline for language models.
- langchain.chains: Retrieval-based question answering model.
The code also checks for GPU availability, clears the GPU cache, and sets the device to either "cuda" or "cpu" based on availability.
Note: The code assumes that the necessary libraries and modules are already installed.
"""
import pandas as pd
import numpy as np
from datetime import datetime
import faiss
from tqdm.notebook import tqdm
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Check for GPU availability
import torch
# Ask PyTorch to release its cached GPU memory once, before continuing.
torch.cuda.empty_cache()

# Select the compute device: "cuda" when CUDA is usable, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")





Log parsing and preprocessing

In [None]:
import pandas as pd
from langchain.document_loaders import CSVLoader
import re
# The following code handles the loading, parsing, and preprocessing of web traffic logs. The logs are parsed to extract useful fields, and the data is then cleaned and structured for further use.

# Function to parse the page content
def parse_page_content(page_content):
    """
    Extract the individual fields of one formatted log record.

    The record is expected to contain consecutive ``Label: value`` lines in
    this order: IP, Identity, User, Timestamp, Request, Status, Size,
    Referer, User-Agent.

    Args:
        page_content (str): Text of a single log record.

    Returns:
        dict: The captured groups ``ip``, ``identity``, ``user``, ``datetime``,
        ``method``, ``url``, ``status``, ``size``, ``referer`` and
        ``user_agent`` (all as strings) when the record matches the layout;
        an empty dict when it does not.
    """
    # One fragment per log line; joined they form a single expression.
    field_patterns = (
        r'IP: (?P<ip>[\d\.]+)\n',
        r'Identity: (?P<identity>.+)\n',
        r'User: (?P<user>.+)\n',
        r'Timestamp: (?P<datetime>.+)\n',
        r'Request: (?P<method>\w+) (?P<url>.+) HTTP/\d\.\d\n',
        r'Status: (?P<status>\d+)\n',
        r'Size: (?P<size>\d+)\n',
        r'Referer: (?P<referer>.+)\n',
        r'User-Agent: (?P<user_agent>.+)',
    )
    found = re.search(''.join(field_patterns), page_content)
    return found.groupdict() if found else {}

# Function to downsample the CSV file
def downsample_csv(csv_file_path, sample_size, output_file_path, random_state=None):
    """
    Write a (possibly down-sampled) copy of a CSV file.

    Args:
        csv_file_path (str): Path of the CSV file to read.
        sample_size (int | None): Number of rows to keep. When falsy
            (``None`` or ``0``) or not smaller than the number of rows in the
            file, every row is written unchanged.
        output_file_path (str): Path the resulting CSV is written to
            (without the index column).
        random_state (int | None): Seed forwarded to ``DataFrame.sample``.
            Pass a fixed integer to get the same subset on every run; the
            default ``None`` keeps the sampling non-deterministic.

    Returns:
        str: ``output_file_path``, for convenient chaining.
    """
    df = pd.read_csv(csv_file_path)
    if sample_size and sample_size < len(df):
        # Seeded sampling makes a "Restart & Run All" reproducible.
        df = df.sample(n=sample_size, random_state=random_state)
    df.to_csv(output_file_path, index=False)
    return output_file_path

# Load and preprocess logs using CSVLoader
def load_and_preprocess_logs(csv_file_path, sample_size=10000, downsampled_path='downsampled_logs.csv'):
    """
    Loads and preprocesses web traffic logs from a CSV file.

    The CSV is first down-sampled to ``sample_size`` rows and written to
    ``downsampled_path``; that file is loaded with ``CSVLoader`` and every
    document is parsed with ``parse_page_content``. Documents that do not
    match the expected layout are dropped (and counted in a printed message)
    instead of being carried along as all-NaN rows.

    Args:
        csv_file_path (str): Path to the CSV file containing log data.
        sample_size (int): Number of samples to load from the file.
        downsampled_path (str): Where the intermediate down-sampled CSV is
            written. Defaults to ``'downsampled_logs.csv'`` in the current
            working directory.

    Returns:
        pd.DataFrame: One row per successfully parsed log entry with the
        parsed fields plus ``hour``, ``day``, ``month``, ``year``,
        ``weekday``, ``status_category`` and a ``text`` column used for
        embedding. The frame is empty (but has all columns) when no entry
        could be parsed.
    """
    # Fields produced by parse_page_content; fixing them here keeps the frame
    # well-formed even when no document parses.
    log_fields = [
        'ip', 'identity', 'user', 'datetime', 'method',
        'url', 'status', 'size', 'referer', 'user_agent',
    ]

    # Downsample the CSV file
    downsampled_csv = downsample_csv(csv_file_path, sample_size, downsampled_path)

    # Load the downsampled CSV file
    loader = CSVLoader(file_path=downsampled_csv)
    documents = loader.load()

    # Debug: Print the first few documents to check if they are loaded correctly
    print("Loaded documents:", documents[:5])

    # Parse every document and keep only the ones that matched the layout
    # (parse_page_content returns an empty dict on a failed match).
    parsed = [parse_page_content(doc.page_content) for doc in documents]
    data = [fields for fields in parsed if fields]
    skipped = len(parsed) - len(data)
    if skipped:
        print(f"Skipped {skipped} of {len(parsed)} log entries that did not match the expected format")
    df = pd.DataFrame(data, columns=log_fields)

    # Debug: Print the DataFrame to check if it is populated correctly
    print("DataFrame head:", df.head())

    # Preprocess logs; unparseable timestamps become NaT rather than raising
    df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%b/%Y:%H:%M:%S %z', errors='coerce')
    df['hour'] = df['datetime'].dt.hour
    df['day'] = df['datetime'].dt.day
    df['month'] = df['datetime'].dt.month
    df['year'] = df['datetime'].dt.year
    df['weekday'] = df['datetime'].dt.weekday

    # Every remaining row matched `\d+` for both fields, so a strict cast is
    # safe; a silent errors='ignore' would leave strings behind and break the
    # integer division below.
    df['status'] = df['status'].astype('int64')
    df['size'] = df['size'].astype('int64')
    df['status_category'] = df['status'] // 100

    # Create a text field for embedding
    df['text'] = [
        f"{method} {url} (Status: {status}, Size: {size}, IP: {ip})"
        for method, url, status, size, ip in zip(
            df['method'], df['url'], df['status'], df['size'], df['ip']
        )
    ]

    return df

# Example usage
# Build the processed log table from the sample CSV, keeping at most
# 10,000 rows.
processed_logs = load_and_preprocess_logs(
    '../processed_logs_sample.csv',
    sample_size=10000,
)

# Quick sanity check: show the first few processed rows.
print(processed_logs.head())

Vector Database Setup

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# The following section creates a vector database using FAISS and stores the processed log data for efficient retrieval during query processing.

"""
Initialize the embedding model using HuggingFaceEmbeddings.
Parameters:
- model_name (str): The name of the Hugging Face model to use for embeddings.
Returns:
- embeddings (HuggingFaceEmbeddings): The initialized HuggingFaceEmbeddings object.
"""
"""
Create documents for vector store.
Parameters:
- processed_logs (DataFrame): The processed logs containing 'text', 'datetime', and 'user_agent' columns.
Returns:
- documents (list): The list of documents for the vector store, with additional information appended.
"""
"""
Split documents into smaller chunks.
Parameters:
- chunk_size (int): The size of each chunk.
- chunk_overlap (int): The overlap between consecutive chunks.
Returns:
- texts (list): The list of split documents.
"""
"""
Create the vector store using FAISS.
Parameters:
- texts (list): The list of split documents.
- embeddings (HuggingFaceEmbeddings): The initialized HuggingFaceEmbeddings object.
Returns:
- vectorstore (FAISS): The created FAISS vector store.
"""
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to turn each log entry into a vector
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# One string per log row: the embedding text followed by its timestamp and
# user agent
documents = [
    f"{text} (Datetime: {timestamp}, User Agent: {agent})"
    for text, timestamp, agent in zip(
        processed_logs['text'],
        processed_logs['datetime'],
        processed_logs['user_agent'],
    )
]

# Wrap the strings as documents, split with a 500-character chunk size and
# no overlap
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=500)
texts = text_splitter.create_documents(documents)

# Embed the chunks and index them in FAISS
vectorstore = FAISS.from_documents(texts, embeddings)

RAG Model Setup

In [None]:
import logging
# The RAG (Retrieval-Augmented Generation) model combines retrieval from the vector store and language generation using the LLaMA 3 model. It is set up here to handle user queries.


"""
This code initializes a retrieval-based question answering (QA) system for analyzing web traffic log entries. The system uses the OllamaLLM language model and the RetrievalQA chain from the langchain library.
The main steps in the code are as follows:
1. Set up logging for the system.
2. Initialize the OllamaLLM language model with specific parameters.
3. Define a template for generating prompts for the QA system.
4. Create a PromptTemplate object with the defined template and input variables.
5. Create a RetrievalQA chain using the OllamaLLM model, a retriever, and the PromptTemplate.
6. The RetrievalQA chain is ready to be used for answering questions based on the provided log entries.
"""
from langchain.prompts import PromptTemplate
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA

# Configure the root logger to emit INFO-level records and above
logging.basicConfig(level=logging.INFO)

# Ollama-backed LLM used as the generator; change `model` to the specific
# model you're using
llm = OllamaLLM(model="llama3", top_p=0.95, temperature=0.7)



# Prompt sent to the LLM. `{context}` receives the retrieved log entries and
# `{question}` the user's query. The string is opened with exactly three
# quotes: a fourth quote would become the first character of the prompt.
template = """
    You are an expert cybersecurity analyst specializing in web traffic log analysis. You have access to detailed log entries from a high-traffic website. Your task is to provide an in-depth analysis of these logs to answer specific questions posed by users.

    When analyzing the logs, follow these guidelines:
    1. **Data Integrity:** Ensure the accuracy and completeness of the data by cross-referencing multiple log entries where applicable.
    2. **Pattern Recognition:** Identify and explain any significant patterns or anomalies in user behavior, such as:
       - Repeated access from specific IP addresses.
       - Unusual patterns in user-agent strings (indicating bots, crawlers, or potential attackers).
       - Consistent access to specific pages or endpoints at unusual times.
    3. **Contextual Correlation:** Correlate log entries across different dimensions (e.g., IP, time, URL) to build a coherent narrative of the events.
    4. **Security Implications:** Assess and highlight any potential security concerns, such as:
       - Signs of DDoS attacks.
       - Unusual traffic spikes that could indicate brute force attempts.
       - Access patterns that suggest vulnerability scanning.
    5. **Detailed Justification:** For each observation or conclusion, provide specific log entries as evidence. Explain how these logs lead to your conclusions.

    If the information requested is not available in the logs or if you are unable to determine an answer, clearly state that the data is inconclusive.

    Below are the relevant log entries:

    {context}

    Based on the above logs, answer the following question with detailed reasoning and examples:

    Question: {question}
    """

# Wrap the template so the chain can fill in both placeholders
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# Assemble the retrieval-augmented QA chain: the retriever fetches the 5
# closest log entries from the vector store, the "stuff" chain type places
# them into the custom prompt, and the source documents are returned
# alongside the answer.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

Asking a Question

In [None]:
# %%
# The following function allows users to input questions about the web traffic logs. The RAG model then processes the query and provides answers, along with relevant log entries as evidence.

def ask_question(question, chain=None):
    """
    Processes a user query using a RAG chain and prints the answer with the
    relevant source documents.

    Args:
        question (str): The user's question about the web traffic logs.
        chain: Optional chain to query. It must expose ``invoke`` and return
            a mapping with a ``'result'`` entry and, optionally, a
            ``'source_documents'`` list. Defaults to the notebook-level
            ``rag_chain``.

    Returns:
        None: Prints the question, the generated answer, and up to the first
        200 characters of every source document ("..." is appended only when
        a document was actually truncated).
    """
    if chain is None:
        chain = rag_chain

    # `invoke` is the supported entry point; calling the chain object
    # directly is deprecated in current LangChain releases.
    result = chain.invoke({"query": question})

    print(f"Question: {question}")
    print(f"Answer: {result['result']}")
    print("\nSource Documents:")
    for i, doc in enumerate(result.get('source_documents', []), 1):
        content = doc.page_content
        preview = content[:200] + "..." if len(content) > 200 else content
        print(f"{i}. {preview}")

In [None]:
# Example queries, asked in order against the RAG chain
for question in (
    "Are there any unusual patterns in user-agent strings that might indicate bot activity or potential attackers?",
    "Which HTTP methods are predominantly used in the logs, and what does this tell us about the nature of the traffic?",
):
    ask_question(question)