In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import os
import re
from datetime import datetime
import faiss
import numpy as np

# Load the T5 model (currently disabled; kept for reference).
# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# model = T5ForConditionalGeneration.from_pretrained("t5-base")

# API keys / environment configuration (currently disabled).
# PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
# PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
# os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

# Path of the access log to parse; relative to the notebook's working directory.
log_file_path = "secrcomp.log"

def preprocess_logs(file_path):
    """Turn an access log into one English sentence per parsed request.

    Each line of ``file_path`` is matched against an access-log pattern
    (IP, timestamp, method, endpoint, status code, response size, referer,
    user agent, response time). Lines that do not match are skipped. The
    referer is parsed but not included in the sentence.

    Args:
        file_path: Path of the log file to read.

    Returns:
        list[str]: One sentence per matching line, in file order. Empty if
        no line matches.

    Raises:
        OSError: If the file cannot be opened.
    """
    log_pattern = re.compile(
        r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(.*?) (.*?) HTTP.*" (\d{3}) (\d+) "(.*?)" "(.*?)" (\d+)'
    )

    def format_datetime(dt):
        """Spell out a datetime as text (year, month, day, weekday, time)."""
        # %H (24-hour clock) on purpose: a 12-hour %I without AM/PM would
        # render 22:50:16 as "10:50:16" and contradict the raw timestamp
        # that is placed right next to this text.
        return f"In {dt.year}, " + dt.strftime("on %B %d, which is a %A, at %H:%M:%S.")

    def update_timestamp(timestamp):
        """Append the spelled-out date to the raw timestamp string.

        A timestamp that does not parse is returned unchanged.
        """
        try:
            dt = datetime.strptime(timestamp, '%d/%b/%Y:%H:%M:%S %z')
        except ValueError:
            return timestamp
        return f"{dt.strftime('%d/%b/%Y:%H:%M:%S %z')} ({format_datetime(dt)})"

    sentences = []
    # Explicit encoding so the result does not depend on the platform default;
    # undecodable bytes are replaced instead of aborting the whole parse.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        # Stream the file line by line instead of loading it all at once.
        for line in file:
            match = log_pattern.match(line)
            if match is None:
                continue

            (ip, timestamp, method, endpoint, status_code,
             response_size, _referer, user_agent, response_time) = match.groups()

            sentences.append(
                f"Request from IP {ip} on {update_timestamp(timestamp)} using {method} method to endpoint {endpoint} resulted in status code {status_code} with a response size of {response_size} bytes, response time of {response_time} ms, and user agent ({user_agent})."
            )

    return sentences

# Parse the log file into one descriptive sentence per request.
sentences = preprocess_logs(log_file_path)

# Show only a short preview: displaying the full list would flood the
# notebook output with every parsed log line.
sentences[:3]

['Request from IP 108.90.172.80 on 14/Jul/2023:10:50:16 +0300 (In 2023, on July 14, which is a Friday, at 10:50:16.) using PUT method to endpoint /usr/admin resulted in status code 303 with a response size of 5014 bytes, response time of 2871 ms, and user agent (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36).',
 'Request from IP 202.158.36.216 on 22/Feb/2023:04:18:59 +0300 (In 2023, on February 22, which is a Wednesday, at 04:18:59.) using PUT method to endpoint /usr/login resulted in status code 500 with a response size of 5005 bytes, response time of 2562 ms, and user agent (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36).',
 'Request from IP 110.103.4.112 on 21/Dec/2023:01:44:35 +0300 (In 2023, on December 21, which is a Thursday, at 01:44:35.) using GET method to endpoint /usr resulted in status code 502 with a response size of 5034 

In [5]:
from sentence_transformers import SentenceTransformer
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM



# Load three pretrained sentence-embedding models by name.
# NOTE(review): only sentence_model_1 is referenced by the cells visible here;
# confirm whether sentence_model_2 / sentence_model_3 are still needed.
sentence_model_1 = SentenceTransformer('all-MiniLM-L6-v2')
sentence_model_2 = SentenceTransformer('all-MiniLM-L12-v2')
sentence_model_3 = SentenceTransformer('all-mpnet-base-v2')




In [9]:
def embed_sentences(sentences):
    """Encode ``sentences`` with ``sentence_model_1`` and return the result.

    Progress messages are printed before and after encoding.
    """
    print("Embedding yapılıyor...")
    vectors = sentence_model_1.encode(sentences)
    print("Embedding yapıldı.")
    return vectors

def fass_index(embeddings, nlist=100):
    """Build, train and populate a FAISS IVF-Flat (L2) index.

    Args:
        embeddings: 2-D array of shape (n_vectors, dim).
        nlist: Requested number of IVF partitions. It is clamped to
            ``n_vectors`` because training needs at least as many vectors
            as partitions.

    Returns:
        A trained ``faiss.IndexIVFFlat`` containing all ``embeddings``, with
        vector ids equal to their row positions.

    Raises:
        ValueError: If ``embeddings`` contains no vectors.
    """
    print("İndex oluşturuluyor...")

    # Convert once: FAISS works on C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    print(vectors.shape)

    n_vectors, d = vectors.shape  # d: embedding dimension
    if n_vectors == 0:
        raise ValueError("fass_index requires at least one embedding vector")

    # Never ask for more partitions than there are training vectors.
    nlist = max(1, min(nlist, n_vectors))

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
    index.train(vectors)  # learn the partition centroids
    index.add(vectors)  # add the vectors to the index

    return index

def relevantlogs(question, index, k=20):
    """Return the log sentences closest to ``question`` in embedding space.

    The question is encoded with ``sentence_model_1`` and looked up in
    ``index``; the resulting ids are mapped back through the module-level
    ``sentences`` list, so ``index`` must have been built from that list in
    the same order.

    Args:
        question: Natural-language query.
        index: FAISS index to search.
        k: Maximum number of neighbours to retrieve.

    Returns:
        list[str]: Up to ``k`` sentences, in the order returned by the search.
    """
    query_embedding = sentence_model_1.encode([question])
    _distances, neighbor_ids = index.search(query_embedding, k=k)

    # FAISS pads the result with -1 when fewer than k neighbours are found;
    # without this filter, -1 would silently select the last sentence.
    relevant = [sentences[idx] for idx in neighbor_ids[0] if idx >= 0]

    return relevant

In [7]:
# Embed every parsed log sentence.
# NOTE(review): relies on embed_sentences being defined before this cell runs.
embeddings = embed_sentences(sentences)

Embedding yapılıyor...
Embedding yapıldı.


In [8]:
# Build the FAISS index over the sentence embeddings.
# NOTE(review): relies on fass_index being defined before this cell runs.
index = fass_index(embeddings)

İndex oluşturuluyor...
(50000, 384)


In [19]:
import ollama

# Alternative questions tried during experimentation:
# query = " How many IP addresses were sent POST requests to the endpoint /usr/admin ?"
# query = "How many POST requests were sent in March 2023 ?"
query = "On which date did the 404 status code come from the requests?"

# Retrieve the nearest log sentences and flatten them into one context string.
relevant_logs = "\n".join(relevantlogs(query, index))
print(f"Soru: {query}\n")
# print(f"Logs: {relevant_logs}\n")

# Ask the local model to answer the question from the retrieved context.
prompt = f" Answer the question according to the context. Don't explain too much. Question: {query} Context: {relevant_logs}"
response = ollama.generate(model='llama3.1:8b', prompt=prompt)

print(f"Cevap: {response['response']}\n")

Soru: On which date did the 404 status code come from the requests?

Cevap: These logs appear to be from a web server or application that is tracking requests made to various endpoints. Here's an analysis of the data:

**Common patterns:**

1. All requests are POST methods.
2. The user agent is consistently "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0", indicating that all requests were made using the same version of Firefox on Windows 10.
3. All requests resulted in a 404 status code, which means "Not Found" or "Page Not Found".

**Interesting findings:**

1. There are multiple instances of requests to different endpoints:
	* `/usr/admin/developer`
	* `/usr/admin`
	* `/usr`
2. The response sizes and times vary slightly, but all are relatively small ( < 5KB) and quick (<4 seconds).
3. Requests were made from various IP addresses over a period of several months.

**Possible interpretations:**

1. **Spam or bot activity:** Given the consistent user agen

Soru: On which date did the 404 status code come from the requests?

Cevap: These logs appear to be from a web server or application that is tracking requests made to various endpoints. Here's an analysis of the data:

**Common patterns:**

1. All requests are POST methods.
2. The user agent is consistently "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0", indicating that all requests were made using the same version of Firefox on Windows 10.
3. All requests resulted in a 404 status code, which means "Not Found" or "Page Not Found".

**Interesting findings:**

1. There are multiple instances of requests to different endpoints:
	* `/usr/admin/developer`
	* `/usr/admin`
	* `/usr`
2. The response sizes and times vary slightly, but all are relatively small ( < 5KB) and quick (<4 seconds).
3. Requests were made from various IP addresses over a period of several months.

**Possible interpretations:**

1. **Spam or bot activity:** Given the consistent user agent and repeated requests to different endpoints, it's possible that these logs indicate spam or bot activity.
2. **Error in application configuration:** Another possibility is that there was an error in configuring the application or web server, leading to 404 errors for valid routes.
3. **Test or stress testing:** It's also possible that someone was intentionally testing the application or simulating heavy traffic to identify potential issues.
...
3. Review network access logs from neighboring servers or ISPs to see if similar patterns exist.

If you'd like me to dig deeper, please provide more context or information about the application and its environment!

Soru:  How many IP addresses were sent POST requests to the endpoint /usr/admin ?

Cevap: Based on the provided information, I will attempt to identify any potential issues or patterns in the requests made to the `/usr/admin` endpoint.

**Observations:**

1. **Variety of Status Codes**: There are multiple status codes returned for the same endpoint (200, 502), indicating different responses from the server.
2. **Different Response Sizes**: The response sizes vary significantly across different requests, ranging from 4944 bytes to 5185 bytes.
3. **User Agent and IP Address Patterns**: The User Agents appear to be identical for all requests (Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0), suggesting a consistent client environment. However, the IP addresses are diverse, indicating different clients or users making these requests.
4. **Some Status Codes appear frequently**: The status code 502 appears in three instances (requests from IPs 69.51.183.103, 33.73.212.67, and 181.244.42.215), while the other status codes are less frequent.

**Possible Interpretations:**

1. **Server Configuration Issues**: The varied response sizes and different status codes could indicate configuration issues or inconsistencies on the server-side.
2. **Client-Side Behavior Variance**: Although the User Agent is consistent, different clients or users may be experiencing different behavior when interacting with the `/usr/admin` endpoint.
3. **Overloaded Server Resource**: Some requests resulting in status code 502 might suggest that the server is experiencing resource overload or contention for resources.

**Recommendations:**

1. **Verify Server Configuration**: Ensure that the server configuration and setup are correct, and there are no inconsistencies in handling requests to the `/usr/admin` endpoint.
2. **Analyze Client-Side Behavior**: Investigate client-side behavior differences by analyzing User Agent data from logs or using other analytics tools.
3. **Identify Performance Bottlenecks**: Look for performance bottlenecks on the server-side that could be causing some requests to fail with status code 502.

Please let me know if you would like me to revise any of these observations, interpretations, or recommendations based on more specific context.

Soru: How many POST requests were sent in March 2023 ?

Cevap: This is a log of failed requests to a web server. Here are the details:

**Five distinct IPs have made POST requests to /usr endpoint**

1. **134.55.179.255**: 
	* Date: 05/Dec/2023
	* Time: 10:36:59
	* Status Code: 500 (Internal Server Error)
	* Response Size: 5081 bytes
	* Response Time: 4303 ms
	* User Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0
2. **182.143.12.18**: 
	* Date: 29/Aug/2023
	* Time: 06:51:54
	* Status Code: 500 (Internal Server Error)
	* Response Size: 4951 bytes
	* Response Time: 3462 ms
	* User Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0
3. **210.134.207.22**: 
	* Date: 01/Jul/2023
	* Time: 12:31:04
	* Status Code: 500 (Internal Server Error)
	* Response Size: 4973 bytes
...
3. **Test network connectivity**: Test the network connectivity between the clients and the server to ensure there are no issues.

It is essential to investigate further to determine the root cause of this issue, and it would be best to consult with a web development expert or the application owner for further guidance.