In [10]:
import pandas as pd
from pandas import DataFrame


In [11]:
import pandas as pd
import re
import os

# define a function to parse each log entry
def parse_log_entry(log_entry):
    # define a regular expression pattern to extract information from the log entry
    pattern = r'(?P<server_name>[\w.-]+) (?P<remote_host>\S+) (?P<remote_logname>\S+) (?P<remote_user>\S+) \[(?P<timestamp>[^\]]+)\] "(?P<request_method>[A-Z]+) (?P<requested_url>\S+) HTTP/\d\.\d" (?P<status_code>\d+) (?P<bytes_sent>\d+) "(?P<referer>[^"]+)" "(?P<user_agent>[^"]+)"'
    
    # match the pattern against the log entry
    match = re.match(pattern, log_entry)
    
    if match:
        return match.groupdict()
    else:
        return None


logs_directory = "logs"

# list all files in the directory
log_files = [f for f in os.listdir(logs_directory) if os.path.isfile(os.path.join(logs_directory, f))]

# list to store parsed log entries
all_log_entries = []

# process each log file
for log_file in log_files:
    log_file_path = os.path.join(logs_directory, log_file)
    
    # read log file line by line and parse each entry
    with open(log_file_path, 'r') as file:
        for line in file:
            log_entry = parse_log_entry(line.strip())
            if log_entry:
                all_log_entries.append(log_entry)

# convert list of dictionaries to DataFrame
df = pd.DataFrame(all_log_entries)

# display the DataFrame
print(df.head(10))  # Displaying the first 10 rows of the combined DataFrame

             server_name   remote_host remote_logname remote_user  \
0      sitgesanytime.com   47.76.35.19              -           -   
1  www.sitgesanytime.com   47.76.35.19              -           -   
2  www.sitgesanytime.com  40.77.167.53              -           -   
3      sitgesanytime.com   47.76.35.19              -           -   
4  www.sitgesanytime.com   47.76.35.19              -           -   
5      sitgesanytime.com   47.76.35.19              -           -   
6  www.sitgesanytime.com   47.76.35.19              -           -   
7      sitgesanytime.com   47.76.35.19              -           -   
8  www.sitgesanytime.com   47.76.35.19              -           -   
9      sitgesanytime.com   47.76.35.19              -           -   

                    timestamp request_method  \
0  22/Jan/2024:00:00:00 +0100           HEAD   
1  22/Jan/2024:00:00:01 +0100           HEAD   
2  22/Jan/2024:00:00:06 +0100            GET   
3  22/Jan/2024:00:00:06 +0100           HEAD   


In [12]:
df['request_method']

0          HEAD
1          HEAD
2           GET
3          HEAD
4          HEAD
           ... 
1076917     GET
1076918     GET
1076919     GET
1076920     GET
1076921     GET
Name: request_method, Length: 1076922, dtype: object

In [13]:
print(df['requested_url'])

print(df['status_code'])

print(df['bytes_sent'])


0          /fr/pag492/explora-platges-i-ports-2/id12/les-...
1          /fr/pag492/explora-platges-i-ports-2/id12/les-...
2          /ca/noticias/84/sitges-obt%C3%A9-el-certificat...
3          /fr/pag492/explora-platges-i-ports-2/id12/les-...
4          /fr/pag492/explora-platges-i-ports-2/id12/les-...
                                 ...                        
1076917    /plantilles/turisme/eltemps/wstemps.php?site=1...
1076918                /plantilles/turisme/img/temps3/31.svg
1076919                                                    /
1076920                                                 /en/
1076921     /plantilles/turisme/css/estils-capcalera.css?v=3
Name: requested_url, Length: 1076922, dtype: object
0          301
1          200
2          404
3          301
4          200
          ... 
1076917    200
1076918    200
1076919    301
1076920    200
1076921    200
Name: status_code, Length: 1076922, dtype: object
0           4840
1           5223
2           2509
3          

In [14]:
# save the DataFrame to a CSV file
csv_file_path = "combined_logs.csv"
df.to_csv(csv_file_path, index=False)

print(f"CSV file saved successfully: {csv_file_path}")

CSV file saved successfully: combined_logs.csv


In [15]:
print(df['user_agent'][0])

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3928.157 Safari/537.36


### Variables insights
* server_name is **not** important for training the models **(not done)**
* remote_host is important and has been split into octets and normalized **(done)**
* remote_logname is **not** important for training the models **(not done)**
* remote_user **may** be important, we just have to consider that there's "-" values that need to be handled **(not done)**
* timestamp is important and has been split into hour, second, day, and cyclic encoding for training NN (sine, cosine) **(done)**
* request_method is important and it has been one-hot encoded, not label because model may learn that there's an order between things **(done)**
* requested_url is important and it has been vectorized using word embeddings with dimensionality 300 **(done)**
* status_code is important and we need to one hot encode (possibly) the feature and analyze its unique values **(not done)**
* bytes_sent **may** be important but we also need to take into account that its value is properly normalized within a range **(not done)**
* referer and user_agent are both important and need to be analyzed (possibly embeddings to vectorize) **(not done)**