### Conversión de dataset JSON a CSV

In [1]:
import pandas as pd
import re
import csv
from io import StringIO

In [2]:
# Load the data from the JSON file
try:
    df = pd.read_json('0X_nombre_dataset.json')
except ValueError as e:
    # Report the parsing problem, then re-raise: every later cell needs `df`,
    # so continuing would either fail with a NameError or silently process a
    # stale `df` left over from a previous run of the kernel.
    print("Error loading JSON:", e)
    raise
else:
    # Print the first 5 records to verify that the data was loaded correctly
    print(df.head())

            timestamp          hostname       service  \
0 1900-11-30 06:39:00  ip-172-31-27-153  CRON[21882]:   
1 1900-11-30 06:47:01  ip-172-31-27-153  CRON[22087]:   
2 1900-11-30 06:47:03  ip-172-31-27-153  CRON[22087]:   
3 1900-11-30 07:07:14  ip-172-31-27-153  sshd[22116]:   
4 1900-11-30 07:07:35  ip-172-31-27-153  sshd[22118]:   

                                             message  
0  pam_unix(cron:session): session closed for use...  
1  pam_unix(cron:session): session opened for use...  
2  pam_unix(cron:session): session closed for use...  
3      Connection closed by 122.225.103.87 [preauth]  
4      Connection closed by 122.225.103.87 [preauth]  


In [3]:
# Derive the date and time columns of the exported dataset from 'timestamp'
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['time'] = parsed_timestamps
# Year is space-padded to width 4; month and day are zero-padded to width 2
df['YYYY'] = parsed_timestamps.dt.year.map(lambda year: f'{year:4}')
df['MM'] = parsed_timestamps.dt.month.map(lambda month: f'{month:02}')
df['DD'] = parsed_timestamps.dt.day.map(lambda day: f'{day:02}')
df['hh:mm'] = parsed_timestamps.dt.strftime('%H:%M')

In [4]:
# Functions that extract the individual columns from the main 'message' field
def extract_ip(message):
    """Return the first dotted-quad token found in ``message``, or None.

    Each of the four parts is matched as 1-3 digits; the values are not
    range-checked, so a token such as ``999.1.1.1`` is also returned.
    """
    found = re.search(r'\b(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b', message)
    if found is None:
        return None
    return found.group(0)

# One IP (or None) per row, taken from the message text
df['IP'] = df['message'].map(extract_ip)

# Well-known ports recognised in log messages, mapped to their usual service.
# Only the keys are used for matching; the service names document each entry.
_COMMON_PORTS = {
    '21': 'FTP', '22': 'SSH', '23': 'Telnet', '25': 'SMTP', '53': 'DNS',
    '80': 'HTTP', '110': 'POP3', '143': 'IMAP', '443': 'HTTPS', '587': 'SMTP',
    '3306': 'MySQL', '5432': 'PostgreSQL', '8080': 'HTTP-alt'
}

# A standalone 2-5 digit number that is not part of a dotted number: it must
# not be preceded by '.' nor followed by '.<digit>'. This keeps octets of an
# IPv4 address (e.g. the "22" in 10.0.0.22) from being reported as a port,
# while still accepting a port that ends a sentence ("port 22.").
_PORT_CANDIDATE_RE = re.compile(r'(?<!\.)\b\d{2,5}\b(?!\.\d)')


def extract_port(message):
    """Return the first well-known port number mentioned in ``message``.

    Args:
        message: Log message text to scan.

    Returns:
        The port as a string (a key of ``_COMMON_PORTS``), or None when the
        message contains no standalone number that is a well-known port.
    """
    for candidate in _PORT_CANDIDATE_RE.findall(message):
        if candidate in _COMMON_PORTS:
            return candidate
    return None

# One well-known port (or None) per row, taken from the message text
df['Port'] = df['message'].map(extract_port)

def extract_keyword(message):
    """Return the first problem keyword occurring in ``message``, or None.

    Matching is case-insensitive and not anchored to word boundaries, so a
    keyword embedded in a longer word is also reported. The returned text
    keeps the casing found in the message.
    """
    keywords = ['error', 'fatal', 'failure', 'exception', 'warning', 'critical', 'denied', 'unreachable', 'timeout', 'failed']
    found = re.search('|'.join(keywords), message, re.IGNORECASE)
    if found:
        return found.group(0)
    return None

def extract_action(message):
    """Return the first whole-word service action in ``message``, or None.

    Matching is case-insensitive; the returned text keeps the casing found
    in the message.
    """
    actions = ['restarted', 'stopped', 'started', 'deployed']
    action_regex = r'\b(' + '|'.join(actions) + r')\b'
    found = re.search(action_regex, message, re.IGNORECASE)
    if found is None:
        return None
    return found.group(0)

def _first_user(message):
    """Return the word that follows the first 'user ' in ``message``, or None."""
    found = re.search(r'user\s+(\w+)', message)
    return found.group(1) if found else None


# Action, keyword and user name (each may be None) taken from the message text
df['Action'] = df['message'].map(extract_action)

df['Keyword'] = df['message'].map(extract_keyword)

df['User'] = df['message'].map(_first_user)

def extract_interface(message):
    """Return the first network interface name in ``message``, or None.

    Recognised names are eth0, eth1, eth2, wlan0 and lo, matched as whole
    words and case-sensitively.
    """
    found = re.search(r'\b(eth0|eth[12]|wlan0|lo)\b', message)
    if found is None:
        return None
    return found.group(0)

def _first_group_extractor(pattern, flags=0):
    """Build a function returning capture group 1 of the first match, or None."""
    compiled = re.compile(pattern, flags)

    def extract(message):
        found = compiled.search(message)
        return found.group(1) if found else None

    return extract


df['Interface'] = df['message'].map(extract_interface)

# Each column below holds the first captured value of its pattern, or None
df['UID'] = df['message'].map(_first_group_extractor(r'uid=(\d+)'))

df['Protocol'] = df['message'].map(_first_group_extractor(r'\b(TCP|UDP)\b', re.IGNORECASE))

df['Component'] = df['message'].map(_first_group_extractor(r'\b(RAS)\b'))

# Severity is matched case-sensitively (upper-case tokens only)
df['Severity'] = df['message'].map(_first_group_extractor(r'\b(KERNEL|DISCOVERY|EMERGENCY|ALERT|CRITICAL|ERROR|WARNING|NOTICE|INFO|DEBUG)\b'))

# Type is matched case-insensitively
df['Type'] = df['message'].map(_first_group_extractor(r'\b(INFO|FATAL|SEVERE|WARNING|ERROR)\b', re.IGNORECASE))

df['Thread ID'] = df['message'].map(_first_group_extractor(r'\bthread\s+(\d+)\b', re.IGNORECASE))

# Flatten the characters in 'message' that would break the one-record-per-line,
# tab-separated layout: an embedded tab would add a column, and an embedded
# line break would split the record when the text is re-read line by line.
# Double quotes are deliberately left untouched: to_csv() already doubles them
# when it quotes a field, so doubling them here as well would emit four quote
# characters for every original one.
df['message'] = df['message'].apply(
    lambda x: x.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
)

# Keep only the exported columns, in their final order
df = df[['YYYY', 'MM', 'DD', 'hh:mm', 'hostname', 'service', 'User', 'IP', 'Port', 'Keyword', 'Interface', 'UID', 'Action', 'Protocol', 'Component', 'Severity', 'Type', 'Thread ID', 'message']]

# Serialise to tab-separated text in memory (missing values become empty fields)
output = StringIO()
df.to_csv(output, sep='\t', index=False, header=False)
output.seek(0)

lines = output.readlines()
formatted_lines = []

# Dataset column titles, in the same order as the columns selected above
headers = ['YYYY', 'MM', 'DD', 'hh:mm', 'Hostname', 'Service', 'User', 'IP', 'Port', 'Keyword', 'Interface', 'UID', 'Action', 'Protocol', 'Component', 'Severity', 'Type', 'Thread ID', 'Message']

# Split every record once. The width of a column is the longest of its title
# and its values; seeding with the title length keeps this valid for an empty
# dataset, and the `i < len(row)` guard lets a malformed short row reach the
# column-count check performed when the rows are formatted, instead of raising
# IndexError here.
split_rows = [line.rstrip('\r\n').split('\t') for line in lines]
max_widths = [
    max([len(header)] + [len(row[i]) for row in split_rows if i < len(row)])
    for i, header in enumerate(headers)
]

# Header line: titles left-aligned to the column widths, tab-separated, with
# the padding after the last title removed
header_line = '\t'.join(f'{header:<{width}}' for header, width in zip(headers, max_widths))
formatted_lines.append(header_line.rstrip() + '\n')

In [5]:
# Format every dataset record as a left-aligned, tab-separated line
for line in lines:
    row = line.split('\t')
    # Report and skip records whose column count does not match the header
    if len(row) != len(max_widths):
        print("Error: Row has incorrect number of columns", row)
        continue
    cells = [item.strip() for item in row]
    # Pad every column except the last one. The line is assembled with an
    # explicit join rather than stripping the finished string, because a
    # whole-line strip() also removes the tabs of empty trailing (or leading)
    # cells and leaves such rows with fewer columns than the header.
    padded = [f'{cell:<{max_widths[i]}}' for i, cell in enumerate(cells[:-1])]
    formatted_lines.append('\t'.join(padded + [cells[-1]]) + '\n')

In [6]:
# Write the formatted lines to the output CSV file.
# The encoding is set explicitly so the result does not depend on the
# platform's default locale encoding (log messages may contain non-ASCII text).
with open('0X_nombre_dataset.csv', 'w', encoding='utf-8') as file:
    file.writelines(formatted_lines)