### Conversión de dataset de tipo de BGL de JSON a CSV

In [49]:
import pandas as pd
import re
import csv
from io import StringIO
import json

In [50]:
# Corregir posibles errores en la línea
def repair_json_line(line):
    # Reemplazar comillas simples por dobles
    line = line.replace("'", '"')  
    line = re.sub(r'(?<=,|\{)\s*([^"{\s][^:]*?)\s*:', r' "\1":', line)  # Asegurar propiedades encerradas en comillas
    return line

In [51]:
# Cargar un archivo CSV y devolver un DataFrame
def load_and_clean_json(filename):
    repaired_content = []
    line_number = 0
    with open(filename, 'r') as file:
        lines = file.readlines()
        try:
            for line in lines:
                line_number += 1
                repaired_line = repair_json_line(line)
                repaired_content.append(repaired_line)
            data = json.loads(''.join(repaired_content))
            return pd.DataFrame(data)
        except json.JSONDecodeError as e:
            print(f"Error decodeando el fichero JSON después del intento: {e}")
            error_line_content = lines[line_number - 1] if line_number > 0 and line_number <= len(lines) else "Line number out of range."
            print(f"Contenido de la línea problemática {line_number}: {error_line_content}")
            # Devolver un DataFrame vacío en caso de error
            return pd.DataFrame()  

In [52]:
# Obtener el contenido de una línea de un fichero
def get_line_content(filename, line_number):
    try:
        with open(filename, 'r') as file:
            for current_line, content in enumerate(file, start=1):
                if current_line == line_number:
                    return content
    except Exception as e:
        return f"Error reading line: {str(e)}"

In [53]:
# Cargar y limpiar el archivo JSON
df = load_and_clean_json('0X_nombre_dataset.json')
if df is not None and not df.empty:
    print(df.head())
else:
    print("Fallo al cargar el archivo JSON.")

                    timestamp             hostname  service  \
0  2005-06-03T15:53:35.497192  R02-M1-N0-C:J12-U11  unknown   
1  2005-06-03T15:53:36.019787  R02-M1-N0-C:J12-U11  unknown   
2  2005-06-03T15:53:36.523022  R02-M1-N0-C:J12-U11  unknown   
3  2005-06-03T15:53:36.781271  R02-M1-N0-C:J12-U11  unknown   
4  2005-06-03T15:53:37.370598  R02-M1-N0-C:J12-U11  unknown   

                                             message  
0  KERNEL INFO instruction cache parity error cor...  
1  KERNEL INFO instruction cache parity error cor...  
2  KERNEL INFO instruction cache parity error cor...  
3  KERNEL INFO instruction cache parity error cor...  
4  KERNEL INFO instruction cache parity error cor...  


In [55]:
# Suponiendo que df ya está cargado con la data del JSON
try:
    # Convertir la columna 'timestamp' a las columnas 'time', 'YYYY', 'MM', 'DD' y 'hh:mm'
    df['time'] = pd.to_datetime(df['timestamp'], errors='coerce', infer_datetime_format=True)
    df['YYYY'] = df['time'].dt.year.apply(lambda x: f'{x:4}')
    df['MM'] = df['time'].dt.month.apply(lambda x: f'{x:02}')
    df['DD'] = df['time'].dt.day.apply(lambda x: f'{x:02}')
    df['hh:mm'] = df['time'].dt.strftime('%H:%M')

    # Muestra los resultados de la conversión y las nuevas columnas
    print(df[['time', 'YYYY', 'MM', 'DD', 'hh:mm']])
except Exception as e:
    print(f"Error processing dates: {e}")

  df['time'] = pd.to_datetime(df['timestamp'], errors='coerce', infer_datetime_format=True)


                             time    YYYY   MM   DD  hh:mm
0      2005-06-03 15:53:35.497192  2005.0  6.0  3.0  15:53
1      2005-06-03 15:53:36.019787  2005.0  6.0  3.0  15:53
2      2005-06-03 15:53:36.523022  2005.0  6.0  3.0  15:53
3      2005-06-03 15:53:36.781271  2005.0  6.0  3.0  15:53
4      2005-06-03 15:53:37.370598  2005.0  6.0  3.0  15:53
...                           ...     ...  ...  ...    ...
164651 2005-06-07 12:20:50.907487  2005.0  6.0  7.0  12:20
164652 2005-06-07 12:20:50.931702  2005.0  6.0  7.0  12:20
164653 2005-06-07 12:20:50.955966  2005.0  6.0  7.0  12:20
164654 2005-06-07 12:20:50.982019  2005.0  6.0  7.0  12:20
164655 2005-06-07 12:20:51.005901  2005.0  6.0  7.0  12:20

[164656 rows x 5 columns]


In [56]:
# Funciones para extraer las distintas columnas del campo principal 'message'
def extract_ip(message):
    match = re.search(r'\b(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b', message)
    return match.group(0) if match else None

df['IP'] = df['message'].apply(extract_ip)

def extract_port(message):
    common_ports = {
        '21': 'FTP', '22': 'SSH', '23': 'Telnet', '25': 'SMTP', '53': 'DNS',
        '80': 'HTTP', '110': 'POP3', '143': 'IMAP', '443': 'HTTPS', '587': 'SMTP',
        '3306': 'MySQL', '5432': 'PostgreSQL', '8080': 'HTTP-alt'
    }
    matches = re.findall(r'\b\d{2,5}\b', message)
    ports = [port for port in matches if port in common_ports]
    return ports[0] if ports else None

df['Port'] = df['message'].apply(extract_port)

def extract_keyword(message):
    keywords = ['error', 'fatal', 'failure', 'exception', 'warning', 'critical', 'denied', 'unreachable', 'timeout', 'failed']
    pattern = re.compile('|'.join(keywords), re.IGNORECASE)
    match = pattern.search(message)
    return match.group(0) if match else None

def extract_action(message):
    actions = ['restarted', 'stopped', 'started', 'deployed']
    pattern = r'\b(' + '|'.join(actions) + r')\b'
    match = re.search(pattern, message, re.IGNORECASE)
    return match.group(0) if match else None

df['Action'] = df['message'].apply(extract_action)

df['Keyword'] = df['message'].apply(extract_keyword)

df['User'] = df['message'].apply(lambda x: re.findall(r'user\s+(\w+)', x)[0] if re.findall(r'user\s+(\w+)', x) else None)

def extract_interface(message):
    pattern = r'\b(eth0|eth[12]|wlan0|lo)\b'
    match = re.search(pattern, message)
    return match.group(0) if match else None

df['Interface'] = df['message'].apply(extract_interface)


df['UID'] = df['message'].apply(lambda x: re.findall(r'uid=(\d+)', x)[0] if re.findall(r'uid=(\d+)', x) else None)

df['Protocol'] = df['message'].apply(lambda x: re.findall(r'\b(TCP|UDP)\b', x, re.IGNORECASE)[0] if re.findall(r'\b(TCP|UDP)\b', x, re.IGNORECASE) else None)

df['Component'] = df['message'].apply(lambda x: re.findall(r'\b(RAS)\b', x)[0] if re.findall(r'\b(RAS)\b', x) else None)

df['Severity'] = df['message'].apply(lambda x: re.findall(r'\b(KERNEL|DISCOVERY)\b', x)[0] if re.findall(r'\b(KERNEL|DISCOVERY)\b', x) else None)

df['Type'] = df['message'].apply(lambda x: re.findall(r'\b(INFO|FATAL|SEVERE|WARNING|ERROR)\b', x, re.IGNORECASE)[0] if re.findall(r'\b(INFO|FATAL|SEVERE|WARNING|ERROR)\b', x, re.IGNORECASE) else None)

df['Thread ID'] = df['message'].apply(lambda x: re.findall(r'\bthread\s+(\d+)\b', x, re.IGNORECASE)[0] if re.findall(r'\bthread\s+(\d+)\b', x, re.IGNORECASE) else None)

df['message'] = df['message'].apply(lambda x: x.replace('\t', ' ').replace('"', '""'))

df = df[['YYYY', 'MM', 'DD', 'hh:mm', 'hostname', 'service', 'User', 'IP', 'Port', 'Keyword', 'Interface', 'UID', 'Action', 'Protocol', 'Component', 'Severity', 'Type', 'Thread ID', 'message']]

output = StringIO()
df.to_csv(output, sep='\t', index=False, header=False)
output.seek(0)

lines = output.readlines()
formatted_lines = []

headers = ['YYYY', 'MM', 'DD', 'hh:mm', 'Hostname', 'Service', 'User', 'IP', 'Port', 'Keyword', 'Interface', 'UID', 'Action', 'Protocol', 'Component', 'Severity', 'Type', 'Thread ID', 'Message']
max_widths = [max(max(len(row[i]) for row in (line.split('\t') for line in lines)), len(header)) for i, header in enumerate(headers)]

header_line = "".join(f'{header:<{max_widths[i]}}\t' for i, header in enumerate(headers))
formatted_lines.append(header_line.strip() + '\n')

In [57]:
# Preparación de las líneas de datos
for line in lines:
    row = line.split('\t')
    # Comprobar que el número de columnas es correcto
    if len(row) == len(max_widths):
        formatted_line = "".join(f'{item.strip():<{max_widths[i]}}\t' for i, item in enumerate(row))
        formatted_lines.append(formatted_line.strip() + '\n')
    else:
        print("Error: El número de columnas no coincide con el número de cabeceras.", row)

In [58]:
# Escritura de las líneas formateadas en un archivo CSV
with open('0X_nombre_dataset.csv', 'w') as file:
    file.writelines(formatted_lines)