# In [3]:
import logging
import os
import re

import pandas as pd

# Function to parse log file and extract relevant information
def parse_log_file(log_file_path):
    # Open log file
    with open(log_file_path, 'r', encoding="utf8", errors='ignore') as f:
        log_lines = f.readlines()

    # Extract relevant information from log lines
    log_entries = []
    for line in log_lines:
        # Skip lines that don't contain errors or warnings
        if 'ERROR' not in line and 'WARNING' not in line:
            continue

        # Extract timestamp and log message
        timestamp_regex = r'(?:\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})|(?:\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2})|(?:\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3})'
        timestamp = re.search(timestamp_regex, line)
        if not timestamp:
            continue
        timestamp = timestamp.group(0)

        log_message = re.sub(r'^.*?:\s', '', line.strip())

        # Add log entry to list
        log_entries.append((timestamp, log_message))

    # Convert log entries into a Pandas DataFrame
    log_df = pd.DataFrame(log_entries, columns=['timestamp', 'log_message'])

    return log_df

# Function to preprocess log data for machine learning classification
def preprocess_logs(log_df):
    # Tokenize log messages
    log_df['tokens'] = log_df['log_message'].str.split()

    # Extract relevant features
    log_df['error'] = log_df['log_message'].apply(lambda x: 1 if 'ERROR' in x else 0)
    log_df['warning'] = log_df['log_message'].apply(lambda x: 1 if 'WARNING' in x else 0)

    # Drop irrelevant columns
    log_df = log_df.drop(columns=['log_message'])

    return log_df

# Define log files to process
log_files = ['dataset/system-logs/Mac.log', 'dataset/system-logs/Windows.log', 'dataset/system-logs/Android.log']

# Process each log file and save results as CSV file
for log_file in log_files:
    # Determine output file name based on input file name
    file_name = os.path.basename(log_file)
    output_file_name = f"dataset/system-logs/multiple-system-log-dataset/extracted-data/{os.path.splitext(file_name)[0]}_extracted.csv"

    # Parse log file and preprocess data
    log_df = parse_log_file(log_file)
    processed_df = preprocess_logs(log_df)

    # Save processed data as CSV file
    processed_df.to_csv(output_file_name, index=False)

    # Print information about the generated dataset
    print(f"{output_file_name} generated with {len(processed_df)} entries")
    
    
# For Linux extraction
# Function to parse log file and extract relevant information
def parse_log_file(log_file_path):
    # Open log file
    with open(log_file_path, 'r', encoding="utf8", errors='ignore') as f:
        log_lines = f.readlines()

    # Extract relevant information from log lines
    log_entries = []
    for line in log_lines:
        # Extract timestamp and log message using the regex pattern
        timestamp_regex = r'(\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2})'
        log_match = re.search(timestamp_regex, line)
        if log_match:
            timestamp = log_match.group(1)
            log_message = re.sub(r'^.*?:\s', '', line.strip())

            # Check for severity levels and assign labels
            error = 0
            warning = 0
            for pattern, label in severity_levels.items():
                if re.search(pattern, log_message):
                    if label in ['error', 'emergency', 'alert', 'critical']:
                        error = 1
                    elif label == 'warning':
                        warning = 1
                    break

            # Add log entry to list
            log_entries.append((timestamp, log_message, error, warning))
        else:
            logger.warning(f"Timestamp not found in line: {line.strip()}")

    if not log_entries:
        # Handle the case when log_entries is empty (no relevant data was found in the log file)
        logger.warning(f"No relevant data found in the log file: {log_file_path}")
        return None  # Return None to indicate that no data was processed for this log file

    # Convert log entries into a Pandas DataFrame
    log_df = pd.DataFrame(log_entries, columns=['timestamp', 'log_message', 'error', 'warning'])

    return log_df

# Function to preprocess log data for machine learning classification
def preprocess_logs(log_df):
    # Tokenize log messages
    log_df['tokens'] = log_df['log_message'].str.split()

    # Extract relevant features
    log_df['error'] = log_df['log_message'].apply(lambda x: 1 if 'ERROR' in x else 0)
    log_df['warning'] = log_df['log_message'].apply(lambda x: 1 if 'WARNING' in x else 0)

    # Drop the 'log_message' column and move 'tokens' column to the second position
    log_df = log_df[['timestamp', 'tokens', 'error', 'warning']]

    return log_df

# Define the severity level keywords and their corresponding labels
severity_levels = {
    r'(EMERG|PANIC)': 'emergency',
    r'ALERT': 'alert',             # Add 'ALERT' as an error
    r'(CRIT|CRITICAL)': 'critical',
    r'(ERR|ERROR|FAILED)': 'error', # Make 'FAILED' case-sensitive
    r'(WARNING|WARN)': 'warning',
    r'NOTICE': 'notice',
    r'(INFO|INFORMATIONAL)': 'info',
    r'DEBUG': 'debug',
}

# Define log files to process
log_files = ['dataset/system-logs/Linux.log']

# Process each log file and save results as CSV file
for log_file in log_files:
    try:
        # Determine output file name based on input file name
        file_name = os.path.basename(log_file)
        output_directory = "dataset/system-logs/multiple-system-log-dataset/extracted-data/"
        os.makedirs(output_directory, exist_ok=True)  # Create output directory if it doesn't exist
        output_file_name = os.path.join(output_directory, f"{os.path.splitext(file_name)[0]}_extracted.csv")

        # Parse log file and preprocess data
        log_df = parse_log_file(log_file)

        # Check if log_df is None (no relevant data found in the log file)
        if log_df is None:
            continue  # Skip processing this log file

        # Drop the 'log_message' column and move 'tokens' column to the second position
        log_df = preprocess_logs(log_df)

        # Save processed data as CSV file
        log_df.to_csv(output_file_name, index=False, sep=',')

        # Print information about the generated dataset
        print(f"{output_file_name} generated with {len(log_df)} entries")

    except Exception as e:
        logger.error(f"Error processing {log_file}: {str(e)}")



# Cell output:
# dataset/system-logs/multiple-system-log-dataset/extracted-data/Mac_extracted.csv generated with 737 entries
# dataset/system-logs/multiple-system-log-dataset/extracted-data/Windows_extracted.csv generated with 188 entries
# dataset/system-logs/multiple-system-log-dataset/extracted-data/Android_extracted.csv generated with 1434 entries
# dataset/system-logs/multiple-system-log-dataset/extracted-data/Linux_extracted.csv generated with 25567 entries
