In [5]:
import pandas as pd
import os

def label_from_path(path):
    """Return the label for a source file: its base name up to the first dot.

    Example: ``'data/Mac_extracted.csv'`` -> ``'Mac_extracted'``.
    """
    return os.path.basename(path).split('.')[0]


def preprocess_system_logs(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """Clean the extracted log rows of ONE system and tag them with ``label``.

    Steps, in order:
      1. drop rows whose ``timestamp`` or ``tokens`` is missing;
      2. cast ``tokens`` to ``str``;
      3. forward-fill gaps in ``error`` and ``warning``;
      4. keep only ``timestamp``, ``tokens``, ``error``, ``warning``;
      5. add a constant ``Label`` column holding ``label``.

    Args:
        df: Raw frame for a single system. It is not modified.
        label: Value written to every row of the ``Label`` column.

    Returns:
        A new DataFrame with columns
        ``['timestamp', 'tokens', 'error', 'warning', 'Label']``.

    Raises:
        KeyError: If any of the four required columns is absent from ``df``.
    """
    cleaned = df.dropna(subset=['timestamp', 'tokens'])
    # Explicit copy so the column assignments below never write through to
    # (or warn about) the caller's frame.
    cleaned = cleaned[['timestamp', 'tokens', 'error', 'warning']].copy()
    cleaned['tokens'] = cleaned['tokens'].astype(str)
    # Series.ffill() replaces the deprecated fillna(method='ffill').  Filling
    # one system at a time keeps values from leaking across systems.
    for column in ('error', 'warning'):
        cleaned[column] = cleaned[column].ffill()
    cleaned['Label'] = label
    return cleaned


# In a notebook cell or a direct script run __name__ is "__main__", so the
# pipeline below executes as before; the guard only keeps a plain import of
# this file from touching the filesystem.
if __name__ == "__main__":
    data_root = 'dataset/system-logs/multiple-system-log-dataset'
    extracted_dir = f'{data_root}/extracted-data'
    preprocessed_dir = f'{data_root}/preprocessed-data'

    # System name -> extracted CSV to read.
    source_paths = {
        'Mac': f'{extracted_dir}/Mac_extracted.csv',
        'Windows': f'{extracted_dir}/Windows_extracted.csv',
        'Android': f'{extracted_dir}/Android_extracted.csv',
    }

    # Read in the log files
    df_mac = pd.read_csv(source_paths['Mac'])
    df_win = pd.read_csv(source_paths['Windows'])
    df_android = pd.read_csv(source_paths['Android'])
    raw_frames = {'Mac': df_mac, 'Windows': df_win, 'Android': df_android}

    # Clean each system separately and label it after its source file
    processed_frames = {
        system: preprocess_system_logs(raw, label_from_path(source_paths[system]))
        for system, raw in raw_frames.items()
    }

    # Save each system's preprocessed rows to its own CSV file
    os.makedirs(preprocessed_dir, exist_ok=True)
    for system, processed in processed_frames.items():
        processed.to_csv(
            f'{preprocessed_dir}/{system}_preprocessed.csv', index=False
        )

    # Combined, labelled frame of all systems (same variable name as before)
    df_logs = pd.concat(list(processed_frames.values()))

    print(os.listdir(preprocessed_dir))



['Android_preprocessed.csv', 'dataset', '.ipynb_checkpoints', 'Mac_preprocessed.csv', 'Windows_preprocessed.csv']
