In [None]:
import os

# Root directory of the project. Every other path in this notebook is built by
# string concatenation onto this value, so it must end with a path separator;
# os.path.join(..., '') appends one when it is missing.
# Set the CAPSTONE_PROJECT_DIR environment variable to run the notebook from a
# different checkout without editing this cell; the default is unchanged.
PROJECT_DIR = os.path.join(
    os.environ.get('CAPSTONE_PROJECT_DIR', '/home/thanuja/Dropbox/capstone/'), '')

# Name of the log dataset to analyse. It is used to build the input/output file
# names and as the key into the `parsers` mapping.
APP_SYS_NAME = 'BGL'
#APP_SYS_NAME = 'Thunderbird'

In [None]:
# Locations derived from PROJECT_DIR (which is expected to end with a path
# separator): BASE_DIR holds the input CSV and the generated summary,
# RAW_DIR holds the raw log files.
BASE_DIR = f'{PROJECT_DIR}output/'
RAW_DIR = f'{PROJECT_DIR}raw_files/'

# Imports

In [None]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, confusion_matrix , precision_score, recall_score
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
from scipy.spatial.distance import cosine
import warnings
import matplotlib.pyplot as plt
import sys
import subprocess
import random

sys.path.append(PROJECT_DIR) # this is done to make the import of ad_feature_extraction work
from ad_feature_extraction import parsers

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Preprocessing

In [None]:
# Read the '<APP_SYS_NAME>_all_params.csv' file from BASE_DIR, then report its
# size and how the rows are distributed over the values of the 'label' column.
input_csv_path = f'{BASE_DIR}{APP_SYS_NAME}_all_params.csv'
input_data = pd.read_csv(input_csv_path)

print("Input_data Shape:",input_data.shape)
label_column = input_data['label']
print(label_column.value_counts())

# Bar chart of the same per-label counts.
sns.countplot(x=label_column)
plt.show()

In [None]:
# Replace the 'label' column with integer class codes.
le = LabelEncoder()
le.fit(input_data['label'])
input_data['label'] = le.transform(input_data['label']) # false:0 and true:1

# Keep only the rows that actually have text to vectorize, and pick the labels
# of exactly those rows (matched through the surviving index values).
X = input_data.loc[:, ['tfidf_text']].dropna()
y = input_data['label'].loc[X.index].to_numpy()

print("columns for the X file", X.columns, sep='')
print(X.shape,y.shape)

In [None]:
# Vectorize the log text with TF-IDF, cluster it with K-Means, and report the
# homogeneity and completeness of the clusters against the labels.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X['tfidf_text'])
print('tfidf shape', X_tfidf.shape)

# One more than the largest id in the 'clusters' column.
# NOTE(review): this assumes the ids are 0-based and contiguous - confirm.
# int() makes k a plain Python integer for KMeans and for np.zeros(k) later on,
# and fails loudly if the column is all-NaN.
k = int(input_data['clusters'].max()) + 1
print('Using k = ', k)

# K-Means starts from random centroids. Without a fixed seed the cluster ids,
# the anomaly/normal assignment derived from them and every metric below change
# from run to run, so pin the seed to make "Restart & Run All" reproducible.
KMEANS_RANDOM_STATE = 42
kmeans = KMeans(n_clusters=k, random_state=KMEANS_RANDOM_STATE)
kmeans.fit(X_tfidf)

pred = kmeans.predict(X_tfidf)
labels = y

print("_______________________________________________")
print(f"Completeness Score: \n {completeness_score(labels, pred)}\n")
print(f"Homogeneity Score: \n {homogeneity_score(labels, pred)}\n") 

In [None]:
# Attach the predicted cluster id and the encoded label to every vectorized
# row, then count the log lines for each (cluster, label) combination.
output = X.assign(
    cluster2=pred,
    label=input_data.loc[X.index, 'label'],
)
print(output.groupby(['cluster2', 'label']).count())

# Use the ground-truth labels to determine which clusters are anomalies

In [None]:
# Per-(cluster, label) line counts; after reset_index the count lives in the
# 'tfidf_text' column.
grouped_counts = output.groupby(['cluster2', 'label']).count()
print(grouped_counts)
counts_by_label_df = grouped_counts.reset_index()
pos_df = counts_by_label_df[counts_by_label_df['label'] == 1]
neg_df = counts_by_label_df[counts_by_label_df['label'] == 0]


def _lines_in_cluster(counts_df, cluster_id):
    """Return the number of lines `counts_df` records for `cluster_id` (0 if absent)."""
    return counts_df.loc[counts_df['cluster2'] == cluster_id, 'tfidf_text'].sum()


# A cluster is an anomaly cluster when label-1 lines strictly outnumber
# label-0 lines.
anomaly_clusters = []
for _, row in pos_df.iterrows():
    cluster = row['cluster2']
    pos = row['tfidf_text']
    neg = _lines_in_cluster(neg_df, cluster)
    print(cluster, 'pos neg', pos, neg)
    if pos > neg:
        anomaly_clusters.append(cluster)

# Every other cluster that contains label-0 lines (ties included) is normal.
normal_clusters = [
    row['cluster2']
    for _, row in neg_df.iterrows()
    if row['tfidf_text'] >= _lines_in_cluster(pos_df, row['cluster2'])
]

print('anomaly clusters', anomaly_clusters)
print('normal clusters', normal_clusters)

# Create a summary report for the clusters, giving example log lines for each cluster.

In [None]:
# Write one CSV row per cluster: whether it was classified as anomalous, its
# per-label line counts, its share of all rows, and the log lines closest to
# and farthest from the cluster centroid (by cosine distance).
summary_file_name = BASE_DIR + APP_SYS_NAME + '_cluster_summary.csv'

# newline='' is what the csv module requires of the file object so that it can
# control line endings itself (no blank rows on Windows, embedded newlines in
# log text survive). The explicit UTF-8 encoding matches the default that
# pd.read_csv uses when the file is read back below.
with open(summary_file_name, 'w', newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['is_anomaly', 'anomaly_count', 'normal_count',
                     'cluster', 'pct_of_total',
                     'most_similar_sample', 'most_similar_distance',
                     'least_similar_sample', 'least_similar_distance'])

    def print_clusters(clusters, label):
        """Write one summary row to `writer` for each cluster id in `clusters`.

        Args:
            clusters: iterable of cluster ids (values found in `pred`).
            label: 'anomaly' marks the rows with is_anomaly=1; any other value
                marks them with is_anomaly=0.
        """
        num_rows = X.shape[0]
        for ac in clusters:
            in_cluster = pred == ac
            indices = X.index[in_cluster]
            rows_for_cluster = X_tfidf[in_cluster]
            centroid = kmeans.cluster_centers_[ac]
            least_distance = greatest_distance = float('nan')
            most_similar = least_similar = None
            for i in range(rows_for_cluster.shape[0]):
                row = rows_for_cluster[i].toarray().ravel()
                distance = cosine(centroid, row)
                # The cosine distance of an all-zero row is NaN, and every
                # comparison with NaN is False. Explicitly let a real distance
                # replace a NaN so one such row cannot freeze the search.
                if most_similar is None or np.isnan(least_distance) or distance < least_distance:
                    least_distance = distance
                    most_similar = i

                if least_similar is None or np.isnan(greatest_distance) or distance > greatest_distance:
                    greatest_distance = distance
                    least_similar = i

            pct = round(np.count_nonzero(in_cluster) / num_rows, 2)
            writer.writerow([1 if label == 'anomaly' else 0,
                             pos_df[pos_df["cluster2"] == ac]['tfidf_text'].sum(),
                             neg_df[neg_df["cluster2"] == ac]['tfidf_text'].sum(),
                             ac, pct,
                             input_data.loc[indices[most_similar], 'text'], round(least_distance, 3),
                             input_data.loc[indices[least_similar], 'text'], round(greatest_distance, 3)])

    print_clusters(anomaly_clusters, 'anomaly')
    print_clusters(normal_clusters,  ' normal')

# Read the report back so the notebook displays it as a table.
summary_df = pd.read_csv(summary_file_name)
summary_df

# Use the TF-IDF vectorizer and KMeans model fitted above to predict anomalies in the original raw file

In [None]:
# Lookup table indexed by cluster id: 1.0 for anomaly clusters, 0.0 otherwise.
cluster_to_label_map = np.zeros(k)
cluster_to_label_map[np.asarray(anomaly_clusters, dtype=int)] = 1
# Written after the anomaly entries so that a normal assignment takes
# precedence, exactly as in a sequential fill.
cluster_to_label_map[np.asarray(normal_clusters, dtype=int)] = 0

def _score_block(row_block, y_block):
    """Classify one block of log lines and count its FP / TP / FN.

    Uses the notebook-level `tfidf`, `kmeans` and `cluster_to_label_map`.

    Args:
        row_block: list of log-line texts.
        y_block: list of 0/1 labels, aligned with `row_block`.

    Returns:
        Tuple `(false_positives, true_positives, false_negatives)` of ints.
    """
    X_block = tfidf.transform(row_block)
    block_clusters = kmeans.predict(X_block)
    # Map every predicted cluster id to its 0/1 label in one indexed lookup.
    block_pred = np.asarray(cluster_to_label_map)[block_clusters]
    y_true = np.asarray(y_block)
    fp = np.count_nonzero((block_pred == 1) & (y_true == 0))
    tp = np.count_nonzero((block_pred == 1) & (y_true == 1))
    fn = np.count_nonzero((block_pred == 0) & (y_true == 1))
    return fp, tp, fn


def process_raw_file(raw_filename, parser, block_size=100000, max_lines=3000000):
    """Predict anomalies for a raw log file and print precision and recall.

    The file is streamed and scored in blocks because the whole dataset does
    not fit in memory. Running totals are printed after every block, and the
    overall precision and recall are printed at the end.

    Args:
        raw_filename: path of the raw log file.
        parser: callable invoked as `parser(file_path, line)`, where
            `file_path` is `raw_filename` split on '/'. It must return a
            4-tuple `(epochts, text, is_anomaly, filename)`; only `text` and
            `is_anomaly` are used here.
        block_size: number of lines vectorized and classified at a time.
        max_lines: stop after this many lines; pass None to process the
            whole file.

    Returns:
        None. Results are printed.
    """
    file_path = raw_filename.split('/')
    count = 0
    fp_total = tp_total = fn_total = 0
    row_block = []
    y_block = []
    with open(raw_filename, "r", encoding="utf8", errors='ignore') as raw_file:
        for line in raw_file:
            line = line.rstrip('\n')
            epochts, text, is_anomaly, filename = parser(file_path, line)
            row_block.append(text)
            y_block.append(1 if is_anomaly else 0)
            count += 1

            if len(row_block) == block_size:
                fp, tp, fn = _score_block(row_block, y_block)
                fp_total += fp
                tp_total += tp
                fn_total += fn
                print(count, fp_total, tp_total, fn_total)
                row_block = []
                y_block = []

            if max_lines is not None and count >= max_lines:
                break

    # Score the trailing partial block. Without this, the lines after the last
    # full block are dropped, and a file shorter than one block is never
    # scored at all.
    if row_block:
        fp, tp, fn = _score_block(row_block, y_block)
        fp_total += fp
        tp_total += tp
        fn_total += fn
        print(count, fp_total, tp_total, fn_total)

    # Precision / recall are undefined when nothing was predicted positive /
    # nothing is actually positive; report NaN instead of raising.
    predicted_pos = fp_total + tp_total
    actual_pos = fn_total + tp_total
    precision = tp_total / predicted_pos if predicted_pos else float('nan')
    recall = tp_total / actual_pos if actual_pos else float('nan')
    print(raw_filename, 'precision', precision)
    print(raw_filename, 'recall', recall)

# Score the raw log of the selected system with its matching line parser.
raw_log_path = f'{RAW_DIR}{APP_SYS_NAME}/{APP_SYS_NAME}.log'
process_raw_file(raw_log_path, parsers[APP_SYS_NAME])

# Compute precision, recall, homogeneity, and the confusion matrix

In [None]:
def report(report_name, pred, labels):
    """Print and plot quality metrics for the cluster-based anomaly predictions.

    Cluster ids are converted to 0/1 labels through the notebook-level
    `cluster_to_label_map`. The classification metrics compare those labels
    with `labels`; completeness and homogeneity compare the raw cluster ids
    with `labels`.

    Args:
        report_name: name shown in the report header and passed as the
            precision-recall plot's `name`.
        pred: array of integer cluster ids, one per sample.
        labels: array of true labels, aligned with `pred`.

    Returns:
        None. Metrics are printed and two figures are shown.
    """
    # Map every cluster id to its 0/1 label in one indexed lookup.
    label_pred = np.asarray(cluster_to_label_map)[np.asarray(pred)]
    kmeans_report = pd.DataFrame(classification_report(labels, label_pred, output_dict=True))
    # The f prefix is required; without it the literal text "{report_name}"
    # is printed instead of the report name.
    print(f"{report_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(labels, label_pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{kmeans_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(labels, label_pred)}\n")

    print("_______________________________________________")
    print(f"Completeness Score: \n {completeness_score(labels, pred)}\n")
    print("_______________________________________________")
    print(f"Homogeneity Score: \n {homogeneity_score(labels, pred)}\n")

    print("_______________________________________________")
    print(f'Precision Score: \n {precision_score(labels, label_pred)}')
    print("_______________________________________________")
    print(f'Recall Score: \n {recall_score(labels, label_pred)}')
    print("_______________________________________________")
    print(f'Roc AUC Score: \n {roc_auc_score(labels, label_pred)}')

    ConfusionMatrixDisplay.from_predictions(labels, label_pred)
    plt.show()

    PrecisionRecallDisplay.from_predictions(labels, label_pred, name=report_name)
    plt.show()

# Evaluate the training-set cluster assignments against the encoded labels.
report(APP_SYS_NAME, pred, output['label'].to_numpy())