In [None]:
# Root prepended to every path; it is also appended to sys.path further down.
PROJECT_DIR = ''
# Prefix of the input file names (e.g. '<APP_SYS_NAME>_clusters2.csv').
APP_SYS_NAME = 'BGL'
#APP_SYS_NAME = 'Thunderbird'

# Directory that holds the CSV files read by this notebook.
BASE_DIR = f'{PROJECT_DIR}output/'

In [None]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, confusion_matrix , precision_score, recall_score, f1_score
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
from scipy.spatial.distance import cosine
import warnings
import matplotlib.pyplot as plt
import sys
import subprocess
import random
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM
import matplotlib.pyplot as plt

# Seed Python's built-in `random` module.
random.seed(2)
# Must run before the import below: it puts PROJECT_DIR on the module search
# path so that ad_feature_extraction can be found.
sys.path.append(PROJECT_DIR) # this is done to make the import of ad_feature_extraction work
from ad_feature_extraction import parsers

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Remove pandas' display limits on the number of columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from matplotlib.lines import Line2D

# Seed passed as `random_state` by createLogisticRegression and createIsolationForest.
random_state=5

In [None]:
# Read the '<APP_SYS_NAME>_clusters2.csv' table from BASE_DIR.
clusters_csv_path = BASE_DIR + APP_SYS_NAME + '_clusters2.csv'
input_data = pd.read_csv(clusters_csv_path)

# Table dimensions followed by the number of rows per label value.
print("Input_data Shape:", input_data.shape)
label_counts = input_data['label'].value_counts()
print(label_counts)

# Bar chart of the same per-label row counts.
sns.countplot(x=input_data['label'])
plt.show()

In [None]:
def createGradientBoostingClassifier():
    """Return a new, unfitted GradientBoostingClassifier seeded with random_state=0."""
    classifier = GradientBoostingClassifier(random_state=0)
    return classifier

def createLogisticRegression():
    """Return a new, unfitted LogisticRegression with balanced class weights.

    The seed comes from the notebook-level ``random_state`` variable.
    """
    classifier = LogisticRegression(
        class_weight='balanced',
        random_state=random_state,
    )
    return classifier

def createXGBClassifier():
    """Return a new, unfitted XGBClassifier with 2 estimators of max depth 2."""
    xgb_params = dict(
        booster="gbtree",
        n_estimators=2,
        max_depth=2,
        learning_rate=0.3,
        objective='binary:logistic',
    )
    return XGBClassifier(**xgb_params)

def createIsolationForest(contamination=0.05):
    """Return a new, unfitted IsolationForest.

    Args:
        contamination: forwarded unchanged to IsolationForest's
            ``contamination`` argument.

    The seed comes from the notebook-level ``random_state`` variable.
    """
    detector = IsolationForest(
        contamination=contamination,
        random_state=random_state,
    )
    return detector

def createSGDOneClassSVM(nu = 0.05):
    """Return a new, unfitted SGDOneClassSVM seeded with random_state=42.

    Args:
        nu: forwarded unchanged to SGDOneClassSVM's ``nu`` argument.
    """
    detector = SGDOneClassSVM(
        nu=nu,
        shuffle=True,
        fit_intercept=True,
        random_state=42,
        tol=1e-4,
    )
    return detector

def getSupervisedModels():
    """Return fresh, unfitted supervised classifiers.

    Order: gradient boosting, logistic regression, XGBoost.
    """
    factories = (
        createGradientBoostingClassifier,
        createLogisticRegression,
        createXGBClassifier,
    )
    return [make_model() for make_model in factories]

def getUnsupervisedModels(threshold=0.05):
    """Return fresh, unfitted outlier detectors: IsolationForest, then SGDOneClassSVM.

    Args:
        threshold: single value used both as the IsolationForest
            ``contamination`` and as the SGDOneClassSVM ``nu``.
    """
    isolation_forest = createIsolationForest(threshold)
    one_class_svm = createSGDOneClassSVM(threshold)
    return [isolation_forest, one_class_svm]

# Effect of cluster filtering

In [None]:
# Cluster assignments stored in the 'clusters' and 'cluster2' columns.
clusters, clusters2 = (input_data[column].values for column in ('clusters', 'cluster2'))

print("Between first and second clustering:____________________________")
completeness = completeness_score(clusters, clusters2)
print(f"Completeness Score: \n {completeness}\n")
homogeneity = homogeneity_score(clusters, clusters2)
print(f"Homogeneity Score: \n {homogeneity}\n")

In [None]:
labels = input_data['label']

# Score each cluster assignment against the 'label' column in turn.
comparisons = (
    ("Between first clusters and label:____________________________", clusters),
    ("Between second clusters and label:____________________________", clusters2),
)
for heading, assignment in comparisons:
    print(heading)
    print(f"Completeness Score: \n {completeness_score(labels, assignment)}\n")
    print(f"Homogeneity Score: \n {homogeneity_score(labels, assignment)}\n")



# Parameter effect on supervised and unsupervised learning

In [None]:
def report(report_name, label_pred, labels):
    """Print classification metrics for a set of predictions and draw two plots.

    Args:
        report_name: heading printed first; also used as the name of the
            precision-recall curve.
        label_pred: predicted labels.
        labels: true labels.

    Prints accuracy, the per-class classification report, the confusion
    matrix, F1, precision, recall and ROC AUC, then shows a confusion-matrix
    plot followed by a precision-recall plot.
    """
    separator = "_______________________________________________"
    per_class_report = pd.DataFrame(
        classification_report(labels, label_pred, output_dict=True)
    )

    print(f"{report_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(labels, label_pred) * 100:.2f}%")
    print(separator)
    print(f"CLASSIFICATION REPORT:\n{per_class_report}")
    print(separator)
    print(f"Confusion Matrix: \n {confusion_matrix(labels, label_pred)}\n")
    print(separator)
    print(f'F1 Score: \n {f1_score(labels, label_pred)}')
    print(separator)
    print(f'Precision Score: \n {precision_score(labels, label_pred)}')
    print(separator)
    print(f'Recall Score: \n {recall_score(labels, label_pred)}')
    print(separator)
    print(f'Roc AUC Score: \n {roc_auc_score(labels, label_pred)}')

    # Both plots are built from the hard predictions.
    ConfusionMatrixDisplay.from_predictions(labels, label_pred)
    plt.show()

    PrecisionRecallDisplay.from_predictions(labels, label_pred, name=report_name)
    plt.show()

In [None]:
# Train / test tables for the supervised experiments.
train = pd.read_csv(BASE_DIR + APP_SYS_NAME + '_train_params.csv')
test = pd.read_csv(BASE_DIR + APP_SYS_NAME + '_test_params.csv')

#print("Train Shape:",train.shape,"Test Shape:",test.shape)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
ohe = OneHotEncoder()  # NOTE(review): never used in the visible cells

X_clusters = input_data.copy()
X_dummies = pd.get_dummies(X_clusters['clusters'])
train_dummies = pd.get_dummies(train['clusters'])
test_dummies = pd.get_dummies(test['clusters'])

# Fit the encoder once on the labels of all three tables and only *transform*
# each table afterwards. Calling fit_transform per table re-learns the mapping
# every time, so a table that lacks one of the classes would get different
# integer codes for the same label.
le.fit(pd.concat([X_clusters['label'], train['label'], test['label']], ignore_index=True))
X_clusters['label'] = le.transform(X_clusters['label'])
train['label'] = le.transform(train['label'])
test['label'] = le.transform(test['label'])

# Add one 'c_<cluster>' indicator column per cluster id, but only for ids seen
# in BOTH train and test; ids present in just one of them are skipped.
for col in train_dummies.columns:
    if col not in test_dummies.columns:
        continue
    col_name = 'c_' + str(col)
    train[col_name] = train_dummies[col]
    test[col_name] = test_dummies[col]

# The full table keeps an indicator column for every cluster id it contains.
for col in X_dummies.columns:
    col_name = 'c_' + str(col)
    X_clusters[col_name] = X_dummies[col]

# Align the two tables: a column that exists on one side only is added to the
# other side filled with zeros.
for col in train.columns:
    if col not in test.columns:
        test[col] = 0

for col in test.columns:
    if col not in train.columns:
        train[col] = 0

# Feature sets: cluster indicators alone, and cluster indicators plus the
# 'p_'-prefixed columns.
cols_without_params = [col for col in train.columns if col.startswith('c_')]
cols_with_params = cols_without_params + [col for col in train.columns if col.startswith('p_')]

def evaluate_training_set(report_name, cols):
    """Fit every supervised model on ``train[cols]`` and report train and test results.

    Args:
        report_name: prefix for the report headings.
        cols: feature columns selected from the notebook-level ``train`` and
            ``test`` frames; the target is their 'label' column.
    """
    features_train, target_train = train[cols], train['label']
    features_test, target_test = test[cols], test['label']

    for model in getSupervisedModels():
        model.fit(features_train, target_train)
        predicted_train = model.predict(features_train)
        predicted_test = model.predict(features_test)
        heading = report_name + ' ' + type(model).__name__
        report(heading + ' Train', predicted_train, target_train)
        report(heading + ' Test', predicted_test, target_test)

def evaluate_unsupervised(report_name, cols):
    """Run every unsupervised detector on ``X_clusters[cols]`` and report the result.

    Args:
        report_name: prefix for the report headings.
        cols: feature columns selected from the notebook-level ``X_clusters``
            frame; its 'label' column is the reference for scoring.
    """
    features = X_clusters[cols]
    target = X_clusters['label']

    for detector in getUnsupervisedModels():
        raw_pred = detector.fit_predict(features)
        # fit_predict output 1 becomes 0 and -1 becomes 1 before scoring.
        flagged = np.where(raw_pred == -1, 1, 0)
        report(report_name + ' ' + type(detector).__name__, flagged, target)
    
# Supervised comparison. Each report name now matches the feature set it is
# given: the original labels were swapped, so results for the cluster-only
# features were printed under 'With Params' and vice versa.
evaluate_training_set('Without Params', cols_without_params)
evaluate_training_set('With Params', cols_with_params)

# Rebuild both column lists from X_clusters, which has its own set of
# 'c_' / 'p_' columns, before running the unsupervised comparison.
cols_without_params = [col for col in X_clusters.columns if col.startswith('c_')]
cols_with_params = cols_without_params + [col for col in X_clusters.columns if col.startswith('p_')]

evaluate_unsupervised('Without Params', cols_without_params)
evaluate_unsupervised('With Params', cols_with_params)

# Sliding Window

In [None]:
# Read the '<APP_SYS_NAME>_sliding_window.csv' table from BASE_DIR.
sliding_window_csv_path = BASE_DIR + APP_SYS_NAME + '_sliding_window.csv'
sliding_window_df = pd.read_csv(sliding_window_csv_path)

# Feature columns are the ones whose name starts with 'cluster_'.
cluster_cols = list(filter(lambda name: name.startswith('cluster_'), sliding_window_df.columns))

# Target column for the supervised run; 'precision_label' is the alternative.
#label_field = 'precision_label'
label_field = 'recall_label'

def simple_split(df, train_fraction=0.50):
    """Split ``df`` by row order into a leading train part and a trailing test part.

    No shuffling is done: the first ``int(len(df) * train_fraction)`` rows
    become the train frame and the remaining rows the test frame.

    Args:
        df: DataFrame to split.
        train_fraction: share of rows (between 0 and 1) placed in the train
            frame. Defaults to 0.50.

    Returns:
        ``(train_df, test_df)``: two new DataFrames with the same columns as
        ``df``, each with a fresh 0-based index. Column dtypes are preserved;
        going through ``df.values`` would turn every column of a mixed-type
        frame into ``object``.

    Raises:
        ValueError: if ``train_fraction`` is outside [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction!r}")

    split_on = int(len(df) * train_fraction)
    train_df = df.iloc[:split_on].reset_index(drop=True)
    test_df = df.iloc[split_on:].reset_index(drop=True)
    return train_df, test_df

train_df, test_df = simple_split(sliding_window_df)

# Cast the target column to int in both halves.
for split_frame in (train_df, test_df):
    split_frame[label_field] = split_frame[label_field].astype('int')

# Integer-typed cluster features and the selected target for each half.
X_win_train, y_win_train = train_df[cluster_cols].astype('int'), train_df[label_field]
X_win_test, y_win_test = test_df[cluster_cols].astype('int'), test_df[label_field]

for clf in getSupervisedModels():
    clf.fit(X_win_train, y_win_train)
    train_pred = clf.predict(X_win_train)
    test_pred = clf.predict(X_win_test)
    model_name = type(clf).__name__
    # Only the test half is reported; the train report stays disabled.
    #report('Sliding Window ' + model_name + ' Train', train_pred, y_win_train)
    report('Sliding Window ' + model_name + ' Test', test_pred, y_win_test)


In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM
from sklearn.neighbors import LocalOutlierFactor

# The detectors are fitted on the whole sliding-window table (no train/test split).
X_window = sliding_window_df[cluster_cols].astype('int')
y_precision, y_recall = sliding_window_df['precision_label'], sliding_window_df['recall_label']

for clf in getUnsupervisedModels():
    # fit_predict output 1 becomes 0 and -1 becomes 1 before scoring.
    model_pred = np.where(clf.fit_predict(X_window) == -1, 1, 0)
    model_name = type(clf).__name__
    # Scored against 'recall_label' only; the 'precision_label' report stays disabled.
    #report(model_name + ' Precision', model_pred, y_precision)
    report('Sliding Window ' + model_name + ' Recall', model_pred, y_recall)


# TFIDF Before and After Filtering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The target vectors do not depend on the text column, so read them once.
y_train = train['label']
y_test = test['label']
y = input_data['label']

for txt_col in ('text', 'tfidf_text'):
    report_prefix = 'TFIDF ' + txt_col + ' '

    # Supervised: vocabulary learned on the train text, then applied to the test text.
    tfidf = TfidfVectorizer()
    X_train_tfidf = tfidf.fit_transform(train[txt_col])
    print('Shape for', txt_col, X_train_tfidf.shape)
    X_test_tfidf = tfidf.transform(test[txt_col])

    for clf in getSupervisedModels():
        clf.fit(X_train_tfidf, y_train)
        train_pred = clf.predict(X_train_tfidf)
        test_pred = clf.predict(X_test_tfidf)
        model_name = type(clf).__name__
        report(report_prefix + model_name + ' Test', test_pred, y_test)

    # Unsupervised: the same vectorizer is refitted on the text of the full table.
    X_tfidf = tfidf.fit_transform(input_data[txt_col])
    for clf in getUnsupervisedModels():
        # fit_predict output 1 becomes 0 and -1 becomes 1 before scoring.
        model_pred = np.where(clf.fit_predict(X_tfidf) == -1, 1, 0)
        model_name = type(clf).__name__
        report(report_prefix + model_name + ' Recall', model_pred, y)


# Unsupervised Thresholds vs F1 Score

In [None]:
# Build a dedicated vectorizer for this cell instead of relying on whichever
# `tfidf` object an earlier cell's loop happened to leave behind.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(input_data['tfidf_text'])
y = input_data.loc[:,'label']

# Values tried as IsolationForest `contamination` and SGDOneClassSVM `nu`.
thresholds = np.linspace(0.05, 0.5, num=10)

# model class name -> {'f1s': [...], 'precisions': [...], 'recalls': [...]},
# one list entry per threshold, in the order of `thresholds`. Entries are
# created on first use so the map follows whatever getUnsupervisedModels returns.
score_map = {}
for threshold in thresholds:
    for clf in getUnsupervisedModels(threshold):
        model_pred = clf.fit_predict(X_tfidf)
        # Map fit_predict output: 1 -> 0 first, then -1 -> 1.
        model_pred[model_pred == 1] = 0
        model_pred[model_pred == -1] = 1
        model_name = type(clf).__name__
        # Score against the `y` defined in this cell rather than the `labels`
        # variable created in a much earlier cell.
        f1 = f1_score(y, model_pred)
        precision = precision_score(y, model_pred)
        recall = recall_score(y, model_pred)
        model_scores = score_map.setdefault(
            model_name, {'f1s': [], 'precisions': [], 'recalls': []}
        )
        model_scores['f1s'].append(f1)
        model_scores['precisions'].append(precision)
        model_scores['recalls'].append(recall)
        print(f'{model_name}  F1 Score @ {round(threshold, 2)}: {round(f1, 4)}')
        print(f'{model_name} Precision @ {round(threshold, 2)}: {round(precision, 4)}')
        print(f'{model_name}    Recall @ {round(threshold, 2)}: {round(recall, 4)}')

In [None]:

# One figure per model: F1, precision and recall as a function of the threshold.
curves = (('f1s', 'F1'), ('precisions', 'precision'), ('recalls', 'recall'))

for model_name, results in score_map.items():
    fig, ax = plt.subplots(figsize=(5, 3), dpi=250)
    for score_key, legend_label in curves:
        ax.plot(thresholds, results[score_key], label=legend_label)
    ax.legend(loc='upper left')
    ax.set_title('Performance by threshold for ' + model_name + ' on ' + APP_SYS_NAME)
    ax.set_ylabel('score')
    # The swept value is `nu` for SGDOneClassSVM and `contamination` otherwise.
    ax.set_xlabel('nu' if model_name == 'SGDOneClassSVM' else 'contamination')
    plt.show()