In [None]:
import os, sys
import random
import argparse
from pathlib import Path
import logging
from torch import cuda
from tqdm import tqdm 
import pickle

from collections import Counter

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from flair.data import Sentence
from flair.models import SequenceTagger

import spacy

# from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers_interpret import SequenceClassificationExplainer
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification


# %% Loading custom libraries 
sys.path.append('../metrics/')
# from performance import f1_score_func, accuracy_per_class

# Loading the custom library
sys.path.append('../process/')
from load_data import FetchData, ContextualizedData
from utils import merge_and_create_dataframe, train_model, evaluate_model, clean_and_merge_data_for_tokenization, add_tokens_to_vocabulary

In [None]:
# Run on the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Settings shared by the cells below (kept as a plain dict and read with args[...]).
args = {
    'ads_count': 20,
    'batch_size': 64,  # DataLoader batch size used for evaluation
    'cuda': False,
    'data': 'alpha-dreams',
    'data_dir': '../data',  # folder holding the preprocessed_*.csv files
    'delta': 0.01,
    'dropout': 0.65,
    'early_stopping': True,
    'eval_per_steps': 2000,
    'hidden_states': 512,
    'load_model': 'bert_model.model',
    'lr': 4e-05,
    'max_seq_len': 512,  # tokenised sequences are padded to this length
    'mode': 'train',
    'model': 'bert',
    'n_splits': 5,
    'nb_epochs': 10,
    'patience': 3,
    'preprocess_flag': False,
    'save_dir': '/workspace/persistent/sybils-identification/train/../models/merged',
    'seed': 1111,  # passed to FetchData and train_test_split
    'setting': 'high',
    'split_ratio': 0.25,
    'version': 'full',
}

In [None]:
# %% Loading the datasets
# Columns kept from every marketplace dump; any other CSV column is ignored.
_AD_COLUMNS = ['marketplace', 'title', 'vendor', 'prediction', 'ships_to', 'ships_from', 'description']

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. `on_bad_lines="warn"` is the
# documented equivalent of `error_bad_lines=False` with the default warning (skip the row, warn).
_PANDAS_MAJOR_MINOR = tuple(int(part) for part in pd.__version__.split(".")[:2])
_BAD_LINE_KWARGS = {"on_bad_lines": "warn"} if _PANDAS_MAJOR_MINOR >= (1, 3) else {"error_bad_lines": False}


def _load_marketplace_csv(file_name):
    """Read one preprocessed marketplace CSV from ``args['data_dir']`` and drop duplicate rows.

    Args:
        file_name: Name of the CSV file inside ``args['data_dir']``.

    Returns:
        A DataFrame restricted to ``_AD_COLUMNS`` with exact duplicate rows removed.
    """
    return pd.read_csv(
        os.path.join(args['data_dir'], file_name),
        lineterminator='\n',
        usecols=_AD_COLUMNS,
        **_BAD_LINE_KWARGS,
    ).drop_duplicates()


alpha_df = _load_marketplace_csv("preprocessed_alpha.csv")
dreams_df = _load_marketplace_csv("preprocessed_dreams.csv")
silk_df = _load_marketplace_csv("preprocessed_silk.csv")
data_df = {"alpha": alpha_df, "dreams": dreams_df, "silk": silk_df}

In [None]:
# split_data() yields two 7-element groups (train, test) covering the same dataset
# combinations in the same order; unpack them into individually named frames.
(
    (
        train_alpha_dreams,
        train_dreams_silk,
        train_alpha_silk,
        train_alpha_dreams_silk,
        train_alpha,
        train_dreams,
        train_silk,
    ),
    (
        test_alpha_dreams,
        test_dreams_silk,
        test_alpha_silk,
        test_alpha_dreams_silk,
        test_alpha,
        test_dreams,
        test_silk,
    ),
) = FetchData(
    data_df,
    args["version"],
    args["data"],
    args["split_ratio"],
    args["preprocess_flag"],
    args["setting"],
    args["ads_count"],
    args["seed"],
).split_data()

In [None]:
# Vectorizing class labels
# Every vendor seen in either Dreams split gets an integer id, in order of first appearance.
data = pd.concat([train_dreams, test_dreams])
all_vendors = list(data['vendor'].unique())
vendor_to_idx_dict = {vendor_name: index for index, vendor_name in enumerate(all_vendors)}

# Encode with an exact dictionary lookup. `Series.replace(..., regex=True)` would treat each
# vendor name as a regular expression, so a name that is a substring of another vendor (or that
# contains regex metacharacters) could be encoded with the wrong id; it also scans the column
# once per vendor.
# NOTE(review): if the loaded checkpoint was trained on labels produced by a regex-based
# replace, confirm that both encodings agree for this dataset.
train_dreams['vendor'] = train_dreams['vendor'].map(vendor_to_idx_dict)
test_dreams['vendor'] = test_dreams['vendor'].map(vendor_to_idx_dict)

In [None]:
# Bring both Dreams splits into the frame layout produced by merge_and_create_dataframe,
# then hold out 5% of the training rows as a validation set (seeded, shuffled).
train_df, test_df = (merge_and_create_dataframe(split) for split in (train_dreams, test_dreams))
train_df, val_df = train_test_split(
    train_df,
    test_size=0.05,
    random_state=args["seed"],
    shuffle=True,
)

In [None]:
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification

In [None]:
# Directory that holds the fine-tuned checkpoint loaded further below.
model_path = '../models/bert'

# Tokenizer and classifier are created from the stock `bert-base-uncased` weights;
# the classification head gets one output per known vendor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(vendor_to_idx_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(device)

In [None]:
# load model
# `map_location=device` lets a checkpoint that was saved on a GPU be restored on a CPU-only
# machine, which `device` may resolve to in this notebook.
checkpoint_path = os.path.join(model_path, 'bert_model.model')
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()       # inference mode: disables dropout
model.zero_grad()  # clear any stale gradients before computing attributions
model.to(device)

# Helper Functions

In [None]:
# Special-token ids used when building the real and the baseline ("reference") sequences:
#   ref_token_id - [PAD]; stands in for every content token of the baseline
#   sep_token_id - [SEP]; appended after the text
#   cls_token_id - [CLS]; prepended before the text
ref_token_id, sep_token_id, cls_token_id = (
    tokenizer.pad_token_id,
    tokenizer.sep_token_id,
    tokenizer.cls_token_id,
)


In [None]:
def predict(inputs):
    """Run the classifier on ``inputs`` and return the first element of its output.

    NOTE(review): for a sequence-classification model called without labels the first
    element is presumably the logits tensor - confirm against the transformers version in use.
    """
    model_output = model(inputs)
    return model_output[0]

In [None]:
def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):
    """Tokenise ``text`` and build the real sequence plus its attribution baseline.

    Both sequences have the layout ``[CLS] ... [SEP]``; in the baseline every content token
    is replaced by ``ref_token_id``. No truncation is applied here.

    Returns:
        ``(input_ids, ref_input_ids, n_content_tokens)`` where both tensors have shape
        ``(1, n_content_tokens + 2)`` and live on ``device``.
    """
    content_ids = tokenizer.encode(text, add_special_tokens=False)
    n_content = len(content_ids)

    real_sequence = [cls_token_id, *content_ids, sep_token_id]
    baseline_sequence = [cls_token_id, *([ref_token_id] * n_content), sep_token_id]

    return (
        torch.tensor([real_sequence], device=device),
        torch.tensor([baseline_sequence], device=device),
        n_content,
    )

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    """Build segment ids for ``input_ids`` and an all-zero baseline of the same shape.

    Positions up to and including ``sep_ind`` get segment 0, later positions segment 1.

    Returns:
        ``(token_type_ids, ref_token_type_ids)``, each of shape ``(1, seq_len)`` on ``device``.
    """
    n_positions = input_ids.size(1)
    segment_row = [int(position > sep_ind) for position in range(n_positions)]
    token_type_ids = torch.tensor([segment_row], device=device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    """Build position ids ``0..seq_len-1`` and an all-zero baseline, both shaped like ``input_ids``.

    Returns:
        ``(position_ids, ref_position_ids)`` as long tensors on ``device`` (expanded views).
    """
    n_positions = input_ids.size(1)

    real_positions = torch.arange(n_positions, dtype=torch.long, device=device)
    # A random permutation (`torch.randperm`) would be another possible baseline.
    baseline_positions = torch.zeros(n_positions, dtype=torch.long, device=device)

    return (
        real_positions.unsqueeze(0).expand_as(input_ids),
        baseline_positions.unsqueeze(0).expand_as(input_ids),
    )
    
def construct_attention_mask(input_ids):
    """Return a mask of ones with the same shape, dtype and device as ``input_ids``."""
    return torch.full_like(input_ids, 1)

In [None]:
def custom_forward(inputs, target_class=1):
    """Forward function for Captum: probability of ``target_class`` for every input row.

    One value is returned per row of ``inputs``. Integrated gradients evaluates several
    interpolated inputs per forward pass (``internal_batch_size``); returning only the first
    row's probability would give the remaining rows of each batch no gradient.

    Args:
        inputs: Batch of token ids as accepted by ``predict``.
        target_class: Index of the class whose softmax probability is explained.
            Defaults to 1.

    Returns:
        1-D tensor of shape ``(batch,)``; for a single input this is a one-element tensor.
    """
    logits = predict(inputs)
    return torch.softmax(logits, dim=1)[:, target_class]

In [None]:
from captum.attr import LayerIntegratedGradients

In [None]:
# Layer-wise integrated gradients: attributes the output of `custom_forward` to the
# BERT embedding layer (`model.bert.embeddings`).
lig = LayerIntegratedGradients(custom_forward, model.bert.embeddings)

In [None]:
# Advertisement to explain. `text` is not assigned anywhere else in this notebook, so the first
# test advertisement is used here; this matches the disabled visualisation cell, which takes
# the true label from `test_df['labels'].iloc[0]`.
# NOTE(review): pick a different row if another advertisement was intended.
text = test_df['text'].iloc[0]

# Real inputs and their baselines ("ref_*") for integrated gradients.
input_ids, ref_input_ids, sep_id = construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id)
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

# Word-piece strings for the single sequence, in input order.
indices = input_ids[0].detach().tolist()
all_tokens = tokenizer.convert_ids_to_tokens(indices)

In [None]:
# Sanity check: display what `predict` returns for the advertisement built above.
predict(input_ids)

In [None]:
# Sanity check: display the value that is handed to the attribution method.
custom_forward(input_ids)

In [None]:
# Attribute the `custom_forward` output to the embedding layer, interpolating from the
# baseline sequence to the real one in 700 steps, evaluated 3 at a time.
# `delta` is the convergence delta requested via `return_convergence_delta=True`.
attributions, delta = lig.attribute(inputs=input_ids,
                                    baselines=ref_input_ids,
                                    n_steps=700,
                                    internal_batch_size=3,
                                    return_convergence_delta=True)

In [None]:
score = predict(input_ids)

# Index of the highest-scoring class and the softmax probability of class index 1.
predicted_label = torch.argmax(score[0]).cpu().numpy()
class_one_probability = torch.softmax(score, dim = 1)[0][1].cpu().detach().numpy()

print('Advertisement: ', text)
print('Prediction: ' + str(predicted_label) + ', Probability positive: ' + str(class_one_probability))



In [None]:
import heapq

In [None]:
# Top-10 vendors for the explained advertisement.
# The scores are converted to softmax probabilities because the chart built from `d` labels
# its y-axis "Probability"; the ranking is the same as for the raw scores.
a = torch.softmax(score[0], dim=0).cpu().detach().numpy()
b = heapq.nlargest(10, range(len(a)), a.take)  # class indices, highest probability first

# One inverse mapping instead of a linear search through the dict per index.
idx_to_vendor = {index: vendor for vendor, index in vendor_to_idx_dict.items()}
c = [idx_to_vendor[index] for index in b]
d = [a[index] for index in b]
# NOTE(review): the label of the top-ranked vendor is overwritten with a fixed name,
# whatever the model predicted - confirm this is intended before reusing the figure.
c[0] = "mrcronk"

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objects as go

# Bar chart of the selected vendors (x) against their scores (y).
fig = go.Figure([go.Bar(x=c, y=d)])
fig.update_layout(
    barmode='relative',
    title_text='',
    xaxis_title="Vendors",
    yaxis_title="Probability",
    # Plain dicts replace the deprecated `go.XAxis` / `go.YAxis` classes; same settings.
    xaxis=dict(showticklabels=True),
    yaxis=dict(showticklabels=True),
)
# NOTE(review): `plotly.offline.plot` writes an HTML document, not a PDF, even though the
# file name ends in ".pdf" - confirm which output format is wanted.
plotly.offline.plot(fig, filename =  "vendor_prob.pdf",auto_open=False)

In [None]:
"""def summarize_attributions(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions"""

In [None]:
"""attributions_sum = summarize_attributions(attributions)"""

In [None]:
"""# storing couple samples in an array for visualization purposes
score_vis = viz.VisualizationDataRecord(attributions_sum,
                                        torch.softmax(score, dim = 1)[0][1],
                                        torch.argmax(torch.softmax(score, dim = 1)[0]),
                                        test_df['labels'].iloc[0],
                                        text,
                                        attributions_sum.sum(),       
                                        all_tokens,
                                        delta)"""

In [None]:
"""print('\033[1m', 'Visualization For Score', '\033[0m')
viz.visualize_text([score_vis])"""

In [None]:
# Explainer from transformers_interpret; calling it on a text yields (token, attribution)
# pairs, which the cells below convert with dict().
cls_explainer = SequenceClassificationExplainer(model, tokenizer)

In [None]:
# Small English spaCy pipeline, used below to read part-of-speech tags (`token.pos_`).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Show the name of the vendor encoded as label 887, then keep only that vendor's test ads.
vendor_names = list(vendor_to_idx_dict.keys())
vendor_indices = list(vendor_to_idx_dict.values())
print(vendor_names[vendor_indices.index(887)])
samples = test_df.loc[test_df['labels'] == 887]

In [None]:
# Display the fifth advertisement (position 4) of the selected vendor.
# NOTE(review): the explanation cell below uses position 1, not 4 - confirm which is meant.
samples['text'].iloc[4]

In [None]:
# Explain the advertisement at position 1 of the selected vendor.
word_attributions = cls_explainer(samples['text'].iloc[1])
# One entry per distinct token: when a token repeats, the last attribution wins.
semantics_dict = {token: attribution for token, attribution in word_attributions}
# Space-joined distinct tokens, in order of first appearance.
advertisement = " ".join(semantics_dict)

In [None]:
"""cls_explainer.visualize("class_687_sample_4.pdf")"""

In [None]:
# Part-of-speech tag of every spaCy token in the re-joined advertisement.
doc = nlp(advertisement)
pos_tags = list(map(lambda spacy_token: spacy_token.pos_, doc))

# Semantic explanations

In [None]:
# Part-of-speech tag of every spaCy token in the re-joined advertisement.
doc = nlp(advertisement)
pos_tags = list(map(lambda spacy_token: spacy_token.pos_, doc))

In [None]:
# Parallel lists: distinct tokens and the attribution stored for each of them.
tokens_column = [*semantics_dict]
atrributions_column = [*semantics_dict.values()]


In [None]:
def merge_data(samples):
    """Collect tokens, attributions and POS tags for every advertisement in ``samples``.

    Each advertisement in ``samples['text']`` is explained with ``cls_explainer``. All
    (token, attribution) pairs are kept, so a token that occurs several times contributes
    every occurrence instead of being collapsed to one entry. POS tags come from running the
    spaCy pipeline components on a document built from exactly those tokens, so there is one
    tag per token and the three lists stay aligned.

    Args:
        samples: DataFrame with a ``text`` column.

    Returns:
        ``(tokens, attributions, pos_tags)`` - three flat lists of equal length covering all
        advertisements in order.
    """
    from spacy.tokens import Doc  # local import: only needed to build pre-tokenised docs

    token_list, semantics_list, pos_list = [], [], []
    for ad_text in samples['text']:
        word_attributions = cls_explainer(ad_text)
        if not word_attributions:
            continue  # nothing to record for this advertisement

        ad_tokens = [token for token, _ in word_attributions]
        ad_scores = [score for _, score in word_attributions]

        # Keep the explainer's tokenisation: spaCy's own tokenizer could split these
        # tokens differently and break the alignment.
        doc = Doc(nlp.vocab, words=ad_tokens)
        for _, component in nlp.pipeline:
            doc = component(doc)

        token_list.extend(ad_tokens)
        semantics_list.extend(ad_scores)
        pos_list.extend(token.pos_ for token in doc)

    return token_list, semantics_list, pos_list

In [None]:
# a: tokens, b: attributions, c: POS tags, gathered over all ads of the selected vendor.
# (These names previously held the top-vendor scores and labels; they are reused here.)
a, b, c = merge_data(samples)

In [None]:
# Compare the lengths of the three lists before combining them into one frame.
len(a), len(b), len(c)

In [None]:
# One row per token; the POS list is cut to the token count so the columns have equal length.
df = pd.DataFrame(dict(tokens=a, attribution=b, pos=c[:len(a)]))

In [None]:
# Despite its name, this holds the distinct part-of-speech tags (values of the "pos" column).
all_unique_tokens = df["pos"].unique()
df

In [None]:
# Nested summary {POS tag: {token: mean attribution}} over all rows of `df`.
final_token_dict = {}
for pos_tag in all_unique_tokens:
    rows_for_tag = df[df["pos"] == pos_tag]
    final_token_dict[pos_tag] = {
        word: rows_for_tag[rows_for_tag["tokens"] == word]["attribution"].mean()
        for word in rows_for_tag['tokens'].unique()
    }

# Evaluating trained model

In [None]:
# Vendors across the train and test splits of all three marketplaces.
# NOTE(review): an earlier cell already overwrote the `vendor` column of train_dreams and
# test_dreams with integer ids, so on a straight top-to-bottom run this mixes ids with
# vendor names - confirm the intended inputs.
data = pd.concat([train_dreams, test_dreams, train_alpha, test_alpha, train_silk, test_silk])
all_vendors = list(data['vendor'].unique())

In [None]:
# Rebuild the vendor -> label id mapping from the combined vendor list (order of first appearance).
vendor_to_idx_dict = {vendor_name:index for index, vendor_name in enumerate(all_vendors)}

In [None]:
"""
train_dreams['vendor'] = train_dreams['vendor'].replace(vendor_to_idx_dict, regex=True)
test_dreams['vendor'] = test_dreams['vendor'].replace(vendor_to_idx_dict, regex=True)
train_alpha['vendor'] = train_alpha['vendor'].replace(vendor_to_idx_dict, regex=True)
test_alpha['vendor'] = test_alpha['vendor'].replace(vendor_to_idx_dict, regex=True)
train_silk['vendor'] = train_silk['vendor'].replace(vendor_to_idx_dict, regex=True)
test_silk['vendor'] = test_silk['vendor'].replace(vendor_to_idx_dict, regex=True)"""

In [None]:
# Encode the vendor column of the merged training splits with the combined mapping.
# NOTE(review): with `regex=True` every vendor name acts as a regular-expression pattern, so a
# name contained in another vendor's name can match it as well - confirm this is acceptable.
train_alpha_silk['vendor'] = train_alpha_silk['vendor'].replace(vendor_to_idx_dict, regex=True)
# Reads from `train_alpha_dreams_silk`; the misspelled name used before raised a NameError.
train_alpha_dreams_silk['vendor'] = train_alpha_dreams_silk['vendor'].replace(vendor_to_idx_dict, regex=True)
train_dreams_silk['vendor'] = train_dreams_silk['vendor'].replace(vendor_to_idx_dict, regex=True)

In [None]:
# replace <PATH-TO-SAVED-MODEL> with the real path of the saved model
model_path = '../models/merged/bert'

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                            num_labels=len(vendor_to_idx_dict),
                                            output_attentions=False,
                                            output_hidden_states=False).to(device)

In [None]:
# load model
# `map_location=device` lets a checkpoint that was saved on a GPU be restored on a CPU-only
# machine, which `device` may resolve to in this notebook.
checkpoint_path = os.path.join("../models/merged/bert/", 'epoch_38.model')
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()       # inference mode: disables dropout
model.zero_grad()  # clear any stale gradients

In [None]:
# Make sure the restored model sits on the selected device (also displays the module).
model.to(device)

In [None]:
def create_data_for_evaluation(test_data):
    """Tokenise a test split and wrap it in a sequential DataLoader.

    ``test_data`` is passed to ``merge_and_create_dataframe`` unchanged; the ``text`` and
    ``labels`` columns of its result are used.
    NOTE(review): no vendor encoding happens here, so the labels are assumed to be integer
    ids already - confirm against the caller.

    Args:
        test_data: DataFrame accepted by ``merge_and_create_dataframe``.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches of size
        ``args["batch_size"]`` in dataset order.
    """
    test_df = merge_and_create_dataframe(test_data)

    # `padding='max_length'` with explicit truncation replaces the deprecated
    # `pad_to_max_length=True`; every row is exactly `max_seq_len` ids long.
    encoded_data_test = tokenizer.batch_encode_plus(
        list(test_df.text.values),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=args['max_seq_len'],
        return_tensors='pt',
    )
    input_ids_test = encoded_data_test['input_ids']
    attention_masks_test = encoded_data_test['attention_mask']
    labels_test = torch.tensor(list(test_df.labels.values))

    dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
    # Sequential sampling keeps the predictions aligned with the row order of `test_df`.
    dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=args["batch_size"])

    return dataloader_test

# `f1_score_func` is not imported by this notebook (its import from `performance` is commented
# out), so resolve it here: prefer the project helper, otherwise fall back to sklearn.
try:
    from performance import f1_score_func
except ImportError:
    from sklearn.metrics import f1_score

    def f1_score_func(preds, labels):
        """Weighted F1 of the arg-max class of each ``preds`` row against ``labels``.

        NOTE(review): stand-in for the project helper; "weighted" follows the report lines
        printed below, and ``preds`` is assumed to be ``(n_samples, n_classes)`` as in
        ``accuracy_per_class`` - confirm both.
        """
        return f1_score(labels.flatten(), np.argmax(preds, axis=1).flatten(), average='weighted')


# (name shown in the report line, held-out split), evaluated in this order.
evaluation_sets = [
    ("Dreams", test_dreams),
    ("Alphabay", test_alpha),
    ("Silk-Road", test_silk),
    ("Alphabay-Silk-Road", test_alpha_silk),
    ("Alphabay-Dream", test_alpha_dreams),
    ("Dreams-Silk-Road", test_dreams_silk),
    ("Alphabay-Dreams-Silk-Road", test_alpha_dreams_silk),
]

for dataset_name, test_split in evaluation_sets:
    dataloader_test = create_data_for_evaluation(test_split)
    _, predictions, true_vals = evaluate_model(model, dataloader_test, device)
    val_f1 = f1_score_func(predictions, true_vals)
    print(f'F1 Score for {dataset_name} dataset (Weighted): {val_f1}')

# Finding accuracy per class 

In [None]:
def accuracy_per_class(preds, labels, label_dict):
    """Compute the accuracy separately for every class that occurs in ``labels``.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the arg-max of a row is
            taken as the predicted class.
        labels: Array of true class indices.
        label_dict: Mapping ``class name -> class index``.

    Returns:
        Dict ``class name -> fraction of that class's samples predicted correctly``, with
        classes in ascending index order.
    """
    index_to_name = {index: name for name, index in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    per_class = {}
    for class_index in np.unique(actual):
        in_class = actual == class_index
        n_correct = int(np.count_nonzero(predicted[in_class] == class_index))
        n_total = int(np.count_nonzero(in_class))
        per_class[index_to_name[class_index]] = float(n_correct / n_total)

    return per_class

In [None]:
def create_data_for_evaluation(test_data):
    """Encode the vendors of a test split and wrap the split in a sequential DataLoader.

    Vendors are translated with an exact lookup in ``vendor_to_idx_dict``; a vendor missing
    from the mapping falls into the ``'others'`` bucket. The caller's frame is left
    untouched: the encoding is applied to a copy.
    NOTE(review): if ``'others'`` is not a key of ``vendor_to_idx_dict``, unknown vendors keep
    the string ``'others'`` as their label - confirm how ``merge_and_create_dataframe``
    handles that.

    Args:
        test_data: DataFrame with a ``vendor`` column, accepted by
            ``merge_and_create_dataframe``.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches of size
        ``args["batch_size"]`` in dataset order.
    """
    # Exact-match lookup: a regex-based replace would let one vendor name match inside another.
    unknown_label = vendor_to_idx_dict.get('others', 'others')
    encoded_data = test_data.copy()
    encoded_data['vendor'] = [vendor_to_idx_dict.get(vendor, unknown_label) for vendor in encoded_data['vendor']]
    test_df = merge_and_create_dataframe(encoded_data)

    # `padding='max_length'` with explicit truncation replaces the deprecated
    # `pad_to_max_length=True`; every row is exactly `max_seq_len` ids long.
    encoded_data_test = tokenizer.batch_encode_plus(
        list(test_df.text.values),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=args['max_seq_len'],
        return_tensors='pt',
    )
    input_ids_test = encoded_data_test['input_ids']
    attention_masks_test = encoded_data_test['attention_mask']

    labels_test = torch.tensor(list(test_df.labels.values))

    dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
    # Sequential sampling keeps the predictions aligned with the row order of `test_df`.
    dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=args["batch_size"])

    return dataloader_test

In [None]:
# Per-vendor accuracy on the Dreams test split: build the loader, run the model, then
# compute the accuracy of every vendor class with `accuracy_per_class`.
dataloader_test = create_data_for_evaluation(test_dreams)
_, predictions, true_vals = evaluate_model(model, dataloader_test, device)
acc = accuracy_per_class(predictions, true_vals, vendor_to_idx_dict)



In [None]:
# Persist the per-vendor accuracies with the highest pickle protocol available.
results_file = Path('../results/acc_per_class/bert/test_dreams.pickle')
with results_file.open('wb') as handle:
    pickle.dump(acc, handle, protocol=pickle.HIGHEST_PROTOCOL)