In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from ast import literal_eval
from tensorflow.random import set_seed

# Use one shared seed value for both the TensorFlow and the NumPy global
# random number generators.
RANDOM_SEED = 0
set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Read the action vocabulary table and the pickled transactions frame, then
# print the frame's (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
action_vocab = pd.read_csv("vocab.csv")
df = pd.read_pickle("Fraud Detection with Natural Language Processing.pkl")
print("dataset shape: ", df.shape)

# One row has a truncated `times` string (it does not end with the closing
# "]"), so it cannot be parsed into a list further down; drop it.
# The mask is computed once and reused for both the sanity checks and the
# filter. `na=False` makes a missing or non-string value count as broken
# instead of raising.
has_closed_times = df["times"].str.endswith("]", na=False)
broken_times = df[~has_closed_times]

# Sanity checks: exactly one broken row is expected, and it must not be a
# fraud case (so no positive example is lost by discarding it).
assert broken_times.shape[0] == 1
assert broken_times.iloc[0].is_fraud == 0

# Keep only the well-formed rows. The explicit copy makes `df` an independent
# frame, so the column assignments that follow do not write into a slice of
# the original frame (which triggers pandas' SettingWithCopyWarning).
df = df[has_closed_times].copy()

# Lookup tables between the action names (the `Name` column of the
# vocabulary table) and their positional index, stored as a string, in both
# directions.
action_names = action_vocab["Name"].to_list()
action_ids = [str(position) for position in range(len(action_names))]
id_to_action = dict(zip(action_ids, action_names))
action_to_id = dict(zip(action_names, action_ids))

# Derive per-transaction feature columns from the stringified `actions` and
# `times` columns. Columns are added in a fixed order.


def _parse_scaled_times(raw):
    """Parse a stringified list of numbers and divide every entry by 1000."""
    return [value / 1000 for value in literal_eval(raw)]


def _fraud_label(flag):
    """Map a truthy fraud flag to "Fraud" and a falsy one to "Non Fraud"."""
    if flag:
        return "Fraud"
    return "Non Fraud"


def _second_timestamp(stamps):
    """Return the entry at index 1, or 0 when there are fewer than two."""
    if len(stamps) > 1:
        return stamps[1]
    return 0


def _action_sentence(groups):
    """Join the name of the first id of every non-empty group with spaces."""
    return " ".join(
        id_to_action[str(group[0])] for group in groups if len(group) > 0
    )


def _flatten(groups):
    """Concatenate a list of lists into a single flat list."""
    flat = []
    for group in groups:
        flat.extend(group)
    return flat


# Both columns arrive as strings; turn them into Python lists first.
df["actions"] = df["actions"].apply(literal_eval)
df["times"] = df["times"].apply(_parse_scaled_times)

# Summary statistics over each row's (scaled) time list.
df["Action time mean"] = df["times"].apply(np.mean)
df["Action time std"] = df["times"].apply(np.std)

df["log(amount)"] = df["Amount"].apply(np.log)
df["Transaction Type"] = df["is_fraud"].apply(_fraud_label)
df["time_to_first_action"] = df["times"].apply(_second_timestamp)

# Space-separated action names, built while `actions` is still nested.
df["actions_str"] = df["actions"].apply(_action_sentence)

# NOTE(review): `times` was already divided by 1000 above, so this total is
# scaled by 1000 a second time relative to the mean/std columns -- confirm
# the intended unit.
df["total_time_to_transaction"] = df["times"].apply(sum) / 1000

# Finally collapse the nested action groups into one flat list per row.
df["actions"] = df["actions"].apply(_flatten)
df.head(2)