### Jupyter Settings

In [None]:
import warnings

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

In [None]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell (see the IPython autoreload docs).
%autoreload 2

#### Read data

In [None]:
import pandas as pd

# Path to the CSIC CSV file, relative to the notebook's working directory.
DATASET = '../data/CSIC/csic-for-extractor.csv'

# Read both columns as strings so the '99999' code stays textual.
df = pd.read_csv(
    DATASET,
    sep=',',
    dtype={'text': str, 'type': str},
    low_memory=False,
)

# Collapse the raw `type` codes into two classes: '99999' becomes 'normal',
# everything else (including missing values) becomes 'malicious'.
is_normal = df['type'] == '99999'
df.loc[~is_normal, 'type'] = 'malicious'
df.loc[is_normal, 'type'] = 'normal'

# The rest of the notebook refers to the label column as `target`.
df.rename(columns={'type': 'target'}, inplace=True)

In [None]:
# Shape (rows, columns) of the subset labelled 'malicious'.
df.loc[df['target'] == 'malicious'].shape

In [None]:
# Shape (rows, columns) of the subset labelled 'normal'.
df.loc[df['target'] == 'normal'].shape

In [None]:
# Preview the first 10 rows of the labelled frame.
df.head(10)

In [None]:
import sys

# Make modules in the parent directory importable. The membership guard keeps
# sys.path free of duplicate ".." entries when this cell is re-run.
if ".." not in sys.path:
    sys.path.append("..")

#### Train model

In [None]:
import lime
from tpe_model import text_preprocess, text_model_generator

# text_preprocess returns the processed frame together with a label mapping.
df, label_map = text_preprocess(df)
print(label_map)

# Build the generator from the processed frame, then obtain the model from
# its model_trainer() method.
tmg = text_model_generator(df)
model = tmg.model_trainer()

#### Explain single sample

In [None]:
from tpe_core import get_instance_explained
from lime.lime_text import LimeTextExplainer

# Optional: load a previously saved model instead of the one trained above.
# WARNING: the pickle module is not secure; only unpickle data you trust.
# Reference: https://docs.python.org/3/library/pickle.html
# import pickle
# with open("model.test", 'rb') as f:
#     model = pickle.load(f, encoding='bytes')

labels = [*label_map.values()]

# Explain sample 30633, passing 'malicious' as the label argument
# (presumably a row of df known to be malicious -- verify against tpe_core).
get_instance_explained(df, 30633, model, label_map, 'malicious')

In [None]:
# Explain sample 0, passing 'normal' as the label argument
# (argument semantics are defined in tpe_core -- presumably the reference label).
get_instance_explained(df, 0, model, label_map, 'normal')

#### Generate signature rules in batch and verify them

In [None]:
from tpe_rule_validation import rule_matching_evaluation

# Run the batch rule generation/validation helper. It returns three objects:
# the match results, the rules still to be validated, and the matched rules.
match_result, rules_tobe_validate, matched_rules = rule_matching_evaluation(
    df,
    seed_num=2000,
    rein_num=2000,
    eval_num=1000,
    model=model,
    label_map=label_map,
    refer_label='malicious',
    lime_flag=True,
    scan_flag=True,
    content_direction='backward',
    xcol_name='text',
    n_cores=20,
)

In [None]:
# Show one matched rule together with the samples it matched.

# Widen the column display before anything is printed, so that long rule
# strings below are not truncated at pandas' default column width on the
# first run of this cell.
pd.options.display.max_colwidth = 1000

print('A match case...')
rule_index = 1  # positional index into matched_rules
rule_num = matched_rules.index[rule_index]
print('rule_num is %d' % rule_num)
print(matched_rules.loc[[rule_num], 'rule_strings'])
print('----------------------------------------------------------------------')

# Rows of match_result that belong to the selected rule (filtered once).
rule_matches = match_result.loc[match_result['rule_num'] == rule_num]
print(rule_matches['text'])
print('Total matched number %d' % rule_matches.shape[0])

In [None]:
# Number of rows in matched_rules.
len(matched_rules)

### Backups

#### Generate LIME rules in batch

In [None]:
from tpe_core import get_rules

# Draw a reproducible sample of 1000 rows whose target equals the encoded
# 'malicious' label.
df_malicious = df.loc[df['target'] == label_map['malicious']].sample(n=1000, random_state=1)
df_malicious  # no visible effect here: Jupyter only displays a cell's last expression

rules_seed = get_rules(df_malicious, model, label_map, 'malicious', scan_flag=False)

In [None]:
# Display the rules returned by get_rules for the malicious sample.
rules_seed