In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def load_model(model_name, model_path):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()
    return model, device

def load_tokenizer(model_name):
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
    return tokenizer

def predict_label(model, tokenizer, device, new_string):
    inputs = tokenizer(new_string, return_tensors="pt", max_length=13, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predicted_label = torch.argmax(logits, dim=1).item()
    return predicted_label

model_name = 'bert-base-cased'
model_path = 'bert_model.pth'
model, device = load_model(model_name, model_path)
tokenizer = load_tokenizer(model_name)

strings = ["amazon.com 123456","amazon.com ref 123456","amazon.com ref france","amazon.com ref s6g5hr", "amazon.com ref france1","paypal * alessandro","paypal * zara","booking.com","booking.com rf GHB234","booking.com ref GHB234","paypal * zara ref","paypal * 123456 ref","paypal * 123456"]

for new_string in strings:
    predicted_label = predict_label(model, tokenizer, device, new_string)
    print("String ",new_string, " - Predicted label:", predicted_label)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

String  amazon.com 123456  - Predicted label: 1
String  amazon.com ref 123456  - Predicted label: 1
String  amazon.com ref france  - Predicted label: 0
String  amazon.com ref s6g5hr  - Predicted label: 1
String  amazon.com ref france1  - Predicted label: 0
String  paypal * alessandro  - Predicted label: 0
String  paypal * zara  - Predicted label: 0
String  booking.com  - Predicted label: 0
String  booking.com rf GHB234  - Predicted label: 0
String  booking.com ref GHB234  - Predicted label: 1
String  paypal * zara ref  - Predicted label: 0
String  paypal * 123456 ref  - Predicted label: 1
String  paypal * 123456  - Predicted label: 1


In [7]:
df = pd.read_csv('maple.csv')
df

Unnamed: 0,post_mrt_clean_transaction_string
0,0000nyctaxi7m41 brooklyn ny
1,07958 436 109 bicester ox26 gbr
2,1 new parade bournemout gbr
3,1 shore street ulullapool gbr
4,1 spring road clclacton-on gbr
...,...
1725210,zin london gbr
1725211,"zipp mobility ltd belfield, dub irl"
1725212,znanje d.o.o. 21000 split hrv
1725213,zoo sushi bubbletea stratford-upo gbr


In [8]:
from tqdm import tqdm
# tqdm for pandas
tqdm.pandas()

In [9]:
df['pii'] = df['post_mrt_clean_transaction_string'].progress_apply(lambda x: predict_label(model, tokenizer, device, x))

100%|██████████| 1725215/1725215 [5:21:08<00:00, 89.54it/s]   


In [10]:
df.value_counts('pii')

pii
0    1671315
1      53900
dtype: int64

In [12]:
df.to_csv('maple_pii.csv', index=False)

In [13]:
df

Unnamed: 0,post_mrt_clean_transaction_string,pii
0,0000nyctaxi7m41 brooklyn ny,1
1,07958 436 109 bicester ox26 gbr,0
2,1 new parade bournemout gbr,0
3,1 shore street ulullapool gbr,0
4,1 spring road clclacton-on gbr,0
...,...,...
1725210,zin london gbr,0
1725211,"zipp mobility ltd belfield, dub irl",0
1725212,znanje d.o.o. 21000 split hrv,1
1725213,zoo sushi bubbletea stratford-upo gbr,0


In [23]:
df_rules = pd.read_excel('pull_request_02_04_2024.xlsx')
df_rules = df_rules[df_rules['WordToCheck'].notnull() | df_rules['Pattern'].notnull()]
rules = '|'.join(df_rules['Pattern'].to_list())
rules

'party\\sbritain|\\byorkshire\\sparty\\b|north\\seast\\spart|\\bgwlad\\b|churches\\stogether|islamophobia|\\bUnison\\b|\\bchristian\\b|(?!.*118118.*)[0-9\\*]{6,}[a-zA-Z0-9]{0,}|^\\s(m(?!(\\sand|\\s&|\\smotos|\\ss(?:pencer)?\\s))|mm(?!(\\se&))|mlle|mme)\\s(?:(?:[a-z]+ )+[a-z]+)|(?<=kindle\\s[a-z]{5})[a-z0-9\\*]{5,}|(?<=metapay\\s)[a-z0-9]{10}|(?<=uber\\s)[a-z0-9]{5}\\b|(?<=hillcrest_far\\s)[a-z0-9]{8}\\b|(?<=tuckers_grave\\s)[a-z0-9]{8}\\b|[a-z0-9]{8}(?=\\schance\\s)|[a-z0-9]{8}(?=\\stipsport\\.net)|(?<=riotgam\\*lol)\\s*[a-z0-9]{12}\\b|(?<=coins)\\s?\\+?[a-z0-9]{12}\\b|(?<=betsson\\*)[a-z0-9]{11}\\b|(?<=vertrieb\\sgmbh)\\s*[a-z0-9]{6}\\b|(?<!primaprix\\s)\\bdr[\\s\\.]+(?!martens |slot |juchheim |foodst |max |noodles |mobile|frankenstein|eckert )(?:\\*{5}\\s)?[a-z]{3,}(?:\\s[a-z]{3,})?|(?<=UBER)(?!\\s?TRIP|\\s?EAT|\\s?\\.c|fone)[^a-zA-Z0-9]{0,5}[a-zA-Z0-9]{3,}[^a-zA-Z0-9]|(?<=deezer\\s)[a-z0-9]{5,13}|ebay\\s?sale|toys\\s?r\\s?us|love\\s?crafts|\\bEvri\\b|\\bsmarty\\b|\\bZynga\\b|Bigfish

In [26]:
df['mrt_matches'] = np.where(df['post_mrt_clean_transaction_string'].str.contains(rf'{rules}'), 1, 0)

  df['mrt_matches'] = np.where(df['post_mrt_clean_transaction_string'].str.contains(rf'{rules}'), 1, 0)


In [28]:
df.value_counts('mrt_matches')

mrt_matches
0    1713160
1      12055
dtype: int64

In [42]:
round((df[df['mrt_matches']==1].shape[0] / df[df['pii']==1].shape[0])*100,2)

22.37

In [43]:
round((df[df['pii']==1].shape[0]/df.shape[0])*100,2)

3.12

In [44]:
round((df[df['mrt_matches']==1].shape[0]/df.shape[0])*100,2)

0.7

In [46]:
df.to_csv('maple_pii_mrt.csv', index=False)

In [52]:
df[(df['mrt_matches']==1) & (df['pii']==0)]

Unnamed: 0,post_mrt_clean_transaction_string,pii,mrt_matches
20,1622-0227-sup.opencor nueva andaluc esp,0,1
78,9246-0384-sup.ex. beni benidorm esp,0,1
969,db vertrieb gmbh lgu769 deu,0,1
2201,metapay 6wsjymke32 dublin irl,0,1
4681,bandcamp tufstuf recor 402-935-7733 ca,0,1
...,...,...,...
1723229,bandcamp lewis jack in 402-935-7733 ca,0,1
1723747,garden cafe 0127 373 5187 gbr,0,1
1724860,tgi friday's stockton on t gbr,0,1
1724861,tgi fridays braintree 689 gbr,0,1
