In [1]:
from transformers import pipeline

# NER MODEL
ner = pipeline(task="ner",model="vblagoje/bert-english-uncased-finetuned-pos")
result = ner("Dear Support Team,\nFirstly, could we get an idea on how long orders like mine (order ref# AA111B) generally take before their delivery status progresses further? Secondly, what exactly are stabilizers and why should one consider upgrading lubed variants?\nYour assistance will be much appreciated.")

In [3]:
for data in result:
    if data['entity'] == 'NUM':
        print(data)

{'entity': 'NUM', 'score': 0.765352, 'index': 23, 'word': '##11', 'start': 92, 'end': 94}
{'entity': 'NUM', 'score': 0.5837095, 'index': 24, 'word': '##1', 'start': 94, 'end': 95}


In [131]:
import pandas as pd

data_path = "../data/raw-dataset.xlsx"
dataset = pd.read_excel(data_path)
dataset.head()
# rename email column to question
dataset.rename(columns = {'email':'question'}, inplace = True)

In [132]:
wmo_filtered_dataset = dataset[dataset["category"] == "where_is_my_order"]
wmo_filtered_dataset.head()

Unnamed: 0,category,question
0,where_is_my_order,"Hello, just checking in on the status of my or..."
1,where_is_my_order,It's getting frustrating not knowing where my ...
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke..."
4,where_is_my_order,"Hello, I haven't received any updates regardin..."
6,where_is_my_order,Subject: me not find order where is keybord\n\...


In [133]:
len(wmo_filtered_dataset)

167

In [134]:
def split_text_to_word(text):
    return "x"

In [135]:
df_tokens = wmo_filtered_dataset['question'].str.split()
df_tokens.head()

0    [Hello,, just, checking, in, on, the, status, ...
1    [It's, getting, frustrating, not, knowing, whe...
3    [Hey,, I, recently, ordered, a, custom, mechan...
4    [Hello,, I, haven't, received, any, updates, r...
6    [Subject:, me, not, find, order, where, is, ke...
Name: question, dtype: object

In [136]:
dummy_data = pd.DataFrame({'category' : ['where_is_my_order'], 'question' : ['I have order with #123412 (123123) AASDAS #ASDAS']})
wmo_filtered_dataset = pd.concat([dummy_data,wmo_filtered_dataset],ignore_index=True)
wmo_filtered_dataset

Unnamed: 0,category,question
0,where_is_my_order,I have order with #123412 (123123) AASDAS #ASDAS
1,where_is_my_order,"Hello, just checking in on the status of my or..."
2,where_is_my_order,It's getting frustrating not knowing where my ...
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke..."
4,where_is_my_order,"Hello, I haven't received any updates regardin..."
...,...,...
163,where_is_my_order,"Dear Support,\nI ordered a custom mech keyboar..."
164,where_is_my_order,"Hi, I ordered a custom mechanical keyboard fro..."
165,where_is_my_order,"Dear Customer Support Team,\n\nI have recently..."
166,where_is_my_order,"Hello, just wondering when I should expect my ..."


In [137]:
wmo_filtered_dataset['tokens'] = wmo_filtered_dataset['question'].str.split()
wmo_filtered_dataset

Unnamed: 0,category,question,tokens
0,where_is_my_order,I have order with #123412 (123123) AASDAS #ASDAS,"[I, have, order, with, #123412, (123123), AASD..."
1,where_is_my_order,"Hello, just checking in on the status of my or...","[Hello,, just, checking, in, on, the, status, ..."
2,where_is_my_order,It's getting frustrating not knowing where my ...,"[It's, getting, frustrating, not, knowing, whe..."
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke...","[Hey,, I, recently, ordered, a, custom, mechan..."
4,where_is_my_order,"Hello, I haven't received any updates regardin...","[Hello,, I, haven't, received, any, updates, r..."
...,...,...,...
163,where_is_my_order,"Dear Support,\nI ordered a custom mech keyboar...","[Dear, Support,\nI, ordered, a, custom, mech, ..."
164,where_is_my_order,"Hi, I ordered a custom mechanical keyboard fro...","[Hi,, I, ordered, a, custom, mechanical, keybo..."
165,where_is_my_order,"Dear Customer Support Team,\n\nI have recently...","[Dear, Customer, Support, Team,, I, have, rece..."
166,where_is_my_order,"Hello, just wondering when I should expect my ...","[Hello,, just, wondering, when, I, should, exp..."


In [138]:
import re
regex_digit = r"\d"
regex_upper_case = r"^[A-Z]{4,}$"
hashtag = r"^#[A-Z]{4,}$"
def generate_tag_by_tokens(tokens:list)->list:
    res = []
    for token in tokens:
        # print(token)
        is_digit = re.search(regex_digit, token)
        is_upper_case = re.search(regex_upper_case, token)
        is_contains_hashtag = re.search(hashtag, token)
        if is_digit or is_upper_case or is_contains_hashtag:
            res.append(1)
        else:
            res.append(0)
    return res

In [139]:
wmo_filtered_dataset["ner_tags"] = wmo_filtered_dataset["tokens"].apply(lambda tokens: generate_tag_by_tokens(tokens))
wmo_filtered_dataset.head()

Unnamed: 0,category,question,tokens,ner_tags
0,where_is_my_order,I have order with #123412 (123123) AASDAS #ASDAS,"[I, have, order, with, #123412, (123123), AASD...","[0, 0, 0, 0, 1, 1, 1, 1]"
1,where_is_my_order,"Hello, just checking in on the status of my or...","[Hello,, just, checking, in, on, the, status, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,where_is_my_order,It's getting frustrating not knowing where my ...,"[It's, getting, frustrating, not, knowing, whe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke...","[Hey,, I, recently, ordered, a, custom, mechan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,where_is_my_order,"Hello, I haven't received any updates regardin...","[Hello,, I, haven't, received, any, updates, r...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [141]:
# save to excel
wmo_filtered_dataset.to_excel("../data/ner_dataset.xlsx", index=False)

In [143]:
# load dataset cleaned
cleaned_data_path = "../data/ner-dataset-cleaned.xlsx"
ner_dataset = pd.read_excel(cleaned_data_path)
ner_dataset.head()

Unnamed: 0,category,question,tokens,ner_tags
0,where_is_my_order,I have order with #123412 (123123) AASDAS #ASDAS,"['I', 'have', 'order', 'with', '#123412', '(12...","[0, 0, 0, 0, 1, 1, 1, 1]"
1,where_is_my_order,"Hello, just checking in on the status of my or...","['Hello,', 'just', 'checking', 'in', 'on', 'th...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,where_is_my_order,It's getting frustrating not knowing where my ...,"[""It's"", 'getting', 'frustrating', 'not', 'kno...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,where_is_my_order,"Hey, I recently ordered a custom mechanical ke...","['Hey,', 'I', 'recently', 'ordered', 'a', 'cus...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,where_is_my_order,"Hello, I haven't received any updates regardin...","['Hello,', 'I', ""haven't"", 'received', 'any', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [10]:
model_checkpoint = "distilbert-base-uncased"
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
tokenizer(wmo_filtered_dataset['email'][0])

{'input_ids': [101, 7592, 1010, 2074, 9361, 1999, 2006, 1996, 3570, 1997, 2026, 2344, 2004, 2009, 2038, 2042, 2058, 1037, 2733, 2144, 2026, 5309, 1998, 2145, 2053, 14409, 1012, 2115, 25732, 3433, 2052, 2022, 6551, 12315, 1012, 4283, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}