In [3]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict

from tqdm import tqdm
tqdm.pandas()

import spacy
#import neuralcoref
assert spacy.__version__ == '2.1.0'

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##### To download correct Spacy & Neural Coref Version
```
pip uninstall spacy 
pip uninstall neuralcoref
pip install spacy==2.1.0 
pip install neuralcoref --no-binary neuralcoref
```

# Pre-Processing

In [6]:
new_rawdata1 = pd.read_csv('2018-JAN-June.csv', encoding = "utf-8", engine="python")
new_rawdata2 = pd.read_csv('2018-JULY-DEC.csv', encoding = "utf-8", engine="python")
new_rawdata3 = pd.read_csv('2019.csv', encoding = "utf-8", engine="python")
new_rawdata4 = pd.read_csv('2020.csv', encoding = "utf-8", engine="python")

In [7]:
new_rawData_all = new_rawdata1.append([new_rawdata2, new_rawdata3, new_rawdata4])
#new_rawData_all = new_rawdata3
new_rawdata = new_rawData_all.loc[((new_rawData_all['Country']=="USA")|
                                (new_rawData_all['Country']=="United Kingdom")|
                                (new_rawData_all['Country']=="Canada")|
                                (new_rawData_all['Country']=="South Africa")|
                                (new_rawData_all['Country']=="New Zealand")|
                                (new_rawData_all['Country']=="Ireland")|
                                (new_rawData_all['Country']=="Australia"))&\
                               (new_rawData_all['Category']=="10 - Troubleshooting*")]

In [8]:
rawdf = new_rawdata.loc[new_rawdata['Product Line'].isnull()==False][['Case Number','Severity','Priority','Country','Date/Time Opened','Product Line','Subject','Customer Request','Answer To Customer']].reset_index(drop=True)
print(rawdf.shape)
print(rawdf['Product Line'].unique())
rawdf.drop_duplicates(inplace=True)
rawdf.reset_index(inplace=True)

(2914, 9)
['PTCCB - MCCBS,EARTH LEAK&SWITC']


In [9]:
## Convert the Severity and Proirity values from Categorical to Ordinal 
rawdf.loc[rawdf['Priority'].isnull()==True,"PRIORITY"]=0
rawdf.loc[rawdf['Priority']=='Normal',"PRIORITY"]=1
rawdf.loc[rawdf['Priority']=='Serious',"PRIORITY"]=2
rawdf.loc[rawdf['Priority']=='High',"PRIORITY"]=3

rawdf.loc[rawdf['Severity'].isnull()==True,"SEVERITY"]=0
rawdf.loc[rawdf['Severity']=='Normal',"SEVERITY"]=1
rawdf.loc[rawdf['Severity']=='High Financial Impact',"SEVERITY"]=5
rawdf.loc[rawdf['Severity']=='Shutdown / Downtime in Operations',"SEVERITY"]=6
rawdf.loc[rawdf['Severity']=='Customer Relationship Risk',"SEVERITY"]=9
rawdf.loc[rawdf['Severity']=='Potential Safety Issue',"SEVERITY"]=10
rawdf.loc[rawdf['Severity']=='Potential cyber issue',"SEVERITY"]=10

In [10]:
rawdf['Date raw'] = pd.to_datetime(rawdf['Date/Time Opened'], format='%d/%m/%Y %I:%M %p')
rawdf['Date'] = pd.to_datetime(rawdf['Date raw'].dt.strftime('1/%m/%Y'), format='%d/%m/%Y')

# NER Training 
- NLP module train to recognize all SE reference numbers in a customer request
    - Begin with pre-trained Spacy Language Model (medium)

In [225]:
import en_core_web_md

nlp = en_core_web_md.load()

  return f(*args, **kwds)
100%|██████████| 40155833/40155833 [00:03<00:00, 13079957.33B/s]


<spacy.lang.en.English at 0x7f7390ef9668>

In [1042]:
nlp = spacy.load('./spacy_ner_may11')

### Training Functions

In [1053]:
from spacy.util import minibatch, compounding
from spacy.util import decaying
import random
import numpy as np

def custom_optimizer(optimizer, learn_rate=0.0001, beta1=0.9, beta2=0.999, eps=1e-8, L2=1e-6, max_grad_norm=1.0):
    """
    Function to customizer spaCy default optimizer
    """
    
    optimizer.learn_rate = learn_rate
    optimizer.beta1 = beta1
    optimizer.beta2 = beta2
    optimizer.eps = eps
    optimizer.L2 = L2
    optimizer.max_grad_norm = max_grad_norm
    
    return optimizer


def train_ner(TRAINING_DATA, EPOCHS, lr, log_every=100):
    optimizer = nlp.entity.create_optimizer()
    
    unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    # TRAINING THE MODEL
    with nlp.disable_pipes(*unaffected_pipes):
        
        optimizer = nlp.resume_training(component_cfg={"ner": {"conv_window": 3, "self_attn_depth": 2}})
        optimizer = custom_optimizer(optimizer, lr)
        
        dropout = decaying(0.6, 0.2, 1e-4)
        
        for i in range(EPOCHS):
            # shuufling examples  before every iteration
            random.shuffle(TRAINING_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAINING_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                            texts,  
                            annotations,  
                            drop=next(dropout),
                            sgd=optimizer,
                            losses=losses,
                        )
            if (i+1) % log_every == 0: print(f"Iteration {i+1} ==> Loss: ", losses['ner'])

### Unsupervised Dataset Gen
- Takes patterns and builds dataset from Customer Request
    - Enter keywords e.g. breaker, generator
    - Regex search for catalog #'s e.g. HOM120

In [1044]:
from spacy.matcher import Matcher

def build_ner_training(txt, pattern, coref=False):
    matcher = Matcher(nlp.vocab)
    
    for pattern in patterns:
        # set singular and plural patterns as well as base part catcher
        
        pattern1 = [{'DEP':'ROOT','OP':'?'},
                {'DEP':'meta','OP':'?'},
                {'DEP':'compound','OP':'?'},
                {'DEP':'nsubj','OP':'?'},
                {'LOWER': pattern}]
    
        pattern2 = [{'DEP':'ROOT','OP':'?'},
                {'DEP':'meta','OP':'?'},
                {'DEP':'compound','OP':'?'},
                {'DEP':'nsubj','OP':'?'},
                {'LOWER': pattern + 's'}]
        
        # general regex to capture prt nbr
        pattern3 = [{'DEP':'ROOT','OP':'?'},
                {'DEP':'meta','OP':'?'},
                {'DEP':'compound','OP':'?'},
                {'DEP':'nsubj','OP':'?'},
                {'TEXT': {'REGEX': '(\w*\d[\w\d]+)'}}]
        
    
        if len(pattern.split(' ')) == 2:
            # drop last element
            pattern1.pop()
            pattern2.pop()
            
            # add new multi word pattern
            pattern1.append({'LOWER': pattern.split(' ')[0]})
            pattern1.append({'LOWER': pattern.split(' ')[1]})
            pattern2.append({'LOWER': pattern.split(' ')[0]})
            pattern2.append({'LOWER': pattern.split(' ')[1] + 's'})
    
        matcher.add("PRODUCT", None, pattern1, pattern2, pattern3)
    
    TRAINING_DATA = []

    for i, doc in enumerate(nlp.pipe(txt)):
        
        # Resolve coreferences
        if coref:
            doc = nlp(doc._.coref_resolved)
        
        # Match on the doc and create a list of matched spans
        spans = [doc[start:end] for match_id, start, end in matcher(doc)]
        
        # Get (start character, end character, label) tuples of matches
        entities = [(span.start_char, span.end_char, "PRODUCT") for span in spans]
        
        # Format the matches as a (doc.text, entities) tuple
        training_example = (doc.text, {"entities": entities})
        
        # Append the example to the training data
        TRAINING_DATA.append(training_example)
        
    print(f'{len(TRAINING_DATA)} Examples Captured')
        
    return TRAINING_DATA

#### Generate unsupervised Training Data

In [1055]:
patterns = ['breaker', 'generator', 'trip unit']

flatten_list = lambda docs: [x for sublist in docs for x in sublist]

train_docs = []
for pat in patterns:
    #docs = [x for x in flatten_list(rawdf['CustReqCorpus']) if pat in x]
    docs = [clean_request(x) for x in rawdf['Customer Request'] if pat in x]
    train_docs.append(docs)
    
train_docs = flatten_list(train_docs)
    
TRAIN_DATA = build_ner_training(train_docs, patterns, coref=False)

1248 Examples Captured


In [1056]:
TRAIN_DATA[40]

('Your request has been received by your CIC Team and is currently under review Return Authorization Request Detailed Information Date PM EST March Requestor soltero Heroldo heroldo soltero vertivco com Account LIEBERT NORTH AMERICA INC Account Number PO Number LBR747876 Return Reason For return of Inoperative material Credit will be determined after receipt and inspection of material If unit is found to be out of warranty unit may be returned to you at your expense If found to be operative after testing charges may be invoiced to the Distributor Under the heading Return Product if Credit Denied if you choose YES the product will be returned to you at your expense If you choose NO the product will be scrapped by Schneider Electric Please provide detailed description of Observed Failure Application for each product listed Return Reason details Circuit breaker does not make the change of state RG No Line Catalog Product Return Product if Credit Denied Return Qty Original Order Requested 

In [1058]:
train_ner(TRAIN_DATA, 250, lr=0.00001, log_every=25)

Iteration 25 ==> Loss:  16136.99470347059
Iteration 50 ==> Loss:  10653.724902643924
Iteration 75 ==> Loss:  9216.664163781461
Iteration 100 ==> Loss:  9179.992964203926
Iteration 125 ==> Loss:  9647.478213392704
Iteration 150 ==> Loss:  9937.395929641778
Iteration 175 ==> Loss:  10657.21062855414
Iteration 200 ==> Loss:  10773.646825882599
Iteration 225 ==> Loss:  8691.716869966018
Iteration 250 ==> Loss:  8912.381784371002


## Train on annotated data
- Small sample dataset generated to fine-tune model further.
    - generate using ```spacy_annotator``` lib

In [125]:
import ast

TRAIN_DATA = pd.read_csv('annot_2019v1.csv')['annotations']
TRAIN_DATA = list(TRAIN_DATA.apply(lambda x: ast.literal_eval(x)))

In [126]:
TRAIN_DATA[5]

('keeps getting a fail on ground fault through second injection wants to set it up according to the coordination studies as far as he knows the breaker is set up according to the studies is there a chance that the setting may be conflicting with the tests Ig J 4 OFF Ir 1 8 Isd 6 1 OFF Ii 8',
 {'entities': [(138, 149, 'PRODUCT')]})

In [127]:
len(TRAIN_DATA)

61

#### Add Emal Garbage label to NER

In [1062]:
ner = nlp.get_pipe("ner")

add_ents = ['PRODUCT','EMAIL GARBAGE']

for ent in add_ents:
    if 'extra_labels' in ner.cfg and ent in ner.cfg['extra_labels']:
        pass
    else:
        ner.add_label(ent)
    
ner.cfg['extra_labels'] = add_ents

In [128]:
train_generator = (i for i in TRAIN_DATA)

In [130]:
instance = next(train_generator)
txt, ents = instance
inds = [x[:2] for x in list(ents.values())[0]]

print('EXAMPLE:\n', txt, '\n', '\nLABELS:')
for ex in inds:
    print(txt[ex[0]:ex[1]])

EXAMPLE:
 We have a CM3000 with a ecc21 but cannot change the IP from the display itself what would cause this is it a sign of a bad ethernet card  
 
LABELS:
CM3000 with a ecc21


In [None]:
nlp.begin_training()
train_ner(TRAIN_DATA, 1000, lr=0.001)

Iteration 100 ==> Loss:  785.9439191968231
Iteration 200 ==> Loss:  246.44435745156196
Iteration 300 ==> Loss:  61.73124403851792
Iteration 400 ==> Loss:  22.847422277885567
Iteration 500 ==> Loss:  10.302024816423266
Iteration 600 ==> Loss:  15.612178293243394
Iteration 700 ==> Loss:  20.15935285399992


In [None]:
nlp.to_disk('./spacy_ner_may17')
print('Saved NER Model')

## Evaluation

#### Load Saved Model

In [4]:
nlp = spacy.load('./spacy_ner_may11')
#neuralcoref.add_to_pipe(nlp)

In [11]:
sample_txts = (item for item in rawdf['Customer Request'].sample(1000))

In [16]:
doc = nlp(next(sample_txts))

spacy.displacy.render(doc, style='ent')

## Issue Finder
- Start from transformers bert-large fine tuned Q/A model

In [17]:
import transformers
transformers.__version__

'4.7.0'

In [18]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

## Customer Request Processing

In [19]:
def find_issue(txt, ent, question, verbose=True):
    
    query = question + ent + '?'
    
    inputs = tokenizer(query, txt, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    
    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    
    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
    
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    confidence_raw = answer_start_scores[0][answer_start] + answer_end_scores[0][answer_end]
    confidence_score = 1/(1 + np.exp(-confidence_raw.detach().numpy()))
    
    if verbose:
        print(f"Question: {question}")
        print(f"Answer: {answer}")
    return str(answer), confidence_score

In [22]:
flatten_list = lambda docs: [x for sublist in docs for x in sublist]
id_ents = lambda txt: [ent.text for ent in nlp(txt).ents if (ent.label_ == 'PRODUCT')]

def clean_request(txt):
    txt  = re.sub('[^A-Za-z0-9]+', ' ', txt)
    txt = re.sub(r'\b[0-9]+\b\s*', '', txt)
    return txt

def sentence_splitter(txt):
    sents = sent_tokenize(txt)
    sents = flatten_list([sent.split('\n') for sent in sents])
    return sents

def split_request(txt):
    """
    split sentences into chunks < 512
    characters to feed to qa relation
    extraction model.
    """
    sent_staging = []
    paragraph = []
    for sent in sentence_splitter(txt):
        sent = clean_request(sent)
        staged_len = sum([len(chunk) for chunk in sent_staging])
        if len(sent) + staged_len < 512:
            sent_staging.append(sent)
        else:
            paragraph.append('. '.join(sent_staging))
            if len(sent) < 512:
                sent_staging = [sent]
            
    paragraph.append('. '.join(sent_staging))
    return paragraph

def resolve_relations(txt, THRESHOLD=0.9):
    """
    loop through all relation/question pairs
    in the query dict to extract key relation
    values from request.
    Return all pairs w/ score greater than
    THRESHOLD value.
    """
    query_dict = { 
        'CONTEXT' : "What is the context of the ",
        'ISSUE'   : "What is the issue with the ",
        'LOCATION': "What is the location of the "
    }
    
    triplets = []
    ents = set(id_ents(txt))
    
    # need to move upstream as this gets 'PRODUCT' triples
    #  for every passage in large texts... to much noise
    #if len(ents) == 0: ents = ['PRODUCT']
        
    for ent in ents:
        for relation, question in query_dict.items():
            ans, score = find_issue(txt, ent, question, verbose=False)
            if score > THRESHOLD:
                triplets.append(((ent, relation, ans, score)))
    return triplets

def process_triplets(triplets):
    """
    loop through triplets append non-empty results
    choose highest confidence product, response 
    pair for all responses 
    """
    final_triplets = []
    answers = defaultdict(list)
    
    for trip in triplets:
        if trip[2] != '': answers[trip[2]].append(trip)
            
    # remove duplicate answers 
    for ans in answers.keys():
        final_triplets.append(sorted(
            answers[ans], key=lambda x: x[2],
            reverse=True)[0:3][0]
            )
        

    return final_triplets
    #return [trip[:3] for trip in final_triplets]
    
def process_request(txt):
    chunks = split_request(txt)
    if len(chunks) == 1:
        triplets = (resolve_relations(txt))
    else:
        triplets = flatten_list([resolve_relations(chunk) for chunk in chunks if chunk != ''])
    triplets = process_triplets(triplets)
    return triplets

In [23]:
doc = nlp(next(sample_txts))

spacy.displacy.render(doc, style='ent')
print(process_request(doc.text))

[('switchgear', 'ISSUE', 'not operating when pressed with voltage present', 0.9808951575498895), ('switchgear', 'LOCATION', 'whitechapel', 0.9691624712420791), ('NSX100', 'CONTEXT', 'tm d integral trip facility is not operating when pressed with voltage present', 0.9925898560999983), ('breakers', 'CONTEXT', 'it looks as if there may be a batch issue', 0.9475549028566288), ('breakers', 'ISSUE', 'a batch issue', 0.998371765172819)]


# Test on full dataset

In [24]:
new_rawdata1 = pd.read_csv('2018-JAN-June.csv', encoding = "utf-8", engine="python")
new_rawdata4 = pd.read_csv('2020.csv', encoding = "utf-8", engine="python")
new_rawdata3 = pd.read_csv('2019.csv', encoding = "utf-8", engine="python")
new_rawdata2 = pd.read_csv('2018-JULY-DEC.csv', encoding = "utf-8", engine="python")

In [25]:
new_rawData_all = new_rawdata1.append([new_rawdata2, new_rawdata3, new_rawdata4])
#new_rawData_all = new_rawdata4
new_rawdata = new_rawData_all.loc[((new_rawData_all['Country']=="USA")|
                                (new_rawData_all['Country']=="United Kingdom")|
                                (new_rawData_all['Country']=="Canada")|
                                (new_rawData_all['Country']=="South Africa")|
                                (new_rawData_all['Country']=="New Zealand")|
                                (new_rawData_all['Country']=="Ireland")|
                                (new_rawData_all['Country']=="Australia"))&\
                               (new_rawData_all['Category']=="10 - Troubleshooting*")]

In [26]:
rawdf = new_rawdata.loc[new_rawdata['Product Line'].isnull()==False][['Case Number','Severity','Priority','Country','Date/Time Opened','Product Line','Subject','Customer Request','Answer To Customer']].reset_index(drop=True)
print(rawdf.shape)
print(rawdf['Product Line'].unique())
rawdf.drop_duplicates(inplace=True)
rawdf.reset_index(inplace=True)

(2914, 9)
['PTCCB - MCCBS,EARTH LEAK&SWITC']


In [27]:
rawdf['triplets'] = rawdf['Customer Request'].progress_apply(process_request)

100%|██████████| 2451/2451 [26:48<00:00,  1.52it/s]  


In [28]:
rawdf['Date raw'] = pd.to_datetime(rawdf['Date/Time Opened'], format='%d/%m/%Y %I:%M %p')
rawdf['Date'] = pd.to_datetime(rawdf['Date raw'].dt.strftime('1/%m/%Y'), format='%d/%m/%Y')

In [29]:
rawdf.head()

Unnamed: 0,index,Case Number,Severity,Priority,Country,Date/Time Opened,Product Line,Subject,Customer Request,Answer To Customer,triplets,Date raw,Date
0,0,46181136,Normal,Normal,USA,10/2/2018 3:41 am,"PTCCB - MCCBS,EARTH LEAK&SWITC",PJA36040CU43CE1ACMOLV and EBX510,Name: David Rosell\nPhone: 7862264495\nConcern...,He had the trip unit door closed. With it ope...,"[(EBX510, ISSUE, has changed the trip unit, 0....",2018-02-10 03:41:00,2018-02-01
1,1,48164862,Normal,Normal,USA,24/4/2018 3:11 am,"PTCCB - MCCBS,EARTH LEAK&SWITC",RJF36160U44A having nuisance tripping,Name: Albert Sarkis\nPhone: (209) 744-1513\nCo...,This breaker appears to have a damaged di/dt b...,"[(RJF36160U44A, CONTEXT, having nuisance tripp...",2018-04-24 03:11:00,2018-04-01
2,2,46224073,Normal,Normal,USA,12/2/2018 8:22 pm,"PTCCB - MCCBS,EARTH LEAK&SWITC",TEX - Square D warranty claim,Please process the attached supplier recovery ...,Initiate TEX,[],2018-02-12 20:22:00,2018-02-01
3,3,46237499,Normal,Normal,USA,13/2/2018 2:37 am,"PTCCB - MCCBS,EARTH LEAK&SWITC",TEX Request for Test Report on InOperative Pro...,A Request for Test Report on InOperative Produ...,Initiate TEX,"[(LC36400 breaker, ISSUE, stuck in the closed ...",2018-02-13 02:37:00,2018-02-01
4,4,45184334,Normal,Normal,United Kingdom,3/1/2018 5:54 pm,"PTCCB - MCCBS,EARTH LEAK&SWITC",faulty - 10321742. 22177952,2301002465\nLV431629 x 1 Faulty\nFailed when t...,22177952,"[(LV431629, ISSUE, faulty failed when trying t...",2018-01-03 17:54:00,2018-01-01


In [30]:
rawdf[['Case Number', 'Date', 'Severity', 'Customer Request', 'triplets']].to_csv('6_21_validation.csv')

#### Generate Knowledge Graph Triples

In [953]:
triplets = np.asarray([trip for trip in rawdf['triplets'] if trip is not []])

In [957]:
triplet_df = pd.DataFrame(flatten_list(triplets), columns=['HEAD','RELATION','TAIL'])

In [None]:
np.savetxt('bFO_kg_triplet.tsv', np.asarray(triplet_df), delimiter='\t', fmt='%s')

### TODO
- NER Additional Tags
    - Product
    - Location
    - Contact
    - Organization
- **How to generate more training data**
    - Fine-Tune T5 module on previous samples
        - use model to generate more training data...
    - Manually build dataset
    - Better samples from Matcher f(x)