In [41]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

In [42]:
initialize = True

In [43]:
# lemmatization
if initialize:
    nltk.download('words')

[nltk_data] Downloading package words to /ifxhome/manna/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [44]:
tickets = pd.read_excel('Ticket Details.xlsx', header=1 , sheet_name = "Sheet 1", names=['Ticketid', 'Calendar Week', 'Ticket Created Time', 'Priority', 'Site', 'Problem Description', 'General Category', 'Sub Category', 'Resolution', 'Customer Department', 'Predicted General Category', 'Predicted Sub Category'])
#tickets.columns = tickets.columns.str.strip()
tickets['Input'] = np.nan
tickets['Lot Number'] = np.nan
tickets = tickets.fillna('')
print(tickets.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9581 entries, 0 to 9580
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Ticketid                    9581 non-null   object
 1   Calendar Week               9581 non-null   object
 2   Ticket Created Time         9581 non-null   object
 3   Priority                    9581 non-null   object
 4   Site                        9581 non-null   object
 5   Problem Description         9581 non-null   object
 6   General Category            9581 non-null   object
 7   Sub Category                9581 non-null   object
 8   Resolution                  9581 non-null   object
 9   Customer Department         9581 non-null   object
 10  Predicted General Category  9581 non-null   object
 11  Predicted Sub Category      9581 non-null   object
 12  Input                       9581 non-null   object
 13  Lot Number                  9581 non-null   obje

In [45]:
corpus_words = words.words()
valid_words = ['dn', 'alf', 'jcbe', 'stms']
corpus_words.append(valid_words)

In [46]:
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()

In [47]:
print(valid_words)

['dn', 'alf', 'jcbe', 'stms']


In [48]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.append([r'infineon',
    r'com'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /ifxhome/manna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# stop words but phrases, used to remove pointless words, unnecessary if TF-IDF is used
remove_phrases = [r'(1)',
    r'(2)',
    r'(3)',
    r'(4)',
    r'(5)',
    r'(6)',
    r'(7)',
    r'(8)',
    r'(9)',
    r'(10)',
    r'(11)',
    r'problem description on error*: (including screenshot of error)',
    r'site*: mark "x" for user site',
    r'[ x ]',
    r'[  ] bth',
    r'[  ] mkz plt',
    r'[  ] wux cc',
    r'[  ] tij',
    r'[  ] sin',
    r'[  ] mkz scc',
    r'[  ] wux ds',
    r'[  ] rbg',
    r'[  ] mkz pla',
    r'[  ] wux hps',
    r'[  ] cjj',
    #r'Lot Number:',
    #r'equipments affected:',
    r'equipment/pc name:',
    r'camstar server*:',
    r'https://faqstorage.infineon.com/knowledgebasearticle125028.aspx',
    r'integration application: mark "x" for other application', 
    r'[  ] awi',
    r'[  ] gpn',
    r'[  ] xmes',
    r'[  ] ddm',
    r'[  ] stms',
    r'[  ] xtest ui',
    r'[  ] eaf',
    r'[  ] workstream',
    r'printer:',
    r'https://faqstorage.infineon.com/knowledgebasearticle125029.aspx',
    r'referred to faq*:',
    r'affected area contact number*:',
    r'alt contact person name (compulsory for critical)*:', 
    r'alt contact person number (compulsory for critical):',
    r'* mandatory fields.',
    r'contactnumber:',
    r'affected area contact number:',
    r'requestor department:',
    r'problem description:']

In [50]:
# this is the regex input to clean up the template for non words
regex_remove_punctuations = r'[~`!@#\$%\^&\*\(\)_\+\-\=\{\}\|\[\]\\:;"\'<>\?,\.\/]'

In [51]:
def spelling_correction(problem_description, resolution):
    corrected_text = ""
    input_words = word_tokenize(problem_description.lower() + ' ' + resolution.lower())
    for input_word in input_words:
        '''
        if input_word not in valid_words or input_word not in corpus_words:
            corrected_word = [(jaccard_distance(set(ngrams(input_word, 2)), set(ngrams(corpus_word, 2))), corpus_word) \
                    for corpus_word in corpus_words if corpus_word[0] == input_word[0]]
            corrected_text += corrected_word[0]
        '''
    return corrected_text

In [52]:
def clean_extract(problem_description, resolution):
    text = problem_description.lower() + ' ' + resolution.lower()
    # replace newline and tab with whitespace
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    # these 2 lines remove all the common templates, by right they should not matter if fed into TF-IDF
    for substring in remove_phrases:
        text = text.replace(substring, ' ')
    # remove punctuations using regex
    text = re.sub(regex_remove_punctuations, ' ', text)
    # remove duplicate white space
    text = re.sub(' +', ' ', text)
    text_list = word_tokenize(text)
    # remove stop words
    text_list = [lem.lemmatize(word) for word in text_list]
    text_list = [word for word in text_list if not word in stop_words]
    corrected_text = []
    '''
    for input_word in text_list:
        if input_word not in valid_words or input_word not in corpus_words:
            corrected_word = [(jaccard_distance(set(ngrams(input_word, 2)), set(ngrams(corpus_word, 2))), corpus_word) \
                    for corpus_word in corpus_words if corpus_word[0] == input_word[0]]
            try:
                corrected_text.append(corrected_word[0])
            except:
                corrected_text.append(input_word)
                '''
    text = TreebankWordDetokenizer().detokenize(text_list)
    # separate lot number for RPA purposes
    try:
        #text = re.sub(r'(?=product).*(?=equipment affected)', '', text)
        lot_number = re.search(r'(?<=lot number ).*(?= equipment affected)', text).group(0).upper()
        #text = re.sub(r'(lot number ).*( equipment affected)', '', text)
    except:
        lot_number = np.nan
    return pd.Series([text, lot_number])

In [53]:
tickets[['Input', 'Lot Number']] = tickets.apply(lambda row: clean_extract(row['Problem Description'], row['Resolution']), axis=1)

In [54]:
# these 9 examples are the best for button greyed out, look at all the different phrasings
print(tickets['Ticketid'][1])
print(tickets['Input'][1])
print(tickets['Lot Number'][1])
# print()
# print(tickets['Ticketid'][135])
# print(tickets['Input'][135])
# print(tickets['Lot Number'][135])
# print()
# print(tickets['Ticketid'][136])
# print(tickets['Input'][136])
# print(tickets['Lot Number'][136])
# print()
# print(tickets['Ticketid'][234])
# print(tickets['Input'][234])
# print(tickets['Lot Number'][234])
# print()
# print(tickets['Ticketid'][235])
# print(tickets['Input'][235])
# print(tickets['Lot Number'][235])
# print()
# print(tickets['Ticketid'][236])
# print(tickets['Input'][236])
# print(tickets['Lot Number'][236])
# print()
# print(tickets['Ticketid'][5134])
# print(tickets['Input'][5134])
# print(tickets['Lot Number'][5134])
# print()
# print(tickets['Ticketid'][5135])
# print(tickets['Input'][5135])
# print(tickets['Lot Number'][5135])
# print()
# print(tickets['Ticketid'][5136])
# print(tickets['Input'][5136])
# print(tickets['Lot Number'][5136])
# print()

INC000004851509
error trackout 0 camstar lotsplitbyitems splitlot e0100 za942324m0j total ndpw item qty doe match lot qty lot number za942324m0j equipment affected stp03 6062515654 iscw8cc938477d infineon com 172 21 97 9 infineon alismoha mkz investigation using script see many total item qty v lot qty select sum ndpw lotwafers containerid select containerid container containername za942324m0j match lot qty sum item qty bypass singulate
ZA942324M0J


In [77]:
tickets[tickets['Ticketid']=="INC000004851282"]

Unnamed: 0,Ticketid,Calendar Week,Ticket Created Time,Priority,Site,Problem Description,General Category,Sub Category,Resolution,Customer Department,Predicted General Category,Predicted Sub Category,Input,Lot Number


In [78]:
tickets['Problem Description'][0]

'(1) Problem description on error*: (Including screenshot of error) \nThe FAJob : ZA940359M0XSPAV0420200908134433329 does not belong to this Equipment: SPTV01\n\n(2) Site*: Mark "x" for user site\n\n[  ] BTH \t\t\t[  ] MKZ PLT\t\t[  ] WUX CC\t\t[  ] TIJ\n[  ] SIN \t\t\t[X] MKZ SCC\t        [  ] WUX DS\t\t[  ] RBG \t\t\n[  ] MKZ PLA\t\t[  ] WUX HPS\t\t[  ] CJJ     \n\n(3) Lot Number: ZA940359M0X\n\n(4) Equipments affected:  SPTV01\n\n(5) Equipment/PC name:ISCN5CG0289G0D\n\n(6) Camstar Server*:\nhttps://faqstorage.infineon.com/KnowledgebaseArticle125028.aspx\n\n(7) Integration application: Mark "x" for other application\n \n[  ] AWI\t\t\t[  ] GPN\t\t\t[  ] XMES  \t   \n[  ] DDM\t\t\t[  ] StMS\t\t        [  ] XTEST UI  \n[  ] EAF\t\t\t[  ] Workstream  \t  \n \n(8) Printer:\nhttps://faqstorage.infineon.com/KnowledgebaseArticle125029.aspx\n\n(9) Referred to FAQ*: no faq found\n\n(10)Affected area contact number*:  +0172907259 / +60 6 251 8844\n\n* Mandatory fields.  ISCN5CG0289G0D.infineon.

In [57]:
# !pip install gensim

In [72]:
import gensim
def textrank(corpus, ratio=0.2):    
    if type(corpus) is str:        
       corpus = [corpus]    
    lst_summaries = [gensim.summarization.summarize(txt,  
                     ratio=ratio) for txt in corpus]    
    return lst_summaries

## Apply the function to corpus
predicted = textrank(corpus=text, ratio=0.2)
predicted

ValueError: input must have more than one sentence

In [74]:
# def clean_extrpact(problem_description, resolution):
text = tickets['Problem Description'][0].lower()
# replace newline and tab with whitespace
text = text.replace('\n', ' ')
text = text.replace('\t', ' ')
# these 2 lines remove all the common templates, by right they should not matter if fed into TF-IDF
for substring in remove_phrases:
    text = text.replace(substring, ' ')
# remove punctuations using regex
text = re.sub(regex_remove_punctuations, ' ', text)
# remove duplicate white space
text = re.sub(' +', ' ', text)
text_list = word_tokenize(text)
# remove stop words
text_list = [lem.lemmatize(word) for word in text_list]
text_list = [word for word in text_list if not word in stop_words]
# corrected_text = []
'''
for input_word in text_list:
    if input_word not in valid_words or input_word not in corpus_words:
        corrected_word = [(jaccard_distance(set(ngrams(input_word, 2)), set(ngrams(corpus_word, 2))), corpus_word) \
                for corpus_word in corpus_words if corpus_word[0] == input_word[0]]
        try:
            corrected_text.append(corrected_word[0])
        except:
            corrected_text.append(input_word)
            '''
text = TreebankWordDetokenizer().detokenize(text_list)
# separate lot number for RPA purposes
# try:
    #text = re.sub(r'(?=product).*(?=equipment affected)', '', text)
#         lot_number = re.search(r'(?<=lot number ).*(?= equipment affected)', text).group(0).upper()
    #text = re.sub(r'(lot number ).*( equipment affected)', '', text)
# except:
#     lot_number = np.nan
#     return pd.Series([text, lot_number])
text

'fajob za940359m0xspav0420200908134433329 doe belong equipment sptv01 x mkz scc lot number za940359m0x equipment affected sptv01 iscn5cg0289g0d faq found 0172907259 60 6 251 8844 iscn5cg0289g0d infineon com 10 245 8 198 infineon tanjinsu mkz dc g r01 ge262'

In [69]:
nlp=spacy.load("en_core_web_sm")

In [70]:
phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = ['button','grey']
patterns = [nlp(i) for i in text]
phrase_matcher.add('button',None, *patterns)

doc = nlp(text[:100])

for entity in doc.ents:
    print(entity.label_, '|', entity.text)

CARDINAL | 20375
CARDINAL | 203


In [73]:
text[:100]

'move lot strip map total qty 20375 doe match lot qty 20379 user request camstar admin adjust qty 203'