In [1]:
import re

import numpy as np 
import pandas as pd


In [4]:
# --- Notebook configuration -------------------------------------------------

# Location of the ticket dump, relative to the notebook's working directory.
DATA_PATH = "../data/all_tickets.csv"

# Name of the column used as the ticket priority.
PRIORITY_COLUMN = "urgency"

# Human-readable labels for the integer codes in the `ticket_type` column.
TICKET_TYPES = {0: "incident", 1: "request"}

# Human-readable labels for (a subset of) the integer codes in the `category`
# column. NOTE(review): the code -> label mapping is taken as given here;
# confirm against the dataset's documentation.
CATEGORIES = {
    0: "lacking information",
    6: "access",
    5: "hardware",
    4: "action",
    7: "location",
    11: "access card",
    8: "database",
    9: "purchase",
}

# Matches one or more consecutive non-word characters or digits.
# Written as a raw string: in a normal string literal "\W" and "\d" are
# invalid escape sequences, which emit a warning on current Python versions
# and are scheduled to become a syntax error.
DESCRIPTION_REGEX = re.compile(r"(\W|\d)+")

In [5]:
# Read the full ticket dump into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact
0,,hi since recruiter lead permission approve req...,1,4,2,21,71,3,4
1,connection with icon,icon dear please setup icon per icon engineers...,1,6,22,7,26,3,4
2,work experience user,work experience user hi work experience studen...,1,5,13,7,32,3,4
3,requesting for meeting,requesting meeting hi please help follow equip...,1,5,13,7,32,3,4
4,reset passwords for external accounts,re expire days hi ask help update passwords co...,1,4,2,76,4,3,4


In [6]:
# Number of tickets per category code, most frequent first.
df.category.value_counts()

4     34061
5      9634
6      2628
7       921
11      612
8       239
9       191
3       137
1        72
12       45
0         4
2         3
10        2
Name: category, dtype: int64

In [7]:
# Categories with fewer tickets than this are discarded.
MIN_CATEGORY_COUNT = 150

# Select categories by their actual label (the index of value_counts()).
# The previous version paired counts with their *position* via enumerate(),
# which only equals the category id when the ids are contiguous from 0 and
# would silently pick the wrong categories otherwise.
category_counts = df["category"].value_counts()
selected_categories = set(
    category_counts[category_counts >= MIN_CATEGORY_COUNT].index.tolist()
)

# Build the cleaned frame in a single chain so no column is assigned on a
# filtered slice (avoids SettingWithCopyWarning):
#   1. keep only tickets from the selected categories,
#   2. drop the columns that are not used further in this notebook,
#   3. replace missing titles with an empty string (such rows are kept).
df = (
    df[df["category"].isin(selected_categories)]
    .drop(columns=["sub_category1", "sub_category2", "business_service"])
    .assign(title=lambda frame: frame["title"].fillna(""))
)
df.head()

Unnamed: 0,title,body,ticket_type,category,urgency,impact
0,,hi since recruiter lead permission approve req...,1,4,3,4
1,connection with icon,icon dear please setup icon per icon engineers...,1,6,3,4
2,work experience user,work experience user hi work experience studen...,1,5,3,4
3,requesting for meeting,requesting meeting hi please help follow equip...,1,5,3,4
4,reset passwords for external accounts,re expire days hi ask help update passwords co...,1,4,3,4


In [8]:
import summa

# Pick the five longest ticket bodies (by character count) as sample texts
# for comparing keyword extractors; the longest one ends up last.
bodies_by_length = df["body"].sort_values(key=lambda col: col.map(len))
longest = bodies_by_length.values[-5:]

# TextRank-based keywords from summa, requesting 5 words per text.
for position, text in enumerate(longest, start=1):
    textrank_keywords = summa.keywords.keywords(text, words=5).split("\n")
    print("Keywords: ", str(position), "\n", textrank_keywords)


Keywords:  1 
Keywords:  2 
 ['change switch', 'changed', 'issue', 'stack issues', 'standby']
Keywords:  3 
 ['implemented', 'change', 'changes', 'july', 'date', 'attachments']
Keywords:  4 
 ['working', 'work', 'signed', 'signing', 'sign', 'thank', 'thanks', 'agreed', 'agree', 'entities', 'entity']
Keywords:  5 
 ['connect', 'connected', 'thanks', 'hi', 'attached', 'attachments', 'change']


In [9]:
from yake import KeywordExtractor

# YAKE extractor: English, single-word keywords, five results per text.
kw_extractor = KeywordExtractor(lan="en", n=1, top=5)

for position, text in enumerate(longest, start=1):
    # extract_keywords yields (keyword, score) pairs; only the keyword is shown.
    kw = [term for term, _score in kw_extractor.extract_keywords(text=text)]
    print("Keywords of article", str(position), "\n", kw)

Keywords of article 1 
 ['availability', 'capture', 'err', 'replica', 'secondary']
Keywords of article 2 
 ['interface', 'changed', 'link', 'protocol', 'switch']
Keywords of article 3 
 ['attachments', 'recipient', 'information', 'implemented', 'accept']
Keywords of article 4 
 ['section', 'description', 'revisions', 'december', 'germany']
Keywords of article 5 
 ['attachments', 'information', 'recipient', 'accept', 'intended']


In [10]:
from keybert import KeyBERT

# KeyBERT backed by the 'distilbert-base-nli-mean-tokens' sentence embedder.
bertkw = KeyBERT("distilbert-base-nli-mean-tokens")

# Shared extraction settings: three-word phrases, English stop words removed,
# MMR re-ranking with a diversity of 0.75.
keybert_options = dict(
    keyphrase_ngram_range=(3, 3),
    stop_words="english",
    use_mmr=True,
    diversity=0.75,
)

for position, text in enumerate(longest, start=1):
    keywords = bertkw.extract_keywords(text, **keybert_options)
    print("Keywords", str(position), "\n", keywords)

Keywords 1 
 [('thank engineer thursday', 0.5444), ('poll running primary', 0.194), ('high memory cpu', 0.3095), ('extensions recent upgrade', 0.4304), ('boring technical details', 0.0174)]
Keywords 2 
 [('engineer ext tuesday', 0.5558), ('lot users best', 0.058), ('slot protocol interface', 0.1952), ('reason failure destroyed', 0.1375), ('confirmation election elect', 0.3319)]
Keywords 3 
 [('mob july change', 0.4547), ('award winning portfolio', 0.2323), ('effort free viruses', 0.2795), ('registered address canada', 0.2263), ('relationship manager wales', 0.1634)]
Keywords 4 
 [('engineer wednesday pm', 0.5429), ('hi excited looking', 0.0559), ('destroyed signed contracts', 0.388), ('wife expecting child', 0.3044), ('best counsel germany', 0.2621)]
Keywords 5 
 [('upgrades replacements definitely', 0.5288), ('st december october', 0.3877), ('filtering user traffic', 0.3009), ('largest multimedia publishers', 0.1482), ('attachments strictly prohibited', 0.0354)]
