In [1]:
import pandas as pd
import numpy as np
import sys, os
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,LancasterStemmer
from nltk.corpus import stopwords
import logging
import string
nltk.download('reuters') # Downloading corpus
nltk.download('stopwords') # Downloading stopwords
nltk.download('punkt') # Downloading tokenizer
nltk.download('wordnet')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package reuters to /Users/mmanian/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mmanian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mmanian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mmanian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Route log records to app.log; filemode='w' truncates the file each time this runs.
# With the threshold at INFO, the logging.debug(...) calls inside cleaning.preprocess
# are only written while the root logger level is temporarily lowered to DEBUG.
logging.basicConfig(filename='app.log', filemode='w', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)

In [3]:
# Load the ticket spreadsheet; the path is relative to the current working directory.
ticket_df = pd.read_excel('input_data.xlsx')
# Show the first rows as a quick sanity check of the columns.
ticket_df.head()

Unnamed: 0,Short description,Description,Caller,Assignment group
0,login issue,-verified user details.(employee# & manager na...,spxjnwir pjlcoqds,GRP_0
1,outlook,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,hmjdrvpb komuaywn,GRP_0
2,cant log in to vpn,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,eylqgodm ybqkwiam,GRP_0
3,unable to access hr_tool page,unable to access hr_tool page,xbkucsvz gcpydteq,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0


In [4]:
# (rows, columns) of the loaded table.
ticket_df.shape

(8500, 4)

In [5]:
# Per-column summary: non-null count, number of distinct values, most frequent value and its frequency.
ticket_df.describe()

Unnamed: 0,Short description,Description,Caller,Assignment group
count,8492,8499,8500,8500
unique,7481,7817,2950,74
top,password reset,the,bpctwhsn kzqsbmtp,GRP_0
freq,38,56,810,3976


In [6]:
# Number of missing values in each column.
ticket_df.isnull().sum()

Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64

In [7]:
# Number of tickets per distinct 'Assignment group' value ...
targetClassCnt=ticket_df['Assignment group'].value_counts()
# ... followed by summary statistics of those per-group counts.
targetClassCnt.describe()

count      74.000000
mean      114.864865
std       465.747516
min         1.000000
25%         5.250000
50%        26.000000
75%        84.000000
max      3976.000000
Name: Assignment group, dtype: float64

In [8]:
class cleaning:
    """Text-cleaning helpers for the ticket data.

    ``preprocess`` normalises one piece of free text into a space-separated
    string of stemmed/lemmatised tokens; ``mergeColumns`` folds the
    "Short description" column into "Description".
    """

    # Class-level placeholders kept from the original definition; __init__
    # replaces them with real objects on every instance.
    lemmatizer=""
    stemmer =""

    # Patterns are compiled once instead of on every preprocess() call.
    _EMAIL_RE = re.compile(r"\S*@\S*\s?")
    _HTML_TAG_RE = re.compile('<.*?>')
    _URL_RE = re.compile(r'http\S+')
    _DIGITS_RE = re.compile('[0-9]+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = LancasterStemmer()
        self.tokenizer = RegexpTokenizer(r'\w+')
        # Load the stop-word list once and keep it as a set: the original
        # re-read the corpus list for every single token.
        self.stop_words = frozenset(stopwords.words('english'))
    
    def preprocess(self, sentence):
        """Normalise one text value into cleaned, stemmed tokens.

        Steps, in order: lower-case; drop e-mail-like tokens; drop the
        literal ``{html}`` marker and ``<...>`` tags; drop ``http...`` URLs;
        delete punctuation; delete digits; tokenize; keep tokens longer than
        two characters that are not English stop words; stem, then lemmatize.

        Args:
            sentence: The text to clean. Non-string values are converted with
                ``str``; ``None`` and float NaN are treated as empty text.

        Returns:
            str: The surviving tokens joined by single spaces (may be "").
        """
        # Missing values would otherwise be stringified to "none"/"nan" and
        # leak into the cleaned text as a bogus token.
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            return ""

        sentence = str(sentence)
        logging.debug("before preprocessing: %s", sentence)
        sentence = sentence.lower()
        sentence = self._EMAIL_RE.sub("", sentence)

        # Markup must be removed BEFORE punctuation is stripped: once '<', '>',
        # '{' and '}' are gone these two steps can never match, and tag names
        # would end up glued to the surrounding words.
        sentence = sentence.replace('{html}', "")
        logging.debug("before %s", sentence)
        cleantext = self._HTML_TAG_RE.sub('', sentence)
        logging.debug("after compile %s", cleantext)
        rem_url = self._URL_RE.sub('', cleantext)

        # NOTE(review): punctuation is deleted, not replaced by a space, so
        # "details.(employee" becomes "detailsemployee". Kept as in the original.
        no_punct = rem_url.translate(self._PUNCT_TABLE)
        logging.debug("removed special charaters  %s", no_punct)
        rem_num = self._DIGITS_RE.sub('', no_punct)
        logging.debug("rem_num: %s", rem_num)

        tokens = self.tokenizer.tokenize(rem_num)
        logging.debug("tokens: %s", tokens)
        stop_words = self.stop_words
        filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]
        stem_words = [self.stemmer.stem(w) for w in filtered_words]
        lemma_words = [self.lemmatizer.lemmatize(w) for w in stem_words]
        result = " ".join(lemma_words)
        logging.debug("after removing stopwords: %s", result)
        return result
    
    def mergeColumns(self, df):
        """Fold "Short description" into "Description" and drop the former.

        The new "Description" is ``"<short description>. <description>"``.
        A missing value on either side is treated as an empty string, so a
        row keeps whichever text it does have instead of becoming NaN.

        Args:
            df: DataFrame with "Short description" and "Description" columns.
                It is modified in place.

        Returns:
            The same DataFrame object, for convenient reassignment.
        """
        # Operate on the argument; the original ignored `df` and reached for
        # the notebook-global `ticket_df`.
        df['Description'] = df["Short description"].str.cat(df["Description"], sep=". ", na_rep="")
        df.drop(columns="Short description", inplace=True)
        return df
# NOTE: this rebinds the name `cleaning` from the class to an instance of it; the class
# is no longer reachable under that name until this cell is run again.
cleaning=cleaning()

In [9]:
# Drop the Caller column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: the in-place version raised
# KeyError on a second execution because the column was already gone.
ticket_df = ticket_df.drop(columns=['Caller'], errors='ignore')

In [10]:
# Preview the first ten rows now that the Caller column has been dropped.
ticket_df.head(10)

Unnamed: 0,Short description,Description,Assignment group
0,login issue,-verified user details.(employee# & manager na...,GRP_0
1,outlook,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,GRP_0
2,cant log in to vpn,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,GRP_0
3,unable to access hr_tool page,unable to access hr_tool page,GRP_0
4,skype error,skype error,GRP_0
5,unable to log in to engineering tool and skype,unable to log in to engineering tool and skype,GRP_0
6,event: critical:HostName_221.company.com the v...,event: critical:HostName_221.company.com the v...,GRP_1
7,ticket_no1550391- employment status - new non-...,ticket_no1550391- employment status - new non-...,GRP_0
8,unable to disable add ins on outlook,unable to disable add ins on outlook,GRP_0
9,ticket update on inplant_874773,ticket update on inplant_874773,GRP_0


In [11]:
# Fold "Short description" into "Description" (see cleaning.mergeColumns).
ticket_df=cleaning.mergeColumns(ticket_df)

In [12]:
# Preview the merged Description column.
ticket_df.head()

Unnamed: 0,Description,Assignment group
0,login issue. -verified user details.(employee#...,GRP_0
1,outlook. \r\n\r\nreceived from: hmjdrvpb.komua...,GRP_0
2,cant log in to vpn. \r\n\r\nreceived from: eyl...,GRP_0
3,unable to access hr_tool page. unable to acces...,GRP_0
4,skype error . skype error,GRP_0


In [13]:
# Run every Description value through the cleaning pipeline and store the
# result next to the raw text; the bound method is passed to map directly.
ticket_df['clearText'] = ticket_df['Description'].map(cleaning.preprocess)

In [14]:
# Compare the raw Description with the cleaned clearText column.
ticket_df.head()

Unnamed: 0,Description,Assignment group,clearText
0,login issue. -verified user details.(employee#...,GRP_0,login issu ver u detailsemploy man nam check u...
1,outlook. \r\n\r\nreceived from: hmjdrvpb.komua...,GRP_0,outlook receiv hello team meetingsskyp meet et...
2,cant log in to vpn. \r\n\r\nreceived from: eyl...,GRP_0,cant log vpn receiv cannot log vpn best
3,unable to access hr_tool page. unable to acces...,GRP_0,un access hrtool pag un access hrtool pag
4,skype error . skype error,GRP_0,skyp er skyp er


In [15]:
# Trace a single ticket through preprocess() with DEBUG logging enabled so the
# intermediate steps are written to the log, then put the logger back.
root_logger = logging.getLogger()
previous_level = root_logger.level
root_logger.setLevel(logging.DEBUG)
try:
    cleaning.preprocess(ticket_df['Description'][1])
finally:
    # Restore whatever level was active before, even if preprocess() raises,
    # so a failure here does not leave the whole notebook logging at DEBUG.
    root_logger.setLevel(previous_level)