In [1]:
import re
import json
import nltk
import pandas as pd
from collections import Counter
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Data

In [2]:
with open('ticket_data.json','r') as json_file:
    allTickets = json.load(json_file)[:4000]

In [3]:
# Get all the tags from all the tickets
allTags = []
for ticket in allTickets:
  tags = ticket['tags']
  for index in tags:
    for tag in tags[index]:
      allTags.append(tag)
    
cleanTags = []
for tag in allTags:
    if 'firefox' not in tag and 'windows' not in tag and 'mac' not in tag and 'linux' not in tag:
        cleanTags.append(tag)
# Get the frequency of each tag
tag_fre = Counter(cleanTags)

# We select most common 50 tags to train the model
df_tags = [stat[0] for stat in tag_fre.most_common(20)]

In [4]:
# build the inital dataframe
ticket_ids = [ticket['ticket_id'] for ticket in allTickets]
contents = [ticket['content'] for ticket in allTickets]
titles = [ticket['title'] for ticket in allTickets]
data = pd.DataFrame(data = ticket_ids, columns=['Ticket ID'])
data['content'] = contents
data['title'] = titles

In [5]:
# determine if a single ticket has specific tag
def if_label(ticket,label):
  temp_tags = []
  tags = ticket['tags']
  for index in tags:
    temp_tags.extend(tags[index])
  return 1 if label in temp_tags else 0

In [6]:
# extend the dataframe
for label in df_tags:
  temp = [if_label(ticket,label) for ticket in allTickets]
  data[label] = temp

### Data cleaning

In [7]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [8]:
data['content'] = data['content'].str.lower()
data['content'] = data['content'].apply(cleanHtml)
data['content'] = data['content'].apply(cleanPunc)
data['content'] = data['content'].apply(keepAlpha)

In [9]:
stop_words = stopwords.words('english')
contents = []
for content in data.content:
  temp = []
  for word in content.strip().split():
    if word not in stop_words:
      temp.append(word)
  contents.append(" ".join(temp))
data.content = contents

In [10]:
data.head()

Unnamed: 0,Ticket ID,content,title,desktop,other,websites,features,data,crash,addon,...,sync,privacy-and-security_1,escalate,download-and-install_1,bookmarks,tabs,needsinfo,android,tips,mobile
0,1277553,os else recently firefox keeps going back prev...,firefox keeps going back to previous page rand...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1277548,ho turn fucking safe search fucking computer t...,How do I turn off Safesearch?,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1277546,experiencing problems running html interaction...,Problem running Adobe Animate CC HTML5 interac...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1277543,im trying get firefox pdf viewer work reason e...,Firefox pdf viewer is not working even when it...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1277540,im trying sync firefox account everytime appea...,Sorry. We’ve locked your account.,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Training and predict

In [11]:
# split the dataset to train and test part
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)

print(train.shape)
print(test.shape)
train_text = train['content']
test_text = test['content']

(2800, 23)
(1200, 23)


In [12]:
# convert all the text to vector
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['Ticket ID','content','title'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['Ticket ID','content','title'], axis=1)

### Problem transofrm

In [13]:
%%time
# train the model with one vs rest classifier and logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
LogReg_pipeline = Pipeline([('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),])
for tag in df_tags:
    print('Processing {} tags...'.format(tag))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[tag])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}\n'.format(accuracy_score(test[tag], prediction)))

Processing desktop tags...
Test accuracy is 0.9991666666666666

Processing other tags...
Test accuracy is 0.6091666666666666

Processing websites tags...
Test accuracy is 0.8466666666666667

Processing features tags...
Test accuracy is 0.8716666666666667

Processing data tags...
Test accuracy is 0.9008333333333334

Processing crash tags...
Test accuracy is 0.9291666666666667

Processing addon tags...
Test accuracy is 0.9258333333333333

Processing fix-problems tags...
Test accuracy is 0.9725

Processing beta tags...
Test accuracy is 0.9783333333333334

Processing customize tags...
Test accuracy is 0.9833333333333333

Processing sync tags...
Test accuracy is 0.9866666666666667

Processing privacy-and-security_1 tags...
Test accuracy is 0.995

Processing escalate tags...
Test accuracy is 0.9966666666666667

Processing download-and-install_1 tags...
Test accuracy is 0.9958333333333333

Processing bookmarks tags...
Test accuracy is 0.9958333333333333

Processing tabs tags...
Test accuracy 

In [14]:
%%time
from skmultilearn.problem_transform import LabelPowerset
# initialize Label Powerset multi-label classifier
# with a Logistic Regression classifier
classifier = LabelPowerset(LogisticRegression(solver='lbfgs',multi_class='auto'))

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.4266666666666667
CPU times: user 9min 13s, sys: 48 s, total: 10min 1s
Wall time: 1min 50s


In [15]:
%%time
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(GaussianNB())
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.175
CPU times: user 2min 52s, sys: 1min 8s, total: 4min 1s
Wall time: 4min 2s


In [16]:
%%time
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

# k for 20 tags
classifier = MLkNN(k = 20)

x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.2625
CPU times: user 23min 47s, sys: 3.49 s, total: 23min 50s
Wall time: 23min 53s


In [17]:
%%time
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.16583333333333333
CPU times: user 1min 23s, sys: 50.4 s, total: 2min 13s
Wall time: 2min 14s


In [18]:
%%time
# using random forest 
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'gini', random_state = 42)

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.30333333333333334
CPU times: user 24.8 s, sys: 556 ms, total: 25.4 s
Wall time: 25.5 s


In [19]:
%%time
# using decision tree
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion="gini", max_depth = 100)

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.4058333333333333
CPU times: user 33.2 s, sys: 399 ms, total: 33.6 s
Wall time: 33.7 s
