In [8]:
import multiprocessing

import pandas as pd
import numpy as np
import spacy

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

In [166]:
# Load spacy to tokenize test data
nlp = spacy.load("en_core_web_sm")

## Train
### Load Training Data

Download Will's `document-topic-matrix.csv` file

In [165]:
input_df = pd.read_csv("../../data/document-topic-matrix.csv")
train_df = pd.read_csv("../../data/final_train.csv")

In [167]:
train_docs = list(nlp.pipe(test_df["title_content"], disable=["tagger", "parser", "ner"]))
train_data = [[t.text for t in doc] for doc in train_docs]


In [79]:
input_df.head()

Unnamed: 0,ticket_id,session,java,download,file,account,refresh,homepage,button,paste,...,reinstall,mac,youtube,image,toolbar,load,library,window,restore,title_content
0,955006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,>
1,1066283,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,delete deleted
2,1233313,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,open new window letter n w right click menu mo...
3,1233322,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,web camera picture localhost 65 office picture...
4,1233348,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,firefox hang high cpu use try open local file ...


In [70]:
# Get document term matrix
X = input_df.values[:, 1:-1]

# Get topics by removing
# first_column, ticket_id
# last_column, concatation of title and content
keywords = np.array(list(input_df.columns))
keywords = keywords[1:]
keywords = keywords[:-1]
len(keywords)

127

### Parameters

- `num_topics`: Number of topics that might contain in our corpus
- `learning_method`: online|batch, see [doc](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html) for detail

In [104]:
num_topics = 20
learning_method = "online"

In [178]:
vectorizer = CountVectorizer(max_features=127)
vectorizer.fit(train_df["title_content"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=127, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### LDA Model


In [108]:
lda_model = LatentDirichletAllocation(
    n_components=num_topics, 
    learning_method=learning_method, 
    n_jobs=multiprocessing.cpu_count(),
    random_state=0 # Fixed seed is required to achieve consistent distribution over runs
)
lda_output = lda_model.fit_transform(X)

In [110]:
def show_topics(keywords, lda_model, n_words=num_topics):
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(keywords, lda_model, num_topics)

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,mac,video,download,cache,freeze,youtube,copy,library,scroll,safari,paste,gmail,crash,beta,security,website,upgrade,email,cookie,play
Topic 1,window,homepage,home,open,tab,shortcut,attachment,gmail,copy,email,yahoo,bookmark,login,save,website,pdf,freeze,mail,frame,default
Topic 2,sidebar,frame,beta,print,bookmark,update,sync,google,display,uninstall,log,install,history,login,norton,attachment,safari,window,drag,facebook
Topic 3,email,security,certificate,connection,connect,redirect,zoom,google,block,account,youtube,bing,aol,mail,upload,sync,safari,reset,outlook,username
Topic 4,uninstall,permission,flash,plugin,player,error,format,video,search,reload,audio,reinstall,google,gmail,update,keyboard,adobe,beta,block,cookie
Topic 5,bookmark,load,reinstall,block,private,font,gmail,proxy,dropdown,open,format,error,download,redirect,website,bing,home,recover,form,microsoft
Topic 6,open,file,button,pdf,save,export,reload,reader,upload,dialog,adobe,bank,image,error,home,print,frame,reinstall,gmail,startup
Topic 7,tab,crash,error,blank,open,keyboard,send,shortcut,form,download,home,paste,restore,network,session,beta,drag,proxy,background,image
Topic 8,reset,refresh,ubuntu,java,speed,load,restore,crash,paste,profile,youtube,password,connection,open,reinstall,website,copy,reader,default,adobe
Topic 9,startup,icon,content,slow,launch,virus,block,memory,connection,update,google,download,crash,toolbar,search,sync,history,file,install,account


### Create theme from the keywords under each topic

This step requires some domain expertise and creativity

In [115]:
topics_theme = [
    "Video/Youtube/Freeze/Crash",
    "Gmail/Attachment/Email",
    "Sync Bookmark/Sync History/",
    "Security/TLS certificate and connection/",
    "Uninstall and Reinstall/Block Cookie/Adobe Flash Permission",
    "Recover Download/Drodown Item in Form",
    "File Upload Save Dialog/Upload Image/PDF Reader",
    "Beta Crash/Blank Error/Background Image",
    "Reset Password/Youtube Crash/Connection Speed",
    "Startup or Launch Slow/Block connection/Sync Account",
    "Javascript Error/Connection Security Certificate Exception Stuffs/Adobe Flash Security",
    "Network Connection/Play Audio Sound/Conection Slow",
    "Enable Adobe/Startup Crash/Save Profile/Update Password",
    "Block/Update Bookmark",
    "Recover Account/Import History/",
    "Install Printer/Video Zoom/Download and Proxy",
    "Network Session/Restoe Tab",
    "Sync Account/Reset Password/Restore Bookmark",
    "Default Search/Custom Theme Navigation/Save Bing Image",
    "Popup Dialog Form/Recover Account/Website Error Exception",
]
df_topic_keywords["topic_theme"] = topics_theme
df_topic_keywords.set_index('topic_theme', inplace=True)
df_topic_keywords.T

topic_theme,Video/Youtube/Freeze/Crash,Gmail/Attachment/Email,Sync Bookmark/Sync History/,Security/TLS certificate and connection/,Uninstall and Reinstall/Block Cookie/Adobe Flash Permission,Recover Download/Drodown Item in Form,File Upload Save Dialog/Upload Image/PDF Reader,Beta Crash/Blank Error/Background Image,Reset Password/Youtube Crash/Connection Speed,Startup or Launch Slow/Block connection/Sync Account,Javascript Error/Connection Security Certificate Exception Stuffs/Adobe Flash Security,Network Connection/Play Audio Sound/Conection Slow,Enable Adobe/Startup Crash/Save Profile/Update Password,Block/Update Bookmark,Recover Account/Import History/,Install Printer/Video Zoom/Download and Proxy,Network Session/Restoe Tab,Sync Account/Reset Password/Restore Bookmark,Default Search/Custom Theme Navigation/Save Bing Image,Popup Dialog Form/Recover Account/Website Error Exception
Word 0,mac,window,sidebar,email,uninstall,bookmark,open,tab,reset,startup,error,beta,profile,disable,login,update,website,sync,default,popup
Word 1,video,homepage,frame,security,permission,load,file,crash,refresh,icon,update,connection,save,google,password,install,toolbar,account,search,background
Word 2,download,home,beta,certificate,flash,reinstall,button,error,ubuntu,content,facebook,print,memory,display,restore,upgrade,restart,password,history,yahoo
Word 3,cache,open,print,connection,plugin,block,pdf,blank,java,slow,javascript,audio,enable,youtube,recover,download,log,username,microsoft,explorer
Word 4,freeze,tab,bookmark,connect,player,private,save,open,speed,launch,script,import,format,play,account,mouse,theme,norton,navigation,mail
Word 5,youtube,shortcut,update,redirect,error,font,export,keyboard,load,virus,certificate,network,adobe,search,save,downgrade,session,update,cookie,dialog
Word 6,copy,attachment,sync,zoom,format,gmail,reload,send,restore,block,security,setup,error,printer,sign,drag,image,save,bing,reboot
Word 7,library,gmail,google,google,video,proxy,reader,shortcut,crash,memory,font,outlook,update,print,safari,bookmark,bank,reset,error,log
Word 8,scroll,copy,display,block,search,dropdown,upload,form,paste,connection,load,sound,password,scroll,aol,history,ebay,profile,save,scroll
Word 9,safari,email,uninstall,account,reload,open,dialog,download,profile,update,ubuntu,bookmark,crash,load,history,window,exception,youtube,theme,search


In [145]:
lda_output = lda_model.transform(X)

# column names
topicnames = df_topic_keywords.T.columns

# index names
docnames = ["Doc" + str(i) for i in range(len(input_df))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic.reset_index(inplace=True)
df_sent_topic= pd.merge(input_df, df_document_topic, left_index=True, right_index=True)
df_sent_topic.drop('index', axis=1, inplace=True)

df_topic_theme = df_sent_topic[['title_content', 'dominant_topic']]

def label_theme(row):
    return topics_theme[row["dominant_topic"]]

# Increase diaply max width of dataframe
# pd.options.display.max_colwidth = 150
pd.set_option('display.max_colwidth', None)

df_topic_theme['dominant_topic_theme'] = df_topic_theme.apply(lambda row: label_theme(row), axis=1)
df_topic_theme.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,title_content,dominant_topic,dominant_topic_theme
0,>,7,Beta Crash/Blank Error/Background Image
1,delete deleted,1,Gmail/Attachment/Email
2,open new window letter n w right click menu mouse helpful w window t tab easy remember right keyboard hand mouse,1,Gmail/Attachment/Email
3,web camera picture localhost 65 office picture customer.we need functionality work again.we store local hard drive use self sign cert start camera try add exception work.block use file access need program picture follow php script.error webcam.js error access webcam typeerror argument 1 valid anyof 1-argument overload url.createobjecturl,10,Javascript Error/Connection Security Certificate Exception Stuffs/Adobe Flash Security
4,firefox hang high cpu use try open local file mean try mean open local html file folder tab churn forever spin high cpu use unresponsive form interaction kill task manager mean try include ff file open select html file ff use file://c:/ window explorer context menu open > firefox window explorer drag html file firefox address bar window 10 64-bit win10 32-bit ff 62.0 64-bit middling late model dell laptop 16 gb memory i5 7300u intel graphic,6,File Upload Save Dialog/Upload Image/PDF Reader
5,firefox crash open firefox crash startup safe mode 1536694923,7,Beta Crash/Blank Error/Background Image
6,stop outbound connection address stop firefox make automatic connection page install portableapps version firefox 52.9.0 32 bite esr startup try numerous outbound connection mozilla ip address think ip address like plus slight variation these):54.191.46.2854.187.147.16735.166.127.14852.34.90.2335.166.127.148i add ons extension installed.i set privacy prevent outbound connections.i go mozilla webpage stop firefox make automatic connection stuff like prevent unwanted outbound connection curious contain packet ff send address thank,9,Startup or Launch Slow/Block connection/Sync Account
7,start happen hour firefox year experience phenomenon hour browse firefox grind halt try suggestion work large numb tab regularly family history research move web site time gain information capture screen shot trove update datum ancestry myheritage particularly switch program regularly e.g. photoshop element excel word retire programmer amateur web site developer wonder browser activity index tab refresh tab hour seek way disable activity want happen annoy waste precious time life timetable rapidly come near end similar line frustration experience leave browser open away meal maybe 1 2 hour little 10 15 minute nearly possible firefox start access tab session leave open firefox crash situation occasion take maybe half hour pace firefox background task want firefox background task kind simply want browser show page request leave automatic refresh index simple page display want refresh ask finally point maybe firefox relate maybe ancestry relate maybe os relate sure firefox basic seemingly lot thing want add information record open tab ancestry display tab update information come proper e.g. philip mark hunt instead come ancestry person reload tab hit kind timeout default general true tab,7,Beta Crash/Blank Error/Background Image
8,restore delete site datum manually remove cooky site datum need get ahead accidentally remove needed.just wonder way restore previously clear site datum,14,Recover Account/Import History/
9,save image file exist size prefer firefox open dialoge box require esc key press use right click context menu save image file exist disk exact size prefer firefox open dialoge box require escape key press possible,6,File Upload Save Dialog/Upload Image/PDF Reader


## Testing

### Load test data

In [152]:
test_df = pd.read_csv("../../data/final_test.csv")

In [153]:
test_df.head(2)

Unnamed: 0,ticket_id,session,java,download,file,account,refresh,homepage,button,paste,...,reinstall,mac,youtube,image,toolbar,load,library,window,restore,title_content
0,1277462,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,dom update properly angular web application page load mozilla firefox version 72.0.1 64-bit).the dom update properly navigate home page main menu event occur scroll hover menu dom update refer attach image reproduce issue.i confirm user confirm behavior page work correctly chrome
1,1277474,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,export console hxr response try automate daily work website developping crawl list website dig item list check value be developping powershell python web browser admin right machine work like use catch json answer website parse answer value automate kind popup hey item list 90 100 code look basic solution json interception crawling.i good knowledge json parse python powershell hard catch answer real time export json answer network console perfect basis firefox extension need base start thank help apology average english


In [204]:
def predict_topic(document):
    # document - A test instance that has been tokenize
    Y = vectorizer.transform(document)
    topic_probability_scores = lda_model.transform(Y)
    topic = df_topic_keywords.iloc[np.argmax(np.argmax(topic_probability_scores, axis=0)), :].values.tolist()
    return topic, topic_probability_scores


docs = list(nlp.pipe(test_df["title_content"], disable=["tagger", "parser", "ner"]))
test_documents = [[t.text for t in doc] for doc in docs]

In [213]:
topic, prob_scores = predict_topic(test_documents[0])
print(test_df.loc[0, "title_content"])
print(topic) # Theme: "Gmail/Attachment/Email"



dom update properly angular web application page load mozilla firefox version 72.0.1 64-bit).the dom update properly navigate home page main menu event occur scroll hover menu dom update refer attach image reproduce issue.i confirm user confirm behavior page work correctly chrome
['window', 'homepage', 'home', 'open', 'tab', 'shortcut', 'attachment', 'gmail', 'copy', 'email', 'yahoo', 'bookmark', 'login', 'save', 'website', 'pdf', 'freeze', 'mail', 'frame', 'default']
