In [8]:
import multiprocessing

import pandas as pd
import numpy as np
import spacy

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

### Load data

Download Will's `document-topic-matrix.csv` file and save it as `../../data/document-topic-matrix.csv` (the path the next cell reads from).

In [20]:
# Load the matrix CSV into a DataFrame.
# Alternative input, kept for reference:
#   pd.read_csv("../../data/all_annotated_tickets_clean.csv")
input_df = pd.read_csv(filepath_or_buffer="../../data/document-topic-matrix.csv")

In [79]:
# Preview the first rows to sanity-check which columns were loaded.
input_df.head()

Unnamed: 0,ticket_id,session,java,download,file,account,refresh,homepage,button,paste,...,reinstall,mac,youtube,image,toolbar,load,library,window,restore,title_content
0,955006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,>
1,1066283,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,delete deleted
2,1233313,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,open new window letter n w right click menu mo...
3,1233322,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,web camera picture localhost 65 office picture...
4,1233348,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,firefox hang high cpu use try open local file ...


In [70]:
# Drop the first column (ticket_id) and the last column (title_content, the
# concatenation of title and content); what remains are the keyword columns.
keyword_df = input_df.iloc[:, 1:-1]

# Document-term matrix as a numeric array. Calling `.values` on the whole
# frame would produce an object array because title_content holds strings;
# converting only the keyword columns yields floats and fails early if any
# of those columns is not numeric.
X = keyword_df.to_numpy(dtype=np.float64)

# Keyword names come from the same slice, so they stay aligned with the
# columns of X.
keywords = np.array(list(keyword_df.columns))
len(keywords)

127

### Parameters

- `num_topics`: Number of topics we assume the corpus contains
- `learning_method`: `online` or `batch`; see the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html) for details

In [80]:
# Number of latent topics the LDA model is asked to find.
num_topics: int = 20
# Fitting strategy handed to LatentDirichletAllocation: "online" or "batch".
learning_method: str = "online"

### LDA Model


In [87]:
# Fit LDA on X and keep the transformed output (one row per input row).
# random_state pins the otherwise random initialisation so that a
# "Restart & Run All" reproduces the same topics.
# n_jobs=-1 asks scikit-learn to use all available processors.
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    learning_method=learning_method,
    random_state=42,
    n_jobs=-1,
)
lda_output = lda_model.fit_transform(X)

In [88]:
def show_topics(keywords, lda_model, n_words=20):
    """Return the highest-weighted keywords of every topic in a fitted model.

    Parameters
    ----------
    keywords : array-like of str
        Vocabulary, ordered like the columns of ``lda_model.components_``
        (entry ``j`` names the feature in column ``j``).
    lda_model : object
        Fitted model exposing ``components_``; it is iterated row by row,
        one weight vector per topic.
    n_words : int, default 20
        How many keywords to return per topic. This is independent of the
        number of topics, so the default is a fixed constant rather than the
        notebook-level ``num_topics``.

    Returns
    -------
    list of numpy.ndarray
        One array per topic holding its keywords in order of decreasing
        weight (at most ``n_words`` entries each).
    """
    # Accept plain lists / pandas Index as well as ndarrays.
    vocabulary = np.asarray(keywords)
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Sorting the negated weights ranks the largest weight first.
        top_keyword_locs = np.argsort(-np.asarray(topic_weights))[:n_words]
        topic_keywords.append(vocabulary.take(top_keyword_locs))
    return topic_keywords

# Number of keywords to list per topic. Kept separate from num_topics, which
# is the number of topics (rows), not the number of words shown (columns).
n_top_words = 20
topic_keywords = show_topics(keywords, lda_model, n_words=n_top_words)

# One row per topic, one column per keyword rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    "Word {}".format(rank) for rank in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = [
    "Topic {}".format(topic) for topic in range(df_topic_keywords.shape[0])
]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,toolbar,icon,popup,restart,theme,reset,bank,exception,java,website,sync,disable,history,connection,password,profile,uninstall,download,crash,save
Topic 1,sync,login,account,password,email,profile,save,reset,username,sign,mail,norton,recover,error,website,safari,popup,mac,tab,beta
Topic 2,security,cache,gmail,cookie,dropdown,attachment,sidebar,frame,email,zoom,paste,mail,connect,bookmark,error,profile,copy,certificate,window,blank
Topic 3,tab,restore,open,background,session,window,mouse,navigation,send,drag,restart,bookmark,copy,icon,file,keyboard,java,profile,error,safari
Topic 4,button,file,copy,export,safari,reload,downgrade,script,refresh,google,load,upload,private,youtube,gmail,background,bing,form,tab,reader
Topic 5,mac,beta,disable,blank,permission,email,outlook,sync,mail,network,update,download,safari,bookmark,startup,explorer,font,display,connect,scroll
Topic 6,update,install,upgrade,print,profile,printer,email,download,video,login,save,frame,aol,recover,error,pdf,icon,shortcut,font,microsoft
Topic 7,website,window,display,open,library,blank,aol,email,popup,update,shortcut,background,safari,download,security,attachment,save,account,copy,restore
Topic 8,flash,memory,enable,plugin,adobe,format,player,microsoft,update,uninstall,website,video,download,reload,audio,mail,private,script,facebook,print
Topic 9,download,facebook,error,homepage,yahoo,ubuntu,javascript,certificate,speed,mail,open,reinstall,connection,file,private,security,youtube,network,connect,scroll


20