In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# Load the SMS corpus: each kept line starts with a tab-terminated label
# ("spam" or "ham"); the label prefix is sliced off and only the message
# text is collected. Lines carrying neither prefix are skipped.
spam_header = "spam\t"
no_spam_header = "ham\t"
documents = []

# Explicit encoding so decoding does not depend on the platform locale.
# NOTE(review): assumes the dataset file is UTF-8 encoded -- confirm.
with open('./dataset/SMSSpamCollection', encoding='utf-8') as file_handle:
    for line in file_handle:
        if line.startswith(spam_header):
            documents.append(line[len(spam_header):])
        elif line.startswith(no_spam_header):
            documents.append(line[len(no_spam_header):])

# Bag-of-words counts restricted to at most 2000 features, with English
# stop words removed.
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
term_counts = vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

In [5]:
# Fit an LDA topic model on the term-count matrix and print the
# highest-weighted vocabulary terms for each topic.
N_TOPICS = 10       # number of latent topics to fit
N_TOP_TERMS = 10    # how many terms to print per topic
RANDOM_SEED = 0     # fixed seed so a re-run reproduces the same topics

topic_model = LatentDirichletAllocation(n_components=N_TOPICS,
                                        random_state=RANDOM_SEED)
topic_model.fit(term_counts)

topics = topic_model.components_
for topic_id, weights in enumerate(topics):
    # Rank terms by absolute weight, largest first. sorted() is stable, so
    # terms with equal weight keep their vocabulary order.
    ranked = sorted(zip(weights, vocabulary),
                    key=lambda pair: abs(pair[0]),
                    reverse=True)
    top_terms = [term for _, term in ranked[:N_TOP_TERMS]]
    # Output format: "topic <id>: term1,term2,...," (each term is followed by
    # a comma, including the last one).
    print("topic {}".format(topic_id), end=": ")
    print("".join("{},".format(term) for term in top_terms))



topic 0: gt,lt,great,hope,message,today,day,ve,min,lol,
topic 1: new,cos,thanks,things,mins,thk,urgent,class,mind,10,
topic 2: got,good,lor,home,work,ask,im,thing,wan,won,
topic 3: free,txt,text,stop,mobile,reply,send,ur,week,www,
topic 4: know,want,just,ll,dont,need,amp,don,time,let,
topic 5: love,sorry,later,dear,happy,ur,life,ll,meet,night,
topic 6: ok,come,like,da,day,oh,hi,said,care,leave,
topic 7: time,pls,really,number,hey,wat,just,buy,babe,ya,
topic 8: ur,going,tell,did,tomorrow,wish,msg,morning,haha,miss,
topic 9: think,yeah,don,right,getting,shit,problem,ll,yo,guys,
