In [15]:
import glob
from nlp.vectorizer.dictionary_builder import build_dictionary, save
from nlp.vectorizer.tfidf_vectorizer import build_TfIdfModel_from_list_of_texts, save_tfidf_model, convert_text_to_tfidf
from nlp.vectorizer.lsi_vectorizer import save_lsi_model, convert_text_to_lsi, build_LsiTopicModel

def load_document(filename):
    s = []
    flist = glob.glob(filename)
    for fname in flist:
    #print(fname)
        tfile = open(fname, "r", encoding="utf-8", errors='ignore')
        line = tfile.read()  # read the content of file and store in "line"
        tfile.close()  # close the file
        s.append(line)

    return s

def get_dictionary(documents, output) :
    my_dict = build_dictionary(documents)
    save(my_dict, output)
    return my_dict

def get_tfidf_model(documents, dictionary, output) :
    tfidf_model = build_TfIdfModel_from_list_of_texts(documents, dictionary)
    save_tfidf_model(tfidf_model, output)
    return tfidf_model

def get_lsi_model(documents, dictionary_filename, tfidf_output, numtopics, output) :
    lsi_model = build_LsiTopicModel(documents, dictionary_filename, tfidf_output, numtopics)
    save_lsi_model(output, lsi_model)
    return lsi_model

def sort_keywords(tfidf, reverse=True) :
    #sorted by Idf value of each word
    #idf value would return tuple (0,0.123231) => 1st element is the key in the dictionary, 2nd element is the idf value
    #higher of idf value means higher word frequency
    return sorted(tfidf, key=lambda item : item[1], reverse=reverse)

def get_keywords(tfidf, dictionary) :
    sorted_keywords = sort_keywords(tfidf)
    keywords = map(lambda item: dictionary.get(item[0]), sorted_keywords)
    #you have to convert it back to list if you want to return the values
    return list(keywords)

In [16]:
if __name__ == "__main__":
    filename = r'artikel/*.txt'
    output_filedict = 'dictionary/artikel_result.dict'
    output_tfidf = 'tf_idf/artikel_result.tfidf'
    output_lsi = "lsi/artikel_result.lsi"
    
    documents = load_document(filename)
    dictionary = get_dictionary(documents, output_filedict)

    #get our tfidf_model
    tfidf_model = get_tfidf_model(documents, dictionary, output_tfidf)
    
    #get lsi model
    lsi_model = get_lsi_model(documents, output_filedict, output_tfidf, 100, output_lsi)

2018-08-23 21:45:56,974 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-23 21:45:58,644 : INFO : built Dictionary(48736 unique tokens: ['ada', 'adalah', 'adanya', 'ade', 'agar']...) from 2969 documents (total 1428944 corpus positions)
2018-08-23 21:45:58,669 : INFO : saving Dictionary object under dictionary/artikel_result.dict, separately None
2018-08-23 21:45:58,688 : INFO : saved dictionary/artikel_result.dict
2018-08-23 21:46:08,967 : INFO : collecting document frequencies
2018-08-23 21:46:08,970 : INFO : PROGRESS: processing document #0
2018-08-23 21:46:09,208 : INFO : calculating IDF weights for 2969 documents and 48735 features (693048 matrix non-zeros)
2018-08-23 21:46:09,344 : INFO : saving TfidfModel object under tf_idf/artikel_result.tfidf, separately None
2018-08-23 21:46:09,529 : INFO : saved tf_idf/artikel_result.tfidf
2018-08-23 21:46:09,530 : INFO : loading Dictionary object from dictionary/artikel_result.dict
2018-08-23 21:46:09,555 : INFO : load

In [18]:
    #example of LSI dari document 3
    sample = documents[3]
    lsi  = convert_text_to_lsi(sample, dictionary, tfidf_model, lsi_model)
    print(lsi)
    
    # find topic
    print(lsi_model.show_topic(0,10))


[(0, 0.07818671203016449), (1, 0.00041254028681540083), (2, -0.002357423059857172), (3, -0.04735748089889817), (4, 0.016830233701086737), (5, -0.018997273036704638), (6, 0.0015581881709658422), (7, -0.012985546338644845), (8, 0.010969893027024209), (9, -0.0178045441708118), (10, 0.025173120139281215), (11, 0.019563807989688008), (12, -0.02508368940075561), (13, -0.022026737930019517), (14, 0.019188343325035547), (15, 0.010284402242063862), (16, -0.005288161405238253), (17, -0.00402358038698052), (18, 0.03616708745191964), (19, -0.04124750845251615), (20, 0.03587983225970891), (21, -0.060600678974893386), (22, -0.06728160691257531), (23, 0.08489568155777426), (24, -0.03367936092877198), (25, -0.07789046680381827), (26, 0.006564937493007829), (27, 0.016122606039506996), (28, 0.008470251563250574), (29, -0.03593025150395827), (30, -0.001024966290132078), (31, -0.054765511010879425), (32, 0.00154840210262587), (33, 0.01040632260470082), (34, -0.008505622115913603), (35, 0.00072411351353590