In [93]:
import glob
from nlp.vectorizer.dictionary_builder import build_dictionary, save
from nlp.vectorizer.tfidf_vectorizer import build_TfIdfModel_from_list_of_texts, save_tfidf_model, convert_text_to_tfidf

def load_document(filename):
    """Read every file matching a glob pattern and return their contents.

    Args:
        filename: Glob pattern (for example ``'artikel/*.txt'``) that is
            handed to ``glob.glob``.

    Returns:
        list[str]: The full text of each matching file, in the order
        ``glob.glob`` yields the paths. An empty list when nothing matches.

    Files are decoded as UTF-8; undecodable bytes are dropped
    (``errors='ignore'``) instead of raising.
    """
    texts = []
    for fname in glob.glob(filename):
        # The context manager closes the handle even when read() raises,
        # which the manual open()/close() pairing did not guarantee.
        with open(fname, "r", encoding="utf-8", errors="ignore") as tfile:
            texts.append(tfile.read())
    return texts

def get_dictionary(documents, output):
    """Build a dictionary from ``documents`` and save it to ``output``.

    Args:
        documents: Corpus passed unchanged to ``build_dictionary``.
        output: Destination passed unchanged to ``save``.

    Returns:
        The object produced by ``build_dictionary`` (the same one that was
        saved).
    """
    built = build_dictionary(documents)
    # Persist before returning so the saved copy and the returned object match.
    save(built, output)
    return built

def get_tfidf_model(documents, dictionary, output):
    """Build a TF-IDF model from ``documents`` and save it to ``output``.

    Args:
        documents: Corpus passed unchanged to
            ``build_TfIdfModel_from_list_of_texts``.
        dictionary: Dictionary passed unchanged to the same builder.
        output: Destination passed unchanged to ``save_tfidf_model``.

    Returns:
        The model object produced by the builder (the same one that was
        saved).
    """
    model = build_TfIdfModel_from_list_of_texts(documents, dictionary)
    # Persist before returning so the saved copy and the returned object match.
    save_tfidf_model(model, output)
    return model

def sort_keywords(tfidf, reverse=True):
    """Return the entries of ``tfidf`` ordered by their second element.

    Args:
        tfidf: Iterable of indexable entries. They are expected to be
            ``(dictionary_key, weight)`` pairs such as ``(0, 0.123231)``.
        reverse: When true (the default) the largest weight comes first;
            when false the smallest comes first.

    Returns:
        list: A new sorted list; the input is not modified. Entries with
        equal weights keep their original relative order.
    """
    # NOTE(review): the weight presumably comes from convert_text_to_tfidf, so a
    # larger value would mark a more characteristic term rather than simply a
    # more frequent one; confirm against that helper.
    ranked = list(tfidf)
    ranked.sort(key=lambda entry: entry[1], reverse=reverse)
    return ranked

def get_keywords(tfidf, dictionary):
    """Translate sorted TF-IDF entries into their dictionary words.

    Args:
        tfidf: Iterable of entries whose first element is a dictionary key and
            whose second element is the weight used for ordering.
        dictionary: Mapping-like object exposing ``get(key)``.

    Returns:
        list: One ``dictionary.get(key)`` result per entry, ordered by
        ``sort_keywords`` with its default (descending) direction.
    """
    # A list is built eagerly so callers receive concrete values, not a lazy
    # iterator.
    return [dictionary.get(entry[0]) for entry in sort_keywords(tfidf)]

In [94]:
# Driver cell: load the article corpus, then build and save the dictionary and
# the TF-IDF model derived from it.
if __name__ == "__main__":
    # Glob pattern selecting the article text files that make up the corpus.
    filename = r'artikel/*.txt'
    # Destinations handed to get_dictionary / get_tfidf_model for saving.
    output_filedict = 'dictionary/artikel_result.dict'
    output_tfidf = 'tf_idf/artikel_result.tfidf'
    
    # Read every matching file, then build and save the dictionary.
    documents = load_document(filename)
    dictionary = get_dictionary(documents, output_filedict)

    # Build and save the TF-IDF model. `dictionary` and `tfidf_model` stay
    # bound at module level because the example cell further down reads them.
    tfidf_model = get_tfidf_model(documents, dictionary, output_tfidf)
    

2018-08-23 21:22:06,646 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-23 21:22:08,304 : INFO : built Dictionary(48736 unique tokens: ['ada', 'adalah', 'adanya', 'ade', 'agar']...) from 2969 documents (total 1428944 corpus positions)
2018-08-23 21:22:08,327 : INFO : saving Dictionary object under dictionary/artikel_result.dict, separately None
2018-08-23 21:22:08,363 : INFO : saved dictionary/artikel_result.dict
2018-08-23 21:22:18,718 : INFO : collecting document frequencies
2018-08-23 21:22:18,719 : INFO : PROGRESS: processing document #0
2018-08-23 21:22:18,909 : INFO : calculating IDF weights for 2969 documents and 48735 features (693048 matrix non-zeros)
2018-08-23 21:22:19,042 : INFO : saving TfidfModel object under tf_idf/artikel_result.tfidf, separately None
2018-08-23 21:22:19,219 : INFO : saved tf_idf/artikel_result.tfidf


In [95]:
# Example: extract the keywords of article no. 1.
# The context manager closes the file even if read() raises, which the manual
# open()/close() pairing did not guarantee.
with open("artikel/1.txt", "r", encoding="utf-8", errors="ignore") as tfile:
    sample = tfile.read()
# Convert the raw text with the dictionary and TF-IDF model built earlier.
tfidf = convert_text_to_tfidf(sample, dictionary, tfidf_model)

# Words come back ordered by descending weight (see get_keywords).
keywords = get_keywords(tfidf, dictionary)
print(keywords)


['serangan', 'ade', 'stroke', 'jantung', 'namnung', 'muda', 'faktor', 'perbanyak', 'anda', 'artikel', 'diingatan', 'produkwhole', 'scanningkesehatan', 'upuntuk', 'jaga', 'darah', 'komedian', 'berwaspada', 'detikhot', 'grain', 'arah', 'risiko', 'pembuluh', 'jaringan', 'meninggalnya', 'merebaknya', 'diperburuk', 'terkait', 'sampaikan', 'kolesterol', 'kasus', 'disarankan', 'melengkapi', 'meninggi', 'obesitas', 'pepatah', 'riwayat', 'usia', 'gizi', 'mengejutkan', 'perkotaan', 'merokok', 'tanyadokteranda', 'paham', 'gula', 'sesungguhnya', 'kita', 'saya', 'tahu', 'poin', 'trigliserida', 'kerusakan', 'profil', 'usianya', 'wawasan', 'lipid', 'mengenai', 'hdl', 'menghindarkan', 'makan', 'badannya', 'tersumbat', 'maka', 'check', 'plak', 'ldl', 'penyumbatan', 'tidaknya', 'pencegahan', 'mengalir', 'diabetes', 'intensitas', 'terlepas', 'catatan', 'tentu', 'omega', 'otak', 'keluarga', 'mengarah', 'sempat', 'indeks', 'sewaktu', 'dikaitkan', 'gemuk', 'kematian', 'metabolik', 'diturunkan', 'cegah', 'ge