In [12]:
from functools import reduce
import glob
from nlp.vectorizer.dictionary_builder import build_dictionary, save
from nlp.vectorizer.tfidf_vectorizer import build_TfIdfModel_from_list_of_texts, save_tfidf_model, convert_text_to_tfidf
from nlp.vectorizer.lsi_vectorizer import save_lsi_model, convert_text_to_lsi, build_LsiTopicModel

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def load_document(filename):
    """Read every file matching a glob pattern and return their contents.

    Args:
        filename: Glob pattern (for example ``'artikel/*.txt'``) selecting
            the files to load.

    Returns:
        A list with one string per matched file, ordered by sorted file
        path. The list is empty when nothing matches the pattern.
    """
    contents = []
    # glob yields paths in whatever order the filesystem lists them; sorting
    # makes each document's position in the result reproducible, which matters
    # to callers that label or split documents by index.
    for path in sorted(glob.glob(filename)):
        # errors='ignore' drops bytes that are not valid UTF-8 instead of
        # raising; the context manager closes the file even if read() fails.
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            contents.append(handle.read())
    return contents

def get_dictionary(documents, output):
    """Build a dictionary from ``documents`` and save it to ``output``.

    Args:
        documents: Passed unchanged to ``build_dictionary``.
        output: Destination handed to ``save`` (presumably a file path;
            confirm against ``nlp.vectorizer.dictionary_builder``).

    Returns:
        The object returned by ``build_dictionary``.
    """
    dictionary = build_dictionary(documents)
    # Persist before returning so the saved copy always matches the result.
    save(dictionary, output)
    return dictionary

def get_tfidf_model(documents, dictionary, output):
    """Build a TF-IDF model from ``documents`` and save it to ``output``.

    Args:
        documents: Texts passed to ``build_TfIdfModel_from_list_of_texts``.
        dictionary: Dictionary object passed alongside the texts.
        output: Destination handed to ``save_tfidf_model``.

    Returns:
        The model returned by ``build_TfIdfModel_from_list_of_texts``.
    """
    model = build_TfIdfModel_from_list_of_texts(documents, dictionary)
    # Note the argument order here: (model, destination).
    save_tfidf_model(model, output)
    return model

def get_lsi_model(documents, dictionary_filename, tfidf_output, numtopics, output):
    """Build an LSI topic model and save it to ``output``.

    Args:
        documents: Texts passed to ``build_LsiTopicModel``.
        dictionary_filename: Forwarded to ``build_LsiTopicModel``; going by
            the name, a path to a saved dictionary rather than the object.
        tfidf_output: Forwarded to ``build_LsiTopicModel``; going by the
            name, a path to a saved TF-IDF model.
        numtopics: Forwarded to ``build_LsiTopicModel``.
        output: Destination handed to ``save_lsi_model``.

    Returns:
        The model returned by ``build_LsiTopicModel``.
    """
    model = build_LsiTopicModel(
        documents,
        dictionary_filename,
        tfidf_output,
        numtopics,
    )
    # Unlike save_tfidf_model, this helper takes (destination, model).
    save_lsi_model(output, model)
    return model

def sort_keywords(tfidf, reverse=True):
    """Return the items of ``tfidf`` ordered by their second element.

    Args:
        tfidf: Iterable of indexable items. Presumably ``(token_id, weight)``
            pairs produced by a TF-IDF model, e.g. ``(0, 0.123231)``; verify
            against the caller.
        reverse: When true (the default) the largest second element comes
            first; when false the smallest comes first.

    Returns:
        A new list; the input is not modified.
    """
    def weight_of(entry):
        # Index 1 is the value to rank by; index 0 is left untouched.
        return entry[1]

    ranked = list(tfidf)
    ranked.sort(key=weight_of, reverse=reverse)
    return ranked

def get_keywords(tfidf, dictionary):
    """Look up the dictionary entry for each item of ``tfidf``, ranked first.

    Args:
        tfidf: Items accepted by ``sort_keywords``; element 0 of each item is
            used as the lookup key.
        dictionary: Object exposing ``get(key)``.

    Returns:
        A list of ``dictionary.get(key)`` results in the order produced by
        ``sort_keywords`` with its default (descending) setting.
    """
    ranked = sort_keywords(tfidf)
    return [dictionary.get(entry[0]) for entry in ranked]


def labeling_sentiment(documents):
    """Produce one alternating label per document.

    The label depends only on the document's position, never on its content:
    even positions get ``1.0`` and odd positions get ``-1.0``.

    Args:
        documents: Any sized collection; only its length is used.

    Returns:
        A list of floats with the same length as ``documents``.
    """
    return [
        1.0 if position % 2 == 0 else -1.0
        for position in range(len(documents))
    ]


def generate_lsi_vector(documents, lsi_model, dictionary, tfidf_model):
    """Convert each document to its LSI representation.

    Args:
        documents: Iterable of texts.
        lsi_model: LSI model forwarded to ``convert_text_to_lsi``.
        dictionary: Dictionary forwarded to ``convert_text_to_lsi``.
        tfidf_model: TF-IDF model forwarded to ``convert_text_to_lsi``.

    Returns:
        A list holding the result of ``convert_text_to_lsi`` for every
        document, in input order.
    """
    # The helper's argument order is (text, dictionary, tfidf_model,
    # lsi_model), which differs from this function's own parameter order.
    return [
        convert_text_to_lsi(text, dictionary, tfidf_model, lsi_model)
        for text in documents
    ]


def _get_lsi_values(item) :
    return reduce((lambda acc, _item : acc + [_item[1]] ), item, list())

def get_lsi_values(list_of_lsi_vector):
    """Reduce each LSI vector to a plain list of its values.

    Args:
        list_of_lsi_vector: Iterable of vectors, each accepted by
            ``_get_lsi_values``.

    Returns:
        A list with one inner list per input vector, in input order.
    """
    # Linear-time replacement for a reduce that rebuilt the outer list with
    # ``+`` for every vector.
    return [_get_lsi_values(vector) for vector in list_of_lsi_vector]

In [2]:
# In a notebook kernel __name__ is "__main__", so this guard always passes; it
# only matters if the code is exported to a module and imported elsewhere.
if __name__ == "__main__":
    # Glob pattern for the input corpus, then destinations for each artefact.
    filename = r'artikel/*.txt'
    output_filedict = 'dictionary/artikel_result.dict'
    output_tfidf = 'tf_idf/artikel_result.tfidf'
    output_lsi = "lsi/artikel_result.lsi"
    
    # One string per file matched by the pattern.
    documents = load_document(filename)
    # Builds the dictionary and saves it to output_filedict.
    dictionary = get_dictionary(documents, output_filedict)

    # Builds the TF-IDF model and saves it to output_tfidf.
    tfidf_model = get_tfidf_model(documents, dictionary, output_tfidf)
    
    # Builds the LSI model with numtopics=100 and saves it to output_lsi.
    # This call receives the *paths* of the dictionary and TF-IDF files written
    # above, not the in-memory objects, so it must run after both are saved.
    lsi_model = get_lsi_model(documents, output_filedict, output_tfidf, 100, output_lsi)
    

2018-08-28 21:38:10,816 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-28 21:38:12,318 : INFO : built Dictionary(48736 unique tokens: ['ada', 'adalah', 'adanya', 'ade', 'agar']...) from 2969 documents (total 1428944 corpus positions)
2018-08-28 21:38:12,341 : INFO : saving Dictionary object under dictionary/artikel_result.dict, separately None
2018-08-28 21:38:12,363 : INFO : saved dictionary/artikel_result.dict
2018-08-28 21:38:24,862 : INFO : collecting document frequencies
2018-08-28 21:38:24,863 : INFO : PROGRESS: processing document #0
2018-08-28 21:38:25,172 : INFO : calculating IDF weights for 2969 documents and 48735 features (693048 matrix non-zeros)
2018-08-28 21:38:25,355 : INFO : saving TfidfModel object under tf_idf/artikel_result.tfidf, separately None
2018-08-28 21:38:25,632 : INFO : saved tf_idf/artikel_result.tfidf
2018-08-28 21:38:25,637 : INFO : loading Dictionary object from dictionary/artikel_result.dict
2018-08-28 21:38:25,698 : INFO : load

In [8]:
    # Labels alternate 1.0 / -1.0 by document position (see labeling_sentiment);
    # they are not derived from the document text.
    labels = labeling_sentiment(documents)
    # One LSI vector per document, in the same order as `documents`.
    list_of_lsi_vector = generate_lsi_vector(documents, lsi_model, dictionary, tfidf_model)

In [17]:
    # Keep only element 1 of every entry in each LSI vector, giving one plain
    # list of values per document.
    list_of_lsi_values = get_lsi_values(list_of_lsi_vector)

In [18]:
    # Positional hold-out split: the first TRAIN_SIZE documents train the
    # model and the remainder are used for testing.
    TRAIN_SIZE = 2000
    # Fixed seed so the random forest, and therefore the reported accuracies,
    # are reproducible from one run to the next.
    SEED = 42

    x_train = list_of_lsi_values[:TRAIN_SIZE]
    x_test = list_of_lsi_values[TRAIN_SIZE:]

    y_train = labels[:TRAIN_SIZE]
    y_test = labels[TRAIN_SIZE:]

    # Show the first training sample and its label as a quick sanity check.
    print('===============================================')
    print(x_train[0])
    print(y_train[0])

    rfmodel = RandomForestClassifier(criterion='gini', random_state=SEED)
    rfmodel.fit(x_train, y_train)

    pred_x_train = rfmodel.predict(x_train)
    pred_x_test = rfmodel.predict(x_test)

    # Training accuracy is reported next to test accuracy so over-fitting is
    # visible at a glance.
    acc_train = accuracy_score(y_train, pred_x_train)
    acc_test = accuracy_score(y_test, pred_x_test)

    print('acc_train', acc_train)
    print('acc_test', acc_test)


[0.31340277881363526, -0.17873327104514491, 0.033885392202418284, 0.02494094846788862, 0.038820745905250385, 0.057400185094827756, -0.1968728278573526, -0.03946640700862913, 0.0018997393478939653, 0.02776916132028745, 0.11639261453729195, -0.06522015782509287, -0.05085329111053569, -0.05057011905627114, 0.047335557248805206, 0.0951693606228714, 0.04819085320671879, 0.10703211546705786, 0.11825671305262338, 0.013003571205236542, 0.09495010331522365, 0.026770076609212233, -0.0742270183795625, -0.060635498194013496, 0.05262728431197605, 0.014152093408801992, -0.02773222255492764, 0.020311912487043202, 0.052108167790078354, 0.058455715193515355, -0.010854336597747061, -0.0056022236078424196, -0.034836033984715645, -0.002615003650990818, 0.011243573304026266, -0.027873094262071664, -0.0008550456729464093, -0.05325387919458913, 0.017803316690094245, 0.021070512892197066, -0.04660218506036753, 0.03364441039339408, -0.0316825992358212, -0.03049905751876317, -0.05396238537683693, 0.007721886339