In [1]:
import pandas as pd
from hazm import *
from itertools import groupby
import math

# Azin's Work (TF-IDF)

In [3]:
# Load the cleaned answers table; the first CSV column is used as the index.
# NOTE(review): a later cell reassigns `answers_clean` from "mh_clean.csv",
# so this value is overridden when the notebook runs top to bottom.
answers_clean = pd.read_csv('answers_clean.csv', index_col=0)
# I'm not really sure about removing stop words
# Read the stop-word file and split it on newlines. The context manager
# guarantees the file handle is closed once the contents have been read.
with open("stop_words_short.txt", encoding="utf8") as text_file:
    stop_words = text_file.read().split("\n")

In [20]:
# Load the cleaned answers table; the first CSV column is used as the index.
answers_clean = pd.read_csv("mh_clean.csv", index_col=0)
# Read the stop-word file and split its contents on newlines; the context
# manager closes the file as soon as the block exits.
with open("stop_words_short.txt", encoding="utf8") as text_file:
    stop_words = text_file.read().split("\n")

In [7]:
def remove_sw(sent_tokens):
    """Return the tokens of one sentence with the stop words dropped.

    Each token is checked against the module-level ``stop_words`` collection.
    The relative order of the surviving tokens is preserved and the input is
    left untouched.

    Args:
        sent_tokens: iterable of tokens belonging to a single sentence.

    Returns:
        A new list holding only the tokens that are not in ``stop_words``.
    """
    return [word for word in sent_tokens if word not in stop_words]

In [8]:
def unique_tokens(tokens):
    """Return the distinct items of ``tokens`` in first-seen order.

    Args:
        tokens: iterable of tokens (normally strings).

    Returns:
        A new list containing each distinct token once, ordered by its first
        appearance in ``tokens``.
    """
    items = list(tokens)
    try:
        # dict keys are unique and keep insertion order, so this de-duplicates
        # in linear time instead of rescanning a list for every token.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items cannot be dict keys; fall back to the
        # equality-based quadratic scan so such inputs still work.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [18]:
def word_tokenizing(remove_stop_word=True):
    """Tokenize every entry of the ``'sentences'`` column of ``answers_clean``.

    Each entry is passed through ``word_tokenize``; when ``remove_stop_word``
    is true the resulting tokens are additionally filtered with ``remove_sw``.

    Args:
        remove_stop_word: whether to strip stop words from each token list.

    Returns:
        A tuple ``(all_tokens, max_tokens_per_line)``: the list of token lists
        (one per entry, in order) and the length of the longest token list
        after any stop-word removal (0 when there are no entries).
    """
    tokenized = []
    longest = 0
    for sentence in answers_clean['sentences']:
        words = word_tokenize(sentence)
        if remove_stop_word:
            words = remove_sw(words)
        # Track the widest row so callers know how far to pad.
        longest = max(longest, len(words))
        tokenized.append(words)
    return tokenized, longest

In [10]:
def calculate_df(all_tokens):
    """Count, for every token, the number of documents that contain it.

    Args:
        all_tokens: list of token lists, one list per document. Tokens must be
            hashable because they are used as dictionary keys.

    Returns:
        A tuple ``(DF, number_of_documents)`` where ``DF`` maps each token to
        the number of documents it occurs in (each document counts at most
        once per token) and ``number_of_documents`` is ``len(all_tokens)``.
    """
    DF = {}
    for tokens in all_tokens:
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # token repeated inside one document is only counted once.
        for token in dict.fromkeys(tokens):
            # .get() replaces the former bare ``except:`` so that unrelated
            # errors are no longer silently swallowed.
            DF[token] = DF.get(token, 0) + 1

    return DF, len(all_tokens)

In [11]:
def calculate_tf_idf(all_tokens, DF, number_of_documents):
    """Compute a TF-IDF weight for every token position of every document.

    For a token ``t`` in a document ``d`` the weight is
    ``(count of t in d / len(d)) * log(number_of_documents / DF[t])``.

    Args:
        all_tokens: list of token lists, one list per document.
        DF: mapping from token to the number of documents containing it.
        number_of_documents: total number of documents.

    Returns:
        A list with one list per document; each inner list has one weight per
        token position, in the same order as the document's tokens. An empty
        document yields an empty list.

    Raises:
        KeyError: if a token is missing from ``DF``.
    """
    sentences = []
    for sent_tokens in all_tokens:
        number_of_tokens = len(sent_tokens)
        # Count every occurrence of each token in this document.
        # itertools.groupby only merges *adjacent* equal items, so on unsorted
        # tokens it under-counts words that repeat non-consecutively; a plain
        # counting dict gives the true term frequency.
        tf = {}
        for token in sent_tokens:
            tf[token] = tf.get(token, 0) + 1

        sentence = []
        for token in sent_tokens:
            idf = math.log(number_of_documents / DF[token])
            word_tf = tf[token] / number_of_tokens
            sentence.append(word_tf * idf)
        sentences.append(sentence)

    return sentences

In [12]:
def max_padding(vectors, max_tokens_per_line):
    """Right-pad every vector with zeros up to ``max_tokens_per_line`` entries.

    The input vectors are not modified: each output row is a fresh list, so
    calling this function repeatedly on the same data gives the same result.

    Args:
        vectors: iterable of sequences of numbers.
        max_tokens_per_line: target length for every row.

    Returns:
        A new list of lists. Rows shorter than ``max_tokens_per_line`` are
        extended with ``0``; rows already at or beyond that length are copied
        unchanged (never truncated).
    """
    padded_sentences = []
    for vector in vectors:
        # max(0, ...) makes the "no padding needed" case explicit.
        pad = max(0, max_tokens_per_line - len(vector))
        padded_sentences.append(list(vector) + [0] * pad)
    return padded_sentences

In [13]:
def main():
    """Run the TF-IDF pipeline end to end and write the result to CSV.

    Steps: tokenize the sentences, compute document frequencies, compute the
    per-token TF-IDF weights, zero-pad every row to the longest token count,
    and save the padded matrix as ``mh_tf_idf.csv``.
    """
    token_lists, widest_row = word_tokenizing()
    doc_freq, doc_count = calculate_df(token_lists)
    tfidf_rows = calculate_tf_idf(token_lists, doc_freq, doc_count)
    # Sanity check: exactly one weight vector per tokenized document.
    assert len(tfidf_rows) == len(token_lists)
    padded_rows = max_padding(tfidf_rows, widest_row)
    pd.DataFrame(padded_rows).to_csv("mh_tf_idf.csv")

In [19]:
# Run the pipeline only when this code is the entry point, not when imported.
if __name__ == '__main__':
    main()