In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from preprocessing import preprocess_text
from math import log10
from gensim.models import Word2Vec
from sklearn.decomposition import PCA

In [None]:
# Input: annotated training corpus; the cells below list it as one sub-directory per topic.
# Both paths are relative and are resolved against the current working directory.
train_dir = os.path.realpath("../assets/annotated_corpus/train")
# Output: directory where the dictionary and the term-document matrix are written.
result_dir = os.path.realpath("../assets/wordcount/train")

In [None]:
# Topic sub-directories of the training corpus. os.listdir() returns entries in
# arbitrary order, so they are sorted: later cells process only the first topic
# and must pick the same one on every run and on every machine.
topics = sorted(os.listdir(train_dir))

# Task 1

In [None]:
def get_stems_processed(filepath):
    """Read an annotated document and return its filtered tokens per sentence.

    The file is expected to hold one token per line as tab-separated columns,
    with sentences separated by an empty line. Only the third column is used.
    A token is kept when it is not an English stop word and matches the
    word-like ``pattern`` below.

    NOTE(review): the original code read column index 2 for both the "stem"
    and the "lemma"; that behaviour is preserved here. If the stem is stored
    in a different column, the index used for the output needs adjusting.

    Args:
        filepath: path to the annotated corpus file.

    Returns:
        A list of sentences, each a list of kept tokens (possibly empty).

    Raises:
        IndexError: if a non-blank line has fewer than three columns.
    """
    pattern = re.compile(r"([A-Za-z]+[-.@]?)+\w*\.?")
    # Build the stop-word set once: the original called stopwords.words() and
    # linearly scanned its result for every single token.
    stop_words = set(stopwords.words("english"))

    with open(filepath) as f:
        text = f.read()

    sentences = []
    for chunk in text.split("\n\n"):
        # Drop blank lines so that a trailing newline at the end of the file
        # (or an extra empty line between sentences) no longer raises
        # IndexError or silently discards the sentence that follows it.
        rows = [line for line in chunk.split("\n") if line.strip()]
        if not rows:
            continue
        tokens = [row.split("\t")[2] for row in rows]
        sentences.append(
            [tok for tok in tokens if tok not in stop_words and pattern.match(tok)]
        )
    return sentences

In [None]:
def count_words(sentences, count_by_sentences=False):
    """Count token occurrences in a list of tokenised sentences.

    Args:
        sentences: iterable of sentences, each an iterable of tokens.
        count_by_sentences: when true, return one count dict per sentence
            instead of a single dict for the whole input.

    Returns:
        A ``{token: count}`` dict over all sentences, or a list of such dicts
        (one per sentence, in input order) when ``count_by_sentences`` is set.
    """
    totals = {}
    per_sentence = []
    for sentence in sentences:
        local = {}
        for token in sentence:
            local[token] = local.get(token, 0) + 1
            totals[token] = totals.get(token, 0) + 1
        per_sentence.append(local)
    if count_by_sentences:
        return per_sentence
    return totals

In [None]:
# Collect the tokenised documents and accumulate corpus-wide token frequencies.
all_documents = []
word_dict_raw = {}
for t in topics[:1]:  # process only 1st topic
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        stems = get_stems_processed(os.path.join(workdir, filename))
        all_documents.append(stems)
        counts = count_words(stems)
        for w, n in counts.items():
            word_dict_raw[w] = word_dict_raw.get(w, 0) + n

In [None]:
# Keep only tokens that occur at least 90 times in the processed documents.
word_dict = {w: n for w, n in word_dict_raw.items() if n >= 90}

In [None]:
# Persist the filtered dictionary as JSON, creating the output directory if needed.
os.makedirs(result_dir, exist_ok=True)
dictionary_path = os.path.join(result_dir, "dictionary.json")
with open(dictionary_path, "w") as f:
    json.dump(word_dict, f, indent=1)

In [None]:
# Build one count vector per document over the dictionary tokens.
doc_names = []
matrix_arr = []
for t in topics[:1]:  # process only 1st topic
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        doc_names.append(t + "/" + filename)
        stems = get_stems_processed(os.path.join(workdir, filename))
        counts = count_words(stems)
        vec = [counts.get(w, 0) for w in word_dict]
        matrix_arr.append(vec)
        # Counts are never negative, so "no non-zero entry" means the sum is zero.
        if not any(vec):
            print("Zero vector for document", filename)
    

In [None]:
# Term-document matrix: one row per document, one column per dictionary token.
matrix = (
    pd.DataFrame(matrix_arr)
    .set_axis(list(word_dict.keys()), axis="columns")
    .set_axis(doc_names, axis="index")
)

In [None]:
# Preview the first rows of the term-document matrix.
matrix.head()

In [None]:
# Save the matrix next to the dictionary; the document names are written as the index column.
matrix.to_csv(os.path.join(result_dir, "term-document.csv"))

# Task 2

In [None]:
def tf_idf(words, matrix):
    """Compute a TF-IDF vector for one text over the vocabulary of ``matrix``.

    Args:
        words: ``{token: count}`` mapping for the text being vectorised.
        matrix: term-document DataFrame (rows are documents, columns are
            tokens) used for the vocabulary and the document frequencies.

    Returns:
        A list with one weight per column of ``matrix``, in column order.
        Tokens absent from ``words`` get ``0.0``; otherwise the weight is
        ``tf * log10((N + 1) / (df + 1))`` where ``tf`` is the token count
        divided by the total count in ``words``, ``N`` is the number of rows
        and ``df`` is the number of rows with a positive count for the token.
    """
    n_tokens = sum(words.values())
    n_docs = len(matrix.index)

    def weight(term):
        if term not in words:
            return 0.0
        term_freq = words[term] / n_tokens
        doc_freq = (matrix[term] > 0).sum()
        return term_freq * log10((n_docs + 1) / (doc_freq + 1))

    return [weight(term) for term in matrix.columns]

In [None]:
def vectorize_tf_idf(text, matrix):
    """Return the TF-IDF vector of ``text`` over the columns of ``matrix``.

    The text is passed through ``preprocess_text`` and the result is counted
    as a single sentence before being weighted by ``tf_idf``.
    """
    token_counts = count_words([preprocess_text(text)])
    return tf_idf(token_counts, matrix)

In [None]:
# Reload the term-document matrix from disk; the first CSV column becomes the index.
matrix = pd.read_csv(os.path.join(result_dir, "term-document.csv"), index_col=0)

In [None]:
# Preview the reloaded matrix.
matrix.head()

In [None]:
# First sample text used to try out the vectorisers below.
t1 = """Evolution Designs sell the "Darwin fish".  It's a fish symbol, like the ones
Christians stick on their cars, but with feet and the word "Darwin" written
inside.  The deluxe moulded 3D plastic fish is $4.95 postpaid in the US."""

In [None]:
# Second, longer sample text used to try out the TF-IDF vectoriser.
t2 = """A story based on the premise that the US Congress is mysteriously
assassinated, and fundamentalists quickly take charge of the nation to set it
"right" again.  The book is the diary of a woman's life as she tries to live
under the new Christian theocracy.  Women's right to own property is revoked,
and their bank accounts are closed; sinful luxuries are outlawed, and the
radio is only used for readings from the Bible.  Crimes are punished
retroactively: doctors who performed legal abortions in the "old world" are
hunted down and hanged.  Atwood's writing style is difficult to get used to
at first, but the tale grows more and more chilling as it goes on."""

In [None]:
# TF-IDF vector of the first sample text (one weight per matrix column).
vectorize_tf_idf(t1, matrix)

In [None]:
# TF-IDF vector of the second sample text (one weight per matrix column).
vectorize_tf_idf(t2, matrix)

# Task 3

In [None]:
# Flatten the documents into a single list of tokenised sentences for training.
w2v_sentences = [sentence for document in all_documents for sentence in document]
# An explicit seed together with a single worker thread makes training
# repeatable between runs; with several workers the update order depends on
# thread scheduling, so similarities and plots below would change on each run.
# NOTE(review): per the gensim documentation, reproducibility across separate
# interpreter launches additionally requires PYTHONHASHSEED to be fixed.
w2v = Word2Vec(sentences=w2v_sentences, epochs=40, seed=42, workers=1)

In [None]:
# Sanity check of the trained model: terms the model ranks as most similar to "christian".
w2v.wv.most_similar("christian")

# Task 4

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: first vector (array-like).
        b: second vector (array-like) of the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm. Without that guard the division is 0/0, which yields ``nan``
        and a runtime warning.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
# Cosine similarity between the Word2Vec vectors of "christian" and "atheist".
cosine_similarity(w2v.wv["christian"], w2v.wv["atheist"])

In [None]:
def draw_words(terms, vectors_source):
    """Scatter-plot the given terms after reducing their vectors to 2-D with PCA.

    Args:
        terms: list of terms to plot; each must be a valid key of
            ``vectors_source``.
        vectors_source: object indexable by term that returns the term's
            vector (the notebook passes ``w2v.wv`` and a plain dict).

    Returns:
        The object returned by ``sns.scatterplot``, with one text label per term.
    """
    term_vectors = [vectors_source[term] for term in terms]
    projected = PCA(n_components=2).fit_transform(term_vectors)
    coords = pd.DataFrame(projected, index=terms, columns=["x", "y"])

    ax = sns.scatterplot(coords, x="x", y="y")
    # Label every point with the term it represents.
    for term, row in coords.iterrows():
        ax.text(row.x, row.y, term)
    return ax

In [None]:
# Terms to visualise; each one must be present in the vector source passed to draw_words.
terms_to_check = ["christian", "jesus", "god", "muslim", "islamic", "islam", "atheist", "time", "world", "true", "wrong", "human", "person", "tell", "see", "opinion", "think", "view", "religion"]
# Plot the Word2Vec vectors of these terms in two dimensions.
draw_words(terms_to_check, w2v.wv)

# Task 5

In [None]:
def transform_to_compare(vectors):
    """Reduce ``vectors`` with PCA to the dimensionality of the Word2Vec model.

    Uses the notebook-level ``w2v`` model to determine the target number of
    components, so that the reduced vectors have as many components as the
    Word2Vec embeddings.

    Args:
        vectors: 2-D array-like, one row per item to transform.

    Returns:
        The PCA-transformed array returned by ``fit_transform``.
    """
    target_dim = w2v.vector_size
    return PCA(n_components=target_dim).fit_transform(vectors)

In [None]:
# TF-IDF vector of every dictionary term, each term treated as a one-word text.
terms_vectorized = [vectorize_tf_idf(term, matrix) for term in matrix.columns]

# Task 6

In [None]:
# PCA-reduced TF-IDF term vectors, indexed by term so they can be looked up by word.
reduced_vectors = transform_to_compare(terms_vectorized)
terms_to_compare = pd.DataFrame(reduced_vectors, index=matrix.columns)
terms_to_compare

In [None]:
def compare_methods(w1, w2):
    """Print the cosine similarity of two words under both vectorisations.

    Looks the words up in the notebook-level ``w2v`` model and in the
    ``terms_to_compare`` frame, so both words must be present in each.

    Args:
        w1: first word.
        w2: second word.
    """
    print("Results for words", w1, "and", w2)
    w2v_score = cosine_similarity(w2v.wv[w1], w2v.wv[w2])
    print("W2V:", w2v_score)
    tfidf_score = cosine_similarity(terms_to_compare.loc[w1], terms_to_compare.loc[w2])
    print("Tf-Idf:", tfidf_score)
    print()

In [None]:
# Run the comparison for a few word pairs.
word_pairs = [("islam", "islamic"), ("say", "tell"), ("say", "islam")]
for first_word, second_word in word_pairs:
    compare_methods(first_word, second_word)

In [None]:
# Map every dictionary term to its (unreduced) TF-IDF vector for plotting.
tfidf_data = dict(zip(matrix.columns, terms_vectorized))

In [None]:
# Same plot as for Word2Vec, but built from the TF-IDF term vectors.
draw_words(terms_to_check, tfidf_data)

# Task 7

In [None]:
def vectorize(sentences, w2v):
    """Embed a document as the mean of its per-sentence mean word vectors.

    Each sentence vector is the sum of the vectors of its in-vocabulary tokens
    divided by the total number of tokens in the sentence (out-of-vocabulary
    tokens still count in the divisor). The document vector is the sum of the
    sentence vectors divided by the number of sentences; empty sentences
    contribute a zero vector.

    Args:
        sentences: list of sentences, each a list of tokens.
        w2v: model exposing ``vector_size`` and a ``wv`` lookup with
            ``has_index_for``.

    Returns:
        A numpy array of length ``w2v.vector_size``; all zeros when
        ``sentences`` is empty.
    """
    dim = w2v.vector_size
    if len(sentences) == 0:
        return np.zeros(dim)

    doc_total = np.zeros(dim)
    for tokens in sentences:
        if len(tokens) == 0:
            # An empty sentence adds nothing to the running total.
            continue
        acc = np.zeros(dim)
        for token in tokens:
            if w2v.wv.has_index_for(token):
                acc += w2v.wv[token]
        doc_total += acc / len(tokens)
    return doc_total / len(sentences)

In [None]:
# Show the raw first sample text before it is preprocessed in the next cell.
t1

In [None]:
# Preprocess the sample text with the second argument set to True.
# NOTE(review): presumably this makes preprocess_text return tokens grouped by
# sentence, which is the shape vectorize() iterates over; confirm in preprocessing.
preprocess_text(t1, True)

In [None]:
# Word2Vec-based embedding of the first sample text.
vectorize(preprocess_text(t1, True), w2v)

# Task 8

In [None]:
# Embed every document of the processed topic, keyed by "<topic>/<file>" path.
vectorized_documents = {}
for t in topics[:1]:  # process only 1st topic
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        doc_key = os.path.join(t, filename)
        stems = get_stems_processed(os.path.join(workdir, filename))
        vectorized_documents[doc_key] = vectorize(stems, w2v)

In [None]:
# Write one line per document: its key without ".tsv", then the embedding values, tab-separated.
embeddings_path = os.path.join(train_dir, "..", "train_embeddings.tsv")
with open(embeddings_path, "w") as f:
    for doc_key, embedding in vectorized_documents.items():
        print(doc_key.replace(".tsv", ""), *embedding, sep="\t", file=f)