In [None]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' resource so that stopwords.words() below can read it.
nltk.download('stopwords')
# English stop words kept in a set: the later `in stops` checks are set lookups.
stops = {stop_word for stop_word in stopwords.words('english')}

In [None]:
from tqdm import tqdm
import pandas as pd
import re
# Collect one (doc_id, lemmas) pair per training document across the four
# annotated TSV files.  Column 0 is used as the document id, column 1 is
# compared against the "<endofsentence>" marker and column 3 holds the lemma.
all_lemmas = []
for train_idx in ['1', '2', '3', '4']:
    c = pd.read_csv(f"../assets/annotated-corpus/train/{train_idx}.tsv",delimiter='\t')
    if len(c) == 0:
        # Nothing to group; also avoids an IndexError on c.values[0] below.
        continue
    sentence_lemmas = []
    prev_doc_id = c.values[0][0]
    for el in tqdm(c.values):
        if el[1] == "<endofsentence>":
            continue
        if el[0] != prev_doc_id:
            # Document boundary: flush the finished document, start a new one.
            all_lemmas.append((prev_doc_id, sentence_lemmas))
            sentence_lemmas = []
            prev_doc_id = el[0]
        # Deliberately no `else`: the row that opens a new document is a real
        # token row as well, so it goes through the same filtering as the rest
        # (previously the first token of every document was dropped).
        lemma = el[3]
        if not isinstance(lemma, str):
            # Non-string cells (e.g. NaN for an empty field) carry no lemma.
            continue
        # Strip punctuation and lowercase *before* the stop-word test: the
        # membership check on `stops` is case-sensitive, so testing the
        # original casing would let capitalised stop words through.
        lemma_filtered = re.sub(r'[^\w\s]','', lemma).lower()
        if len(lemma_filtered)==0 or lemma_filtered in stops:
            continue
        sentence_lemmas.append(lemma_filtered)
    # Flush the last document of the file: no boundary row follows it, so the
    # in-loop flush never fires for it.
    all_lemmas.append((prev_doc_id, sentence_lemmas))
            

In [None]:
N = len(all_lemmas)

In [None]:
all_words = [el for sentence in all_lemmas for el in sentence[1]]

In [None]:
len(all_words)

In [None]:
from collections import Counter
word_cnt = Counter(all_words)

In [None]:
len(word_cnt)

In [None]:
import numpy as np
np.percentile(list(word_cnt.values()), 80)

In [None]:
len(Counter({k: c for k, c in word_cnt.items() if c >=13}))

In [None]:
word_cnt = Counter({k: c for k, c in word_cnt.most_common(8192)})

In [None]:
word_cnt

In [None]:
import json
# Persist the kept vocabulary as a JSON array of [token, count] pairs.
with open("tokens_freq.json", "w") as tokens_freq_file:
    tokens_freq_file.write(
        json.dumps([[token, count] for token, count in word_cnt.items()])
    )

In [None]:
def create_term_document_matrix(documents, token_freq):
    """Build a dense document-by-term count matrix.

    Args:
        documents: iterable of indexable entries; item ``[1]`` of each entry
            is the document's token list.
        token_freq: mapping whose keys are the vocabulary terms. The key order
            fixes the column order of the result.

    Returns:
        A list with one row per document. Each row is a list holding, for
        every vocabulary term (lowercased for the lookup), how many times it
        occurs in that document.
    """
    vocabulary = list(token_freq.keys())
    # Lazily count tokens per document while tqdm reports progress.
    per_document_counts = (Counter(document[1]) for document in tqdm(documents))
    return [
        [counts[term.lower()] for term in vocabulary]
        for counts in per_document_counts
    ]

In [None]:
len(word_cnt)

In [None]:
term_doc_matrix = create_term_document_matrix(all_lemmas,word_cnt)

In [None]:
import json
# Persist the raw count matrix (list of rows) as JSON.
with open("term_document_matrix.json", "w") as matrix_file:
    matrix_file.write(json.dumps(term_doc_matrix))

In [None]:
import gc
gc.collect()

In [None]:
term_doc_matrix = np.array(term_doc_matrix)

In [None]:
term_doc_matrix.shape

In [None]:
import matplotlib.pyplot as plt
# Heat map of the raw counts: first 512 rows of the term-document matrix,
# drawn without interpolation so every cell stays a crisp pixel.
fig = plt.figure(dpi=500)
fig.gca().imshow(np.asarray(term_doc_matrix[:512]), interpolation='none')
plt.show()

In [None]:
np.count_nonzero([[1,1,0,0],
                  [0,1,1,0]], axis=0)

In [None]:
from math import log
# Smoothed inverse document frequency per vocabulary column:
#   IDF[column] = log((1 + N) / (1 + df)),
# where df is the number of rows with a non-zero count in that column.
terms = list(word_cnt)
IDF = {
    column: log((1 + N) / (1 + np.count_nonzero(term_doc_matrix[:, column])))
    for column in tqdm(range(len(terms)))
}

In [None]:
term_doc_matrix[0].shape

In [None]:
def compute_tf_idf(term_document_matrix, terms):
    """Weight every raw term count by the IDF of its column.

    Args:
        term_document_matrix: iterable of rows; ``row[i]`` is the count of
            term ``i`` in that document.
        terms: vocabulary; only its length is used (number of columns read).

    Returns:
        A list of rows, each a list of ``count * IDF[i]`` values. ``IDF`` is
        read from the notebook-level dictionary keyed by column index.
    """
    return [
        [document[column] * IDF[column] for column in range(len(terms))]
        for document in tqdm(term_document_matrix)
    ]

In [None]:
tf_idf_matrix = compute_tf_idf(term_doc_matrix[:10000], terms)

In [None]:
tf_idf_matrix_np = np.array(tf_idf_matrix)

In [None]:
tf_idf_matrix_np.shape

In [None]:
import matplotlib.pyplot as plt 
# Heat map of the TF-IDF weights for the first 10 rows, without interpolation.
fig = plt.figure(dpi=1000)
fig.gca().imshow(np.asarray(tf_idf_matrix[:10]), interpolation='none')
plt.show()

In [None]:
vec = tf_idf_matrix[0]
print(vec)

In [None]:
from gensim.models import Word2Vec
# Fit a Word2Vec model on the per-document lemma lists (item [1] of every
# all_lemmas entry) and store it on disk.
model = Word2Vec(
    sentences=[doc_entry[1] for doc_entry in all_lemmas],
    vector_size=256,
    window=5,
    min_count=1,
    workers=14,
)
model.save("word2vec.model")

In [None]:
model = Word2Vec.load("word2vec.model")

In [None]:
all_lemmas[0]

In [None]:
# Continue training the loaded model on the same lemma lists for 100 epochs.
model.train(
    [doc_entry[1] for doc_entry in all_lemmas],
    total_examples=len(all_lemmas),
    epochs=100,
)

In [None]:
from scipy.spatial import distance
1 - distance.cosine(model.wv["winter"], model.wv["summer"])

In [None]:
np.dot(model.wv["winter"], model.wv["summer"])/(np.linalg.norm(model.wv["winter"])*np.linalg.norm(model.wv["summer"]))

In [None]:
import math
def cosine_similarity(v1,v2):
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        v1, v2: sized iterables of numbers (lists, tuples, 1-D arrays).

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero norm
        (including two empty vectors) the similarity is undefined; 0.0 is
        returned instead of dividing by zero.

    Raises:
        ValueError: if the vectors differ in length. Previously a longer
            ``v2`` was silently truncated and a longer ``v1`` raised
            IndexError.
    """
    if len(v1) != len(v2):
        raise ValueError(
            f"vectors must have the same length, got {len(v1)} and {len(v2)}"
        )
    dot_product = sum(x * y for x, y in zip(v1, v2))
    norm1 = math.sqrt(sum(x * x for x in v1))
    norm2 = math.sqrt(sum(y * y for y in v2))
    denominator = norm1 * norm2
    if denominator == 0:
        # Zero-norm input: avoid ZeroDivisionError / NaN.
        return 0.0
    return dot_product / denominator

In [None]:
cosine_similarity(model.wv["winter"], model.wv["summer"])

In [None]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the vectors of every word in the Word2Vec vocabulary.
words = [*model.wv.key_to_index]
X = [model.wv[word] for word in words]
pca = PCA(n_components=2)
pca.fit(X)

In [None]:
from sklearn.decomposition import PCA
# Reduce the transposed TF-IDF matrix to 512 components. After the transpose
# each row of the input corresponds to one matrix column of tf_idf_matrix_np.
pca1 = PCA(n_components=512)
tf_idf_matrix_np_transformed = pca1.fit_transform(np.transpose(tf_idf_matrix_np))

In [None]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the vectors of every word in the Word2Vec vocabulary.
words = [*model.wv.key_to_index]
X = [model.wv[word] for word in words]
pca = PCA(n_components=2)
pca.fit(X)

In [None]:
# Project a hand-picked set of words into the fitted 2-D PCA space and label
# every point with its word.
for word in ['winter',"snow","summer","hot","cold","bomb","president","cool","heat","sunday","december","july"]:
    vec = model.wv[word]
    # A single transform call yields both coordinates (the PCA has two
    # components); previously the same vector was transformed twice.
    x, y = pca.transform(vec.reshape(1, -1))[0]
    plt.plot(x,y, 'o', color='red')
    plt.annotate(word, (x, y))

In [None]:
# Project a hand-picked set of words into the fitted 2-D PCA space and label
# every point with its word.
for word in ['president', 'election',"government","senate","monday", 'december']:
    vec = model.wv[word]
    # A single transform call yields both coordinates (the PCA has two
    # components); previously the same vector was transformed twice.
    x, y = pca.transform(vec.reshape(1, -1))[0]
    plt.plot(x,y, 'o', color='red')
    plt.annotate(word, (x, y))

In [None]:

import numpy as np
def get_sentence_emb(sent, wv=None):
    """Average the word vectors of ``sent`` into one embedding.

    Args:
        sent: sequence of tokens.
        wv: optional vector lookup supporting ``wv[token]`` (raising KeyError
            for unknown tokens) and exposing ``vector_size``. Defaults to the
            notebook-level ``model.wv``.

    Returns:
        1-D numpy array of length ``wv.vector_size``: the sum of the vectors
        of all known tokens divided by ``len(sent)``. Unknown tokens add
        nothing to the sum but still count in the divisor. An empty ``sent``
        yields an all-zero vector instead of NaNs.
    """
    if wv is None:
        wv = model.wv
    # Size the accumulator from the lookup itself rather than a hard-coded 256.
    sum_vector = np.zeros(wv.vector_size)
    if len(sent) == 0:
        return sum_vector
    for token in sent:
        try:
            emb = wv[token]
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is swallowed so
            # that genuine failures are no longer hidden by a bare except.
            continue
        sum_vector += emb
    # NOTE(review): the divisor counts out-of-vocabulary tokens as well, which
    # shrinks the mean when tokens are unknown - confirm this is intended.
    return sum_vector/len(sent)

In [None]:
get_sentence_emb(all_lemmas[6][1])

In [None]:
tf_idf_matrix_np = np.array(tf_idf_matrix)

In [None]:
# The original line was two bare names with no operator (a SyntaxError).
# Show the shapes of both matrices side by side instead of dumping the arrays.
tf_idf_matrix_np.shape, tf_idf_matrix_np_transformed.shape

In [None]:
word_cnt

In [None]:
cosine_similarity(tf_idf_matrix_np_transformed[list(word_cnt.keys()).index("election"),:], tf_idf_matrix_np_transformed[list(word_cnt.keys()).index("president"),:])


In [None]:
cosine_similarity(tf_idf_matrix_np[:,list(word_cnt.keys()).index("election")], tf_idf_matrix_np[:,list(word_cnt.keys()).index("president")])


In [None]:
cosine_similarity(model.wv["election"], model.wv["president"])


In [None]:
def embedd_csv_text(lemmas,test_idx):
    """Append one embedding row per document to ``{test_idx}_emb.tsv``.

    Args:
        lemmas: iterable of entries where item ``[0]`` is the document id and
            item ``[1]`` the token list passed to ``get_sentence_emb``.
        test_idx: file stem used to build the output path under
            ``../assets/annotated-corpus/test/``.

    Each row is the document id followed by the embedding values; every field,
    including the last one, is terminated by a tab, then a newline.
    """
    # Open the output once for the whole batch instead of once per document.
    # NOTE(review): the file is opened in append mode, so calling this again
    # for the same test_idx adds duplicate rows - confirm that is intended.
    with open(f"../assets/annotated-corpus/test/{test_idx}_emb.tsv", "a", encoding="utf-8") as file:
        for el in lemmas:
            docid = el[0]
            text = el[1]
            embed = get_sentence_emb(text)
            fields = "".join(f"{emb}\t" for emb in embed)
            file.write(f"{docid}\t" + fields + "\n")

In [None]:
from tqdm import tqdm
import pandas as pd
import re
# For every annotated test file: group the filtered lemmas per document and
# write one embedding row per document.  Column 0 is used as the document id,
# column 1 is compared against the "<endofsentence>" marker and column 3
# holds the lemma.
for the_idx in ['1', '2', '3', '4']:
    test_all_lemmas = []
    c = pd.read_csv(f"../assets/annotated-corpus/test/{the_idx}.tsv",delimiter='\t')
    if len(c) == 0:
        # Nothing to embed; also avoids an IndexError on c.values[0] below.
        continue
    sentence_lemmas = []
    prev_doc_id = c.values[0][0]
    for el in tqdm(c.values):
        if el[1] == "<endofsentence>":
            continue
        if el[0] != prev_doc_id:
            # Document boundary: flush the finished document, start a new one.
            test_all_lemmas.append((prev_doc_id, sentence_lemmas))
            sentence_lemmas = []
            prev_doc_id = el[0]
        # Deliberately no `else`: the row that opens a new document is a real
        # token row as well, so it goes through the same filtering as the rest
        # (previously the first token of every document was dropped).
        lemma = el[3]
        if not isinstance(lemma, str):
            # Non-string cells (e.g. NaN for an empty field) carry no lemma.
            continue
        # Strip punctuation and lowercase *before* the stop-word test: the
        # membership check on `stops` is case-sensitive.
        lemma_filtered = re.sub(r'[^\w\s]','', lemma).lower()
        if len(lemma_filtered)==0 or lemma_filtered in stops:
            continue
        sentence_lemmas.append(lemma_filtered)
    # Flush the last document of the file: no boundary row follows it, so the
    # in-loop flush never fires for it.
    test_all_lemmas.append((prev_doc_id, sentence_lemmas))
    embedd_csv_text(test_all_lemmas,the_idx)
            

In [None]:
len(test_all_lemmas)
              