In [None]:
import os
import regex
import math
import csv
from collections import Counter, OrderedDict
from functools import reduce
from operator import add

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy import spatial
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from gensim.models import Word2Vec
from tqdm.notebook import tqdm

# Пункт 1. Чтение данных и построение матрицы Term-Document

In [None]:
# English stop words from NLTK; read_data drops every token found in this set.
stop_words = set(stopwords.words('english'))
# Emoticon pattern: eyes (":", ";", "="), an optional nose ("-" or "_") and a mouth
# (")", "(", "D", "P"), or a face built from "-"/"*" around "_" (e.g. "-_-", "*_*").
# NOTE(review): read_data lowercases tokens before matching, so the uppercase
# "D"/"P" alternatives presumably never match there — confirm this is intended.
emotiocons = r'(?:(?::|;|=)(?:-|_)?(?:\)|\(|D|P))|(?:[-*]_[-*])'
# Matches when a token ends with sentence-final punctuation: ".", "...", "?", "!", "?!" or "!?".
ending_signs = r'(?:\?|\.|\.\.\.|\!|\?\!|\!\?)$'

def read_data(subset="train", squeeze_text=True, collect_analysis=True):
    """Read the annotated corpus for ``subset`` and collect token statistics.

    Every ``*.tsv`` file under ``../assets/annotated-corpus/<subset>/<category>/``
    is treated as one document named ``<category>_<file stem>``.  Only the first
    column (the token) is used.  Tokens are lower-cased; stop words are dropped,
    and so are tokens that end with sentence-final punctuation unless they
    contain an emoticon.

    Args:
        subset: Sub-directory of the corpus to read (e.g. "train" or "test").
        squeeze_text: If True, each document is one flat list of tokens.
            If False, each document is a list of sentences (lists of tokens),
            with a sentence boundary at every row whose token field is empty.
        collect_analysis: If True, also fill the frequency and term-document
            statistics; otherwise they are returned empty.

    Returns:
        A tuple ``(texts, token_frequency, term_document_matrix,
        term_document_matrix_v2)`` where ``texts`` maps document name to its
        tokens/sentences, ``token_frequency`` is a Counter of tokens,
        ``term_document_matrix`` is a Counter keyed by ``(token, doc_name)`` and
        ``term_document_matrix_v2`` maps token -> {doc_name: count}.
    """
    token_frequency = Counter()
    term_document_matrix = Counter()
    term_document_matrix_v2 = dict()
    texts = dict()

    categories = ['age', 'ethnicity', 'gender', 'religion', 'other_cyberbullying', 'not_cyberbullying']
    for folder in tqdm(categories):
        folder_path = os.path.join('../assets/annotated-corpus/', subset, folder)
        for file in tqdm(os.listdir(folder_path)):
            # Only the tsv annotation files are documents.
            if not file.endswith('.tsv'):
                continue
            file_path = os.path.join(folder_path, file)
            try:
                # dtype=str keeps tokens verbatim (no "1" -> "1.0" style conversions).
                # keep_default_na=False with na_values=[''] means that only an empty
                # field becomes NaN; real tokens such as "null", "NA" or "nan" are no
                # longer silently turned into missing values.
                # NOTE(review): pandas skips completely blank lines by default
                # (skip_blank_lines=True), so only rows with an empty first field
                # act as sentence boundaries — confirm against the corpus format.
                df = pd.read_csv(
                    file_path, sep='\t', header=None,
                    dtype=str, keep_default_na=False, na_values=[''],
                )
            except pd.errors.EmptyDataError:
                continue

            # The document name is constant for the whole file, so build it once.
            doc_name = f"{folder}_{file.rsplit('.', 1)[0]}"

            text = list()
            sentence = list()
            for raw_token in df[0].tolist():
                # An empty token field marks a sentence boundary.
                if pd.isna(raw_token):
                    if not squeeze_text:
                        if len(sentence) > 0:
                            text.append(sentence)
                        sentence = []
                    continue

                token = str(raw_token).lower()
                ends_with_punctuation = (
                    regex.search(emotiocons, token) is None
                    and regex.search(ending_signs, token) is not None
                )
                if ends_with_punctuation or token in stop_words:
                    continue

                if squeeze_text:
                    text.append(token)
                else:
                    sentence.append(token)

                if collect_analysis:
                    token_frequency[token] += 1
                    term_document_matrix[(token, doc_name)] += 1
                    doc_counts = term_document_matrix_v2.setdefault(token, {})
                    doc_counts[doc_name] = doc_counts.get(doc_name, 0) + 1

            # Flush the last sentence when the file does not end with a boundary row.
            if not squeeze_text and len(sentence) > 0:
                text.append(sentence)

            if len(text) > 0:
                texts[doc_name] = text

    return texts, token_frequency, term_document_matrix, term_document_matrix_v2

In [None]:
# Load the training subset as flat token lists and collect the token statistics.
train_texts, token_frequency, term_document_matrix, term_document_matrix_v2 = read_data(
    subset="train",
    squeeze_text=True,
    collect_analysis=True,
)

In [None]:
# Load the test subset as flat token lists; statistics are not collected here,
# so the three remaining return values are discarded.
test_texts, _, _, _ = read_data(
    subset="test",
    squeeze_text=True,
    collect_analysis=False,
)

# Пункт 2. Построение матрицы TF-IDF

### Чистим данные от редких токенов

In [None]:
# Work on a copy so that the original frequency table stays intact.
token_frequency_copy = Counter(token_frequency)

In [None]:
# Remove every token that occurs fewer than two times from the working copy.
for token in [tok for tok, count in token_frequency.items() if count < 2]:
    del token_frequency_copy[token]

In [None]:
# Tokens that occur only once (count below 2), kept as a dict key view.
rare_tokens = {token: count for token, count in token_frequency.items() if count < 2}.keys()

In [None]:
# Vocabulary size before and after removing the rare tokens.
len(token_frequency), len(token_frequency) - len(rare_tokens)

In [None]:
# Shallow copy: the per-term {document: count} dictionaries are shared with the original.
term_document_matrix_copy_v2 = dict(term_document_matrix_v2)

In [None]:
# Keep only the terms that are not rare.  The dictionary is rebuilt instead of
# deleting keys from an existing copy, which keeps this cell idempotent:
# re-running it no longer raises KeyError for keys that were already removed.
term_document_matrix_copy_v2 = {
    term: doc_counts
    for term, doc_counts in term_document_matrix_v2.items()
    if term not in rare_tokens
}

In [None]:
# Number of terms in the term-document matrix before and after filtering.
len(term_document_matrix_v2), len(term_document_matrix_copy_v2)

### Строим матрицу Document-Term

In [None]:
# Invert term -> {document: count} into document -> {term: count}.
document_term_matrix_v2 = {}

for term, doc_counts in term_document_matrix_copy_v2.items():
    for doc_name, count in doc_counts.items():
        document_term_matrix_v2.setdefault(doc_name, {})[term] = count

In [None]:
# Number of documents that contain more than one distinct (non-rare) term.
sum(1 for doc_terms in document_term_matrix_v2.values() if len(doc_terms) > 1)

In [None]:
# Number of terms (in the unfiltered matrix) that occur in more than one document.
sum(1 for term_docs in term_document_matrix_v2.values() if len(term_docs) > 1)

### Строим матрицу TF-IDF

In [None]:
# Short aliases for the filtered term-document and document-term matrices.
td_matrix = term_document_matrix_copy_v2
dt_matrix = document_term_matrix_v2

documents = dt_matrix.keys()
n = len(dt_matrix)  # total number of documents

# tf_idf maps document -> {term: tf * idf}, where
#   tf  = term count / total count of all terms in the document
#   idf = log(number of documents / number of documents containing the term)
tf_idf = dict()

for doc, term_vals in dt_matrix.items():
    # The document total does not depend on the term, so compute it once per
    # document instead of once per term.
    doc_total = sum(term_vals.values())
    tf_idf[doc] = {
        term: (val / doc_total) * math.log(n / len(td_matrix[term]))
        for term, val in term_vals.items()
    }

#  Пункт 3
Реализовать метод, позволяющий векторизовать произвольный текст с использованием нейронных сетей (предлагается использовать стандартную реализацию модели w2v или glove). Выбранную модель необходимо обучить на обучающей выборке.

In [None]:
# Every training document (a flat token list) is passed to Word2Vec as one "sentence".
pure_texts = list(train_texts.values())
# 100-dimensional vectors, context window of 3, min_count=1, 4 worker threads.
model = Word2Vec(sentences=pure_texts, vector_size=100, window=3, min_count=1, workers=4)

In [None]:
# Run 100 training epochs over the same corpus.
# NOTE(review): the Word2Vec constructor above was already given this corpus, so
# the model has presumably been trained once already — confirm that continuing
# training here (rather than passing epochs to the constructor) is intended.
model.train(pure_texts, total_examples=len(pure_texts), epochs=100)

In [None]:
#token_frequency

In [None]:
# Look up the trained vectors of a few hand-picked tokens, grouped by topic.
# A missing token raises KeyError, exactly as a direct model.wv[...] lookup does.

# Colours
vector1, vector2 = (model.wv[word] for word in ('black', 'white'))

# Insults
vector3, vector4, vector5, vector6, vector7 = (
    model.wv[word] for word in ('fuck', 'dumb', 'bitch', 'idiot', 'stupid')
)

# People
vector8, vector9, vector10, vector11 = (
    model.wv[word] for word in ('people', 'girl', 'woman', 'man')
)

# Religion
vector12, vector13, vector14 = (
    model.wv[word] for word in ('islam', 'muslim', 'christian')
)

# Пункт 4

Рассмотрим, насколько близки между собой токены, выбранные в пункте 3.

In [None]:
def cosine_sim_lib(vec1, vec2):
    """Return the cosine similarity of two vectors using SciPy.

    SciPy provides the cosine *distance*; the similarity is one minus it.
    """
    cosine_distance = spatial.distance.cosine(vec1, vec2)
    return 1 - cosine_distance

In [None]:
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two equal-length one-dimensional vectors.

    Args:
        vec1: Sequence of numbers (list, tuple, one NumPy row, ...).
        vec2: Sequence of numbers with the same length as ``vec1``.

    Returns:
        The cosine similarity as a Python float.  When either vector is empty
        or has zero norm the similarity is undefined and 0.0 is returned
        instead of failing with a division by zero.

    Raises:
        AssertionError: if the lengths differ or ``vec1`` is a nested list.
    """
    assert len(vec1) == len(vec2), "vectors must have the same length"
    # Guard the element check so that empty vectors do not raise IndexError.
    assert not (len(vec1) > 0 and isinstance(vec1[0], list)), "vectors must be one-dimensional"

    dot12, norm1, norm2 = 0.0, 0.0, 0.0
    for x1, x2 in zip(vec1, vec2):
        # Convert to Python floats so the sums are accumulated in double
        # precision even when the inputs are float32 NumPy values.
        x1, x2 = float(x1), float(x2)
        dot12 += x1 * x2
        norm1 += x1 * x1
        norm2 += x2 * x2

    denominator = math.sqrt(norm1 * norm2)
    if denominator == 0.0:
        return 0.0
    return dot12 / denominator

In [None]:
# Sanity check: the hand-written implementation next to the SciPy-based one.
cosine_sim(vector1, vector2), cosine_sim_lib(vector1, vector2)

In [None]:
# All vocabulary words (in the iteration order of key_to_index) and their vectors.
words = [word for word in model.wv.key_to_index]
X = list(map(model.wv.__getitem__, words))

In [None]:
# Fit a 2-component PCA on all word vectors; the fitted `pca` is reused below
# to project individual word vectors onto the plane.
pca = PCA(n_components=2)
result = pca.fit_transform(X)

In [None]:
plt.figure(dpi=500)

# (marker colour, vectors, labels) for every topical group of tokens.
word_groups = [
    ('red', [vector1, vector2], ['black', 'white']),
    ('blue', [vector3, vector4, vector5, vector6, vector7], ['fuck', 'dumb', 'bitch', 'idiot', 'stupid']),
    ('orange', [vector8, vector9, vector10, vector11], ['people', 'girl', 'woman', 'man']),
    ('green', [vector12, vector13, vector14], ['islam', 'muslim', 'christian']),
]

for group_color, group_vectors, group_names in word_groups:
    for vec, name in zip(group_vectors, group_names):
        # pca.transform is given a 2-D array, hence the added leading axis.
        reduced_vec = pca.transform(vec[np.newaxis, ...])
        x_coord, y_coord = reduced_vec[0][0], reduced_vec[0][1]
        plt.plot(x_coord, y_coord, 'o', color=group_color)
        plt.annotate(name, (x_coord, y_coord))

plt.show()

# Пункт 5
Сократим размерность матрицы TF-IDF. Для этого преобразуем её из компактного словарного формата хранения в полную матрицу «документ × терм», большинство элементов которой равны нулю (разреженную по содержанию).

In [None]:
# Number of terms in the filtered term-document matrix.
len(list(td_matrix))

In [None]:
# Fix an ordering of the terms and of the documents; the cells below use the
# document position as the row index and the term position as the column index.
terms = list(td_matrix)
docs = list(dt_matrix)

In [None]:
# Document x term matrix of float32 zeros; it is filled with TF-IDF weights below.
n_rows, n_cols = len(docs), len(terms)
sparse_data = np.zeros((n_rows, n_cols), dtype=np.float32)

In [None]:
# Column index of every term.  With this lookup each document only visits the
# terms it actually contains instead of scanning the whole vocabulary, which
# turns the docs x terms double loop into one pass over the non-zero weights.
term_to_column = {term: j for j, term in enumerate(terms)}

# tqdm wraps the list itself (not the enumerate object) so the bar knows the total.
for i, doc in enumerate(tqdm(docs)):
    for term, weight in tf_idf[doc].items():
        j = term_to_column.get(term)
        # Terms outside `terms` are ignored, as before.
        if j is not None:
            sparse_data[i, j] = weight

In [None]:
# Show the filled TF-IDF matrix.
sparse_data

In [None]:
# Reduce the TF-IDF matrix to 100 principal components.
# random_state is fixed so that the projection is reproducible between runs
# when scikit-learn selects a randomized SVD solver for this matrix.
pca_v2 = PCA(n_components=100, random_state=42)
sparse_pca_data = pca_v2.fit_transform(sparse_data)

# Пункт 6
С использованием разработанного метода подсчета косинусного расстояния сравнить эффективность метода векторизации с использованием нейронных сетей и эффективность базовых методов векторизации с последующим сокращением размерности.

In [None]:
# Terms that have a TF-IDF weight in the document at position 500.
tf_idf[docs[500]].keys()

In [None]:
# One row per document: the plain average of the Word2Vec vectors of its terms.
model_data = np.zeros((len(docs), 100), dtype=np.float32)
for row, doc_name in enumerate(docs):
    vector_sum = np.zeros(100, dtype=np.float32)
    n_terms = 0
    for term in tf_idf[doc_name]:
        vector_sum += model.wv[term]
        n_terms += 1
    model_data[row] = vector_sum / n_terms

In [None]:
# The 100 most frequent tokens with their counts, most frequent first.
top_words_v1 = dict(token_frequency.most_common(100))

In [None]:
# Alternative selection: every token that occurs more than 200 times.
top_words_v2 = { k: v for k, v in token_frequency.items() if v > 200 }

In [None]:
# How many tokens pass the frequency threshold.
len(top_words_v2)

Подберём тексты с разными метками, которые содержат популярные слова и состоят минимум из 5 слов.

In [None]:
def is_doc_valid(doc_name, words, category, num_words, must_words):
    """Check whether a document qualifies for the comparison subset.

    Args:
        doc_name: Document name of the form ``<category>_<file stem>``.
        words: Set of the distinct words of the document.
        category: Category label the document must belong to.
        num_words: Minimum number of distinct words required.
        must_words: Collection of words of which at least one must be present.

    Returns:
        True if the document belongs to ``category``, has at least
        ``num_words`` words and shares at least one word with ``must_words``.
    """
    # Match the category as a name prefix rather than as a substring, so a short
    # label such as "age" cannot match a document of another category whose
    # name merely contains those letters.
    in_category = doc_name == category or doc_name.startswith(f"{category}_")
    return in_category and len(words) >= num_words and len(words.intersection(must_words)) > 0

In [None]:
# Pick up to `num_docs_per_category` documents per category that have at least
# `num_words` distinct words and contain at least one of the most frequent tokens.
docs_subset = []
texts_cnt = {
    'age': 0, 'ethnicity': 0, 'gender': 0, 'religion': 0, 'other_cyberbullying': 0, 'not_cyberbullying': 0
}
num_words = 5
num_docs_per_category = 3

must_words = set(top_words_v1)
for doc, words in dt_matrix.items():
    ws = set(words)
    # First category (in dictionary order) that still has room and accepts the document.
    matched_category = next(
        (
            category
            for category, selected in texts_cnt.items()
            if selected < num_docs_per_category and is_doc_valid(doc, ws, category, num_words, must_words)
        ),
        None,
    )
    if matched_category is not None:
        texts_cnt[matched_category] += 1
        docs_subset.append(doc)

In [None]:
# Names of the documents selected for the comparison.
docs_subset

In [None]:
# Compare the first selected document with every selected document (itself
# included) under each of the three document representations.
base_doc = docs_subset[0]
base_index = docs.index(base_doc)

representations = [
    ("Words2Vec", model_data),
    ("TF-IDF", sparse_data),
    ("TF-IDF with PCA", sparse_pca_data),
]

for doc in docs_subset:
    index = docs.index(doc)
    for label, matrix in representations:
        print(f"{label}: {base_doc} <-> {doc}:", cosine_sim(matrix[base_index], matrix[index]))
    print()

# Пункт 7
Реализовать метод, осуществляющий векторизацию произвольного текста

In [None]:
def inverse_td_matrix(td_matrix):
    """Turn a term -> {document: count} mapping into document -> {term: count}.

    Documents appear in the result in the order in which they are first met
    while iterating over the terms.
    """
    inverted = {}
    for term, doc_counts in td_matrix.items():
        for doc_name, count in doc_counts.items():
            inverted.setdefault(doc_name, {})[term] = count
    return inverted

In [None]:
def calc_tf_idf(td_matrix, dt_matrix):
    """Compute TF-IDF weights for every (document, term) pair.

    Args:
        td_matrix: Mapping term -> {document: count}.
        dt_matrix: Mapping document -> {term: count} (the inverse of ``td_matrix``).

    Returns:
        Mapping document -> {term: tf * idf}, where ``tf`` is the term count
        divided by the total term count of the document and ``idf`` is
        ``log(number of documents / number of documents containing the term)``.
    """
    n = len(dt_matrix)
    tf_idf = dict()

    for doc, term_vals in tqdm(dt_matrix.items()):
        # The document total is the same for every term of the document, so it
        # is computed once per document rather than once per term.
        doc_total = sum(term_vals.values())
        tf_idf[doc] = {
            term: (val / doc_total) * math.log(n / len(td_matrix[term]))
            for term, val in term_vals.items()
        }
    return tf_idf

In [None]:
def clear_texts(texts, rare_tokens):
    """Remove rare tokens from sentence-split texts.

    Args:
        texts: Mapping document name -> list of sentences (lists of tokens).
        rare_tokens: Container of tokens to drop.

    Returns:
        An OrderedDict with the same layout in which rare tokens are removed,
        and sentences and documents that became empty are left out.
    """
    cleaned_texts = OrderedDict()
    for doc_name, sentences in tqdm(texts.items()):
        kept_sentences = []
        for sentence in sentences:
            kept_words = [word for word in sentence if word not in rare_tokens]
            if kept_words:
                kept_sentences.append(kept_words)
        if kept_sentences:
            cleaned_texts[doc_name] = kept_sentences
    return cleaned_texts

In [None]:
def vectorize_texts(subset, clear_rare_tokens=True):
    """Vectorize every document of ``subset`` with TF-IDF-weighted Word2Vec.

    The subset is read sentence by sentence, TF-IDF weights are computed for it,
    a Word2Vec model is trained on its documents, and every document is
    represented by the mean of its sentence vectors.  A sentence vector is the
    TF-IDF-weighted sum of its word vectors divided by the sum of the
    document's TF-IDF weights.

    Args:
        subset: Corpus sub-directory to read (e.g. "train" or "test").
        clear_rare_tokens: If True, tokens that occur fewer than two times in
            this subset are removed before vectorization.

    Returns:
        A tuple ``(texts, model_data)``: ``texts`` is an OrderedDict
        document name -> list of sentences, and ``model_data`` is a float32
        array with one 100-dimensional row per entry of ``texts``, in the same
        order.  A document whose TF-IDF weights sum to zero gets a zero row.
    """
    vector_size = 100

    print("Reading data and calculating term-document matrices")
    texts, token_frequency_base, _, td_matrix_base = read_data(subset, False, True)

    texts = OrderedDict(texts)

    if clear_rare_tokens:
        print("Delete rare tokens in token_frequency")
        # Use the frequencies of the subset that was just read, not a
        # notebook-level variable left over from an earlier cell.
        rare_tokens = {token for token, count in token_frequency_base.items() if count < 2}

        print("Delete rare tokens in term-document matrix")
        td_matrix = {
            term: doc_counts
            for term, doc_counts in td_matrix_base.items()
            if term not in rare_tokens
        }

        print("Clear texts")
        texts = clear_texts(texts, rare_tokens)

    else:
        td_matrix = td_matrix_base

    print("Build document-term matrix")
    dt_matrix = inverse_td_matrix(td_matrix)

    print("Calculate TF-IDF")
    tf_idf = calc_tf_idf(td_matrix, dt_matrix)

    # Flatten each document's sentences into one token list for Word2Vec.
    pure_texts = [
        [word for sentence in sentences for word in sentence]
        for sentences in texts.values()
    ]

    print("Train Word2Vec")
    # NOTE(review): the model is trained on the subset being vectorized, so
    # calling this with "test" trains on the test data — confirm this is intended.
    model = Word2Vec(sentences=pure_texts, vector_size=vector_size, window=3, min_count=1, workers=4)
    model.train(pure_texts, total_examples=len(pure_texts), epochs=100)

    print("Vectorize texts")
    # Rows are indexed by position in `texts`, so size the output by `texts`.
    model_data = np.zeros((len(texts), vector_size), dtype=np.float32)
    for i, (doc, sentences) in enumerate(tqdm(texts.items())):
        doc_weights = tf_idf[doc]
        weight_total = sum(doc_weights.values())
        if weight_total == 0:
            # Every weight is zero: leave the row as zeros instead of dividing by zero.
            continue

        sentence_vecs = []
        for sentence in sentences:
            sentence_vec = np.zeros(vector_size, dtype=np.float32)
            for word in sentence:
                sentence_vec += model.wv[word] * doc_weights[word]
            sentence_vecs.append(sentence_vec / weight_total)

        model_data[i] = np.mean(sentence_vecs, axis=0)

    return texts, model_data

# Пункт 8
Выполнить векторизацию тестовой выборки с использованием метода, реализованного на предыдущем шаге. Результаты сохранить в формате tsv.

In [None]:
# Vectorize the test subset with rare tokens removed.
texts, test_vecs = vectorize_texts("test", clear_rare_tokens=True)

In [None]:
# Shape of the matrix of test document vectors.
test_vecs.shape

In [None]:
# Vectorize the training subset with rare tokens removed.
train_texts_v2, train_vecs = vectorize_texts("train", clear_rare_tokens=True)

In [None]:
# Shape of the matrix of training document vectors.
train_vecs.shape

In [None]:
def write_data(texts, vectors, subset="test", output_dir='./assets/annotated-corpus'):
    """Write document embeddings to ``<output_dir>/<subset>-embeddings.tsv``.

    Each output line holds the document name followed by the components of its
    vector, separated by tabs.

    Args:
        texts: Mapping whose keys are the document names, in row order.
        vectors: Indexable collection of vectors (e.g. a 2-D NumPy array);
            ``vectors[i]`` belongs to the i-th key of ``texts`` and must
            provide ``tolist()``.
        subset: Name used as the file-name prefix.
        output_dir: Directory for the output file; it is created if missing.
            NOTE(review): read_data reads from '../assets/annotated-corpus/'
            while this default is './assets/annotated-corpus' — confirm which
            location is intended.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'{subset}-embeddings.tsv')
    with open(output_path, 'w', newline='', encoding='utf-8') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        for i, doc_name in enumerate(texts.keys()):
            writer.writerow([doc_name] + vectors[i].tolist())

In [None]:
# Save the test embeddings.
write_data(texts, test_vecs, subset="test")

In [None]:
# Save the training embeddings.
write_data(train_texts_v2, train_vecs, subset="train")