In [None]:
import gc
import math
import os
import re
from statistics import mean

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

from tqdm.notebook import tqdm

from chroma_db import ChromaDB
from utils import split_into_sentences

In [None]:
# Configure PyTorch's CUDA caching allocator through its environment variable.
# This has to be in place before the first CUDA allocation happens.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"})

In [None]:
# Fetch the NLTK stop-word corpus (network/disk I/O; NLTK skips the download when it is already up to date).
nltk.download('stopwords')

In [None]:
# Load the tweets dataset and discard fully duplicated rows, keeping the first
# occurrence. The original row labels are preserved (no index reset).
df = pd.read_csv("../assets/cyberbullying_tweets.csv").drop_duplicates(keep='first')
df.head()

In [None]:
# Split every tweet into sentences with the project helper; later cells iterate
# over each value of this column, so it is expected to hold a sequence per tweet.
tweet_texts = df["tweet_text"]
df["sentences"] = tweet_texts.apply(split_into_sentences)

In [None]:
# Hold out 15% of the tweets for evaluation; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
)

## Var 1. BGE large model using HuggingFace Transformers

In [None]:
# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the BGE tokenizer and encoder from the HuggingFace hub and place the
# encoder on the selected device.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')
model = model.to(device)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

In [None]:
# Var 1: embed every sentence of `data` with the HuggingFace BGE encoder.
#
# Produces:
#   sentences_metadata  - dict: running sentence counter -> provenance of that sentence
#   cyberbullying_emb   - one embedding per class label (row order = cyberbullying_types)
#   sentence_embeddings - CPU float32 tensor with one row per sentence, in counter order
data = df_test
# When True, the embedding of the tweet's class label is ADDED to every
# sentence embedding of that tweet (element-wise sum, not a concatenation).
concat_metadata = False
# Number of tweets (not sentences) sent through the model per forward pass.
batch_size = 512

sentences_metadata = dict()

cyberbullying_types = sorted(data["cyberbullying_type"].unique().tolist())
cyberbullying_tokens = tokenizer(cyberbullying_types, padding=True, truncation=True, return_tensors='pt').to(device)
with torch.no_grad():
    # First token ([CLS]) of the last hidden state is used as the embedding.
    cyberbullying_emb = model(**cyberbullying_tokens)[0][:, 0]

# Per-batch results are collected here and stacked once at the end; stacking
# inside the loop would re-copy all previously computed rows on every batch.
embedding_chunks = []
cnt = 0
with torch.no_grad():
    for cyberbullying_type in tqdm(cyberbullying_types):
        subset = data[data["cyberbullying_type"] == cyberbullying_type]
        type_emb = cyberbullying_emb[cyberbullying_types.index(cyberbullying_type)]
        N = subset.shape[0]
        iterations = math.ceil(N / batch_size)

        for i in tqdm(range(iterations)):
            # Explicit positional slice of the current batch of tweets.
            batch = subset["sentences"].iloc[batch_size * i: batch_size * (i + 1)]

            sentences = list()
            for index, text in batch.items():
                for j, sentence in enumerate(text):
                    sentences.append(sentence)
                    sentences_metadata[cnt] = {
                        "text_index": index,
                        "sentence_index": j,
                        "cyberbullying_type": cyberbullying_type
                    }
                    cnt += 1

            # A batch whose tweets produced no sentences has nothing to encode.
            if not sentences:
                continue

            tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)
            model_output = model(**tokens)

            embeddings = model_output[0][:, 0]

            if concat_metadata:
                embeddings += type_emb

            # `embeddings` is a view into the full hidden-state tensor; clone so the
            # stored chunk does not keep that whole tensor alive (matters on CPU,
            # where `.to("cpu")` returns the same tensor).
            embedding_chunks.append(embeddings.to("cpu").detach().clone())

            # Drop the references to the large intermediates before the next batch.
            del model_output
            del tokens

# Leading empty float32 block keeps the original dtype/shape even when no batch was encoded.
sentence_embeddings = torch.vstack([torch.empty((0, 1024), dtype=torch.float32), *embedding_chunks])

In [None]:
# Release the HuggingFace model and the GPU tensors of Var 1 before the
# Sentence-Transformers model is loaded. `gc` is imported in the top import cell.

# Move the weights off the GPU explicitly: other references to the module may
# still exist (e.g. the notebook's output cache, if `model.eval()` was displayed),
# in which case `del model` alone would not free the device memory.
model.cpu()

# Keep the label embeddings available, but no longer on the GPU.
cyberbullying_emb = cyberbullying_emb.cpu()

# Integer-indexing a BatchEncoding does not return a tensor, and `Tensor.to`
# is not in-place, so there is nothing useful to "move" here: dropping the
# references is what lets the memory be reclaimed.
del model
del cyberbullying_tokens
del embeddings

gc.collect()
torch.cuda.empty_cache()

In [None]:
# Scale every sentence embedding (row) to unit L2 length.
sentence_embeddings_norm = torch.nn.functional.normalize(
    input=sentence_embeddings,
    p=2,
    dim=1,
)

## Var 2. BGE large model using Sentence-Transformers

In [None]:
# Load the same BGE checkpoint through Sentence-Transformers.
# NOTE: this rebinds the name `model`.
model = SentenceTransformer(model_name_or_path='BAAI/bge-large-en-v1.5')

In [None]:
# Var 2: embed every sentence of `data` with the Sentence-Transformers model.
#
# Produces:
#   sentences_metadata_v2  - dict: running sentence counter -> provenance of that sentence
#   sentence_embeddings_v2 - float64 array with one L2-normalised row per sentence,
#                            in counter order (i.e. the row order of `data`)
data = df_test
concat_metadata = False  # not used by this variant
# Number of tweets (not sentences) handed to `model.encode` per call.
batch_size = 512

sentences_metadata_v2 = dict()

cyberbullying_types = sorted(data["cyberbullying_type"].unique().tolist())

# Per-batch results are collected here and stacked once at the end; stacking
# inside the loop would re-copy all previously computed rows on every batch.
embedding_chunks_v2 = []
cnt = 0

with torch.no_grad():
    N = data.shape[0]
    iterations = math.ceil(N / batch_size)
    for i in tqdm(range(iterations)):
        # Explicit positional slice of the current batch of tweets.
        batch = data.iloc[batch_size * i: batch_size * (i + 1)]

        sentences = list()
        for index, text, label in zip(batch.index, batch["sentences"], batch["cyberbullying_type"]):
            for j, sentence in enumerate(text):
                sentences.append(sentence)
                sentences_metadata_v2[cnt] = {
                    "text_index": index,
                    "sentence_index": j,
                    "cyberbullying_type": label
                }
                cnt += 1

        # A batch whose tweets produced no sentences has nothing to encode
        # (and an empty result could not be stacked with the 1024-wide rows).
        if not sentences:
            continue

        embedding_chunks_v2.append(model.encode(sentences, normalize_embeddings=True))

# Leading empty float64 block keeps the original dtype/shape even when no batch was encoded.
sentence_embeddings_v2 = np.vstack([np.empty((0, 1024), dtype=float), *embedding_chunks_v2])

## Создание векторной базы данных (ВБД)

In [None]:
# Instantiate the project-local ChromaDB wrapper (imported from chroma_db) with its defaults.
client_ch = ChromaDB()

In [None]:
# Obtain the "tweets_collection" collection from the wrapper; the second argument
# ("cosine") presumably selects the distance metric — TODO confirm in chroma_db.
# NOTE(review): this handle is not referenced again in the cells visible here.
collection_chroma_cosine = client_ch.get_collection("tweets_collection", "cosine")

## Create and fill collection

In [None]:
# Flatten the tweets into one list of sentences, in the row order of `data`
# (the same order in which the embedding loop numbered the sentences).
texts = [sentence for text in data["sentences"].tolist() for sentence in text]

# Derive metadata and ids from ONE key order so that position i of every list
# refers to the same sentence, independent of the dict's insertion order.
ordered_sentence_keys = sorted(sentences_metadata_v2.keys())

metadatas = [
    {
        "cyberbullying_type": sentences_metadata_v2[key]["cyberbullying_type"],
        "text_index": sentences_metadata_v2[key]["text_index"],
        "sentence_index": sentences_metadata_v2[key]["sentence_index"]
    } for key in ordered_sentence_keys
]

ids = [f"id{key}" for key in ordered_sentence_keys]

# Guard against `data` and the metadata coming from different runs/splits.
assert len(texts) == len(metadatas) == len(ids), (
    f"misaligned collection inputs: {len(texts)} texts, "
    f"{len(metadatas)} metadatas, {len(ids)} ids"
)

In [None]:
# Obtain the "tweets_collection_v2" collection that will hold the Var 2 embeddings;
# the second argument ("cosine") presumably selects the distance metric — TODO confirm in chroma_db.
collection_chroma_v2_cosine = client_ch.get_collection("tweets_collection_v2", "cosine")

In [None]:
# Store the Var 2 embeddings together with their sentences, metadata and ids.
# The four lists must be aligned position by position.
# NOTE(review): the positional order (embeddings, documents, metadatas, ids) is defined by
# the project wrapper, and re-running this cell submits the same ids again — confirm how
# the wrapper handles both.
collection_chroma_v2_cosine.add(sentence_embeddings_v2, texts, metadatas, ids)

In [None]:
# Sanity check: show how many items the underlying collection object reports.
collection_chroma_v2_cosine.collection.count()

## Поиск схожих фрагментов

In [None]:
# Class labels of the dataset, spelled out in alphabetical order.
cyberbullying_types = [
    'age',
    'ethnicity',
    'gender',
    'not_cyberbullying',
    'other_cyberbullying',
    'religion',
]

In [None]:
# Search for documents that mention girls at school.
# NOTE(review): `query_texts` leaves the embedding of the query to the collection/wrapper;
# the stored vectors were produced by the BGE model, so confirm the wrapper embeds
# queries with the same model.

res1 = collection_chroma_v2_cosine.query(
    query_texts=["Girl in school"],
    n_results=10,
)

In [None]:
# Same search as above, but with the query embedded explicitly by the
# Sentence-Transformers model instead of passing raw text to the collection.
girl_in_school_embedding = model.encode("Girl in school", normalize_embeddings=True)
res1_2 = collection_chroma_v2_cosine.query(
    query_embeddings=[girl_in_school_embedding.tolist()],
    n_results=10,
)

In [None]:
# Show the ids returned for the first (and only) query, best match first.
res1_2["ids"][0]

In [None]:
# Search for ethnicity-related cyberbullying aimed at Black people:
# restricted by metadata to the "ethnicity" class and by document text to
# sentences containing " black" (note the leading space in the filter string).

res2 = collection_chroma_v2_cosine.query(
    query_texts=["Black guy"],
    n_results=10,
    where={"cyberbullying_type": "ethnicity"},
    where_document={"$contains":" black"}
)

In [None]:
# Search for documents containing offensive words: three separate queries,
# five results requested for each.

res3 = collection_chroma_v2_cosine.query(
    query_texts=["fuck", "hate", "dumb"],
    n_results=5
)

In [None]:
# Documents that are semantically close to the offensive queries but do NOT
# literally contain those words.
#
# `$not_contains` tests for ONE literal substring, so a comma-separated value such
# as "FUCK,fuck,Fuck" would only exclude documents containing that exact text.
# Each spelling variant therefore gets its own clause, combined with `$and`.
# NOTE(review): assumes the wrapper forwards `where_document` to Chroma unchanged.
offensive_words = ["fuck", "hate", "dumb"]
offensive_variants = [
    variant
    for word in offensive_words
    for variant in (word, word.upper(), word.capitalize())
]

res4 = collection_chroma_v2_cosine.query(
    query_texts=offensive_words,
    n_results=5,
    where_document={
        "$and": [{"$not_contains": variant} for variant in offensive_variants]
    }
)

In [None]:
# Retrieval evaluation set. Each row names the sentence that answers a question:
# (row label of the tweet in the DataFrame, sentence position inside the tweet, question).
_question_rows = [
    (17920, 0, "What the OLF Qarro group is doing?"),
    (19137, 0, "Who does Blumenthal love?"),
    (5755, 0, "Who has several chlorine production plants?"),
    (17311, 2, "Who strongly believes homosexuality is a sin?"),
    (19035, 0, "Which country sells their sons to terrorists?"),
]

# Final shape used by the evaluation loop: ((text_index, sentence_index), question).
questions_array = [
    ((text_index, sentence_index), question)
    for text_index, sentence_index, question in _question_rows
]

In [None]:
# Do not truncate long text columns when rendering DataFrames.
pd.set_option('display.max_colwidth', None)
# Show the test-set row labelled 19035 (empty frame if it is not in the test split).
df_test.loc[df_test.index == 19035]

In [None]:
# Show the test-set row labelled 17920 (empty frame if it is not in the test split).
df_test.loc[df_test.index == 17920]

In [None]:
# Retrieval check: for every question, find the 0-based rank at which the
# sentence that answers it appears among the nearest neighbours.
#
# Produces `search_results`: one rank per entry of `questions_array`, same order.
instruction = "Represent this sentence for searching relevant passages:"
collection = collection_chroma_v2_cosine
# Passed as the first positional argument of the wrapper's `query`
# (presumably the number of results to return — TODO confirm in chroma_db).
top_k = 50

# Build the (text_index, sentence_index) -> sentence counter lookup once instead
# of scanning the whole metadata dict for every question. `setdefault` keeps the
# first counter if a position were ever to occur twice.
sentence_key_by_position = {}
for sentence_key, meta in sentences_metadata_v2.items():
    sentence_key_by_position.setdefault((meta["text_index"], meta["sentence_index"]), sentence_key)

search_results = []
for q in questions_array:
    position, question = q
    if position not in sentence_key_by_position:
        raise KeyError(
            f"sentence {position} (text_index, sentence_index) is not part of the embedded data"
        )
    target_id = f"id{sentence_key_by_position[position]}"

    query = model.encode(instruction + " " + question, normalize_embeddings=True)
    results = collection.query(top_k, query_embeddings=[query.tolist()])

    retrieved_ids = results["ids"][0]
    if target_id not in retrieved_ids:
        raise ValueError(
            f"{target_id} (answer to {question!r}) is not among the {len(retrieved_ids)} retrieved results"
        )
    search_results.append(retrieved_ids.index(target_id))

In [None]:
# Display the 0-based rank of the expected sentence for each question (0 = top hit).
search_results

In [None]:
# Average 0-based rank of the expected sentence over all questions (lower is better).
# `mean` is imported from `statistics` in the top import cell.
mean(search_results)