In [None]:
import csv
import itertools
import math
import pathlib
import re
from functools import reduce

import nltk
import pandas as pd
import regex
from nltk.collocations import *
from nltk.corpus import stopwords, PlaintextCorpusReader
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import regexp_tokenize, sent_tokenize
from nltk.util import ngrams, pad_sequence

from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists,
# the Punkt sentence tokenizer models and the WordNet data (wordnet, omw-1.4)
# used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Load the raw tweets dataset and preview its first rows.
df = pd.read_csv("../assets/cyberbullying_tweets.csv")
df.head()

In [None]:
# Total number of whitespace-separated words over all tweets.
df["tweet_text"].apply(lambda n: len(n.split())).sum()

In [None]:
# Report, column by column, whether any value is missing.
df.isna().any()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows, keeping the first occurrence. The result is
# reassigned rather than mutated in place so the cell reads as a plain
# "new frame from old frame" step.
df = df.drop_duplicates(keep='first')

In [None]:
# Number of tweets per cyberbullying label.
df['cyberbullying_type'].value_counts()

## Sentence tokenizer

In [None]:
# Regular expressions for e-mails, phone numbers and URLs. They are used both
# to mask dots before sentence splitting and as token alternatives.
# All of them are free of capturing groups so they can be joined with "|"
# and passed to findall-style tokenizers.
simple_email_pattern = r"\S+@\S+\.\S+"
normal_email_pattern = r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"

# Optional leading "+", then 8 to 15 digits not starting with 0.
simple_phone_pattern = r"\+?[1-9][0-9]{7,14}"
normal_phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}"

normal_url_pattern = r"(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
normal_url_pattern_v2 = r"(?:https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?\/[a-zA-Z0-9]{2,}|(?:(?:https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?)|(?:https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}(?:\.[a-zA-Z0-9]{2,})?"
# https://ihateregex.io/expr/phone/
#normal_phone_pattern = r"[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}"

In [None]:
# Regex fragments used by split_into_sentences. Raw strings are used wherever
# the fragment contains a backslash so Python does not treat e.g. "\s" as an
# (invalid) string escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"
multiple_dots = r'\.{2,}'
# Deliberately not raw: the pattern is a literal newline character followed by "+".
newlines = "\n+"

def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences.

    The text is processed in three phases:

    1. Dots that do not end a sentence are masked with "<prd>": dots inside
       e-mails, phone numbers and URLs, after titles (Mr., Dr., ...), in
       website suffixes, decimals, "Ph.D.", single-letter initials, acronyms
       and company suffixes. Newlines are turned into "<stop>".
    2. Sentence-final punctuation (".", "?", "!", runs of them, "...") is
       followed by a "<stop>" marker.
    3. "<prd>" is turned back into ".", the text is split on "<stop>" and
       each piece is stripped.

    The masking phase has to run before any "<stop>" is inserted after a
    dot; otherwise the abbreviation rules never see "Mr. " or "3.14" intact
    and every dot becomes a sentence boundary.

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences
    :rtype: list[str]
    """
    text = " " + text + "  "
    text = re.sub(r"^\s+", " ", text)

    # Mask the dots of every e-mail / phone / URL match in place. A callback
    # touches exactly the matched span, unlike a global str.replace of the
    # matched substring.
    for pattern in (normal_email_pattern, normal_phone_pattern, normal_url_pattern):
        text = re.sub(pattern, lambda m: m.group().replace(".", "<prd>"), text)

    # Drop "," ":" ";" glued to terminal punctuation, keeping the punctuation.
    text = re.sub(r"(\,|\:|\;)*((\?|\!|\.)+)(\,|\:|\;)*", "\\2", text)

    # --- Phase 1: protect dots that are not sentence boundaries -------------
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)

    # Newlines are boundaries; convert them before the "\s"-based initial
    # rule below, which would otherwise replace a newline with a space.
    text = re.sub(newlines, "<stop>", text)

    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # Move terminal punctuation outside a closing quote so the quote stays
    # with the sentence it closes.
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")

    # --- Phase 2: mark sentence boundaries ----------------------------------
    text = re.sub(r"(\.+)[\?\!\.]*", "\\1<stop>", text)
    text = re.sub(r"((\?|\!)+)[\?\!\.]*", "\\1<stop>", text)
    text = re.sub(r"((\?+\!)|(\!+\?))[\?\!\.]*", "?!<stop>", text)
    # An ellipsis is kept as text (masked dots) and followed by a boundary.
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
    text = text.replace(".", ".<stop>")

    # --- Phase 3: restore dots, collapse repeated markers, split ------------
    text = text.replace("<prd>", ".")
    text = re.sub(r"(\s*<stop>\s*){2,}", "<stop>", text)

    sentences = [s.strip() for s in text.split("<stop>")]
    if sentences and not sentences[-1]: sentences = sentences[:-1]
    return sentences

In [None]:
def tokenize_sentences(text, method="custom") -> list[str]:
    """
    Split `text` into sentences with the selected backend.

    NOTE(review): a later cell defines another `tokenize_sentences(sentences)`
    with a different signature; once that cell runs, this definition is
    replaced.

    :param text: text to be split into sentences
    :param method: "nltk" to use nltk's sent_tokenize, "custom" to use
        split_into_sentences

    :return: list of sentences
    :raises ValueError: if `method` is neither "nltk" nor "custom"
    """
    if method == "nltk":
        return sent_tokenize(text)
    if method == "custom":
        return split_into_sentences(text)
    # Fail loudly instead of silently returning None for a mistyped method.
    raise ValueError(f"Unknown method {method!r}; expected 'nltk' or 'custom'")

In [None]:
# Split every tweet into a list of sentences with the custom splitter.
df["sentences"] = df["tweet_text"].apply(split_into_sentences)

In [None]:
# Number of sentences per tweet; lift the column-width limit so the preview
# shows the complete tweet text and sentence lists.
df["sentences_count"] = df["sentences"].apply(len)
pd.set_option('display.max_colwidth', None)
df.head(10)

## Tokenize text

In [None]:
# OUTDATED FUNCTION

def tokenize_text(text, remove_hashtags=True):
    """
    Clean a raw tweet and split it into whitespace-separated tokens.

    URLs, e-mails, mentions, simple emoticons and non-ASCII characters are
    removed; runs of sentence punctuation are reduced to their last character.

    :param text: raw tweet text
    :param remove_hashtags: if True, hashtags are removed entirely; if False,
        only the leading "#" is removed and the tag text is kept

    :return: list of tokens
    """
    # Replace newline and carriage return with space
    text = re.sub(r'\r|\n', ' ', text)
    # Remove urls
    text = re.sub(r'(?:https?\://|www\.)\S+', '', text)
    # Remove emails
    text = re.sub(normal_email_pattern, '', text)
    # Remove mentions
    text = re.sub(r'\@\S+', ' ', text)
    if remove_hashtags:
        # Remove hashtags
        text = re.sub(r'#\S+', ' ', text)
    else:
        # Remove the # symbol from hashtags. The tag text must be a capturing
        # group because the replacement refers to it as \1.
        text = re.sub(r'#([\w-]+)', r'\1', text).strip()

    # Remove simple emoticons
    text = re.sub(r'(:|;|=)(-|_)?(\)|\(|D|P)', '', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', '', text)
    # Remove numbers
    #     text = re.sub(r'\d+', '', text)
    # Replace multispaces with one
    text = re.sub(r"\s\s+", " ", text)
    # Delete repeated punctuation (keep only the last character of a run)
    text = re.sub(r'[\?\.\!]+(?=[\?\.\!])', '', text)

    # Tokenize text
    words = text.split()
    return words

In [None]:
def get_text_tokenize_pattern():
    """
    Build the regex used to tokenize a single sentence.

    The result is an alternation; alternatives are tried in the listed order:
    URLs, e-mails, phone numbers, mentions, hashtags, emoticons, numbers,
    words and finally sentence-ending punctuation anchored at the end.

    :return: the combined pattern as a string
    """
    alternatives = [
        normal_url_pattern,
        normal_email_pattern,
        normal_phone_pattern,
        r'@[\w\_\.]+',                                        # mentions
        r'#[^ !@#$%^&*(),.?":{}|<>]*',                        # hashtags
        r'(?:(?::|;|=)(?:-|_)?(?:\)|\(|D|P))|(?:[-*]_[-*])',  # emoticons
        r"[0-9]+\.?[0-9]*[\%\$]?",                            # numbers
        r"\w+(?:-\w+)*(?:\'(?:s|re|m|t))?",                   # words
        r'(?:\?|\.|\.\.\.|\!|\?\!|\!\?)$',                    # ending signs
    ]
    return r'|'.join(alternatives)

In [None]:
def tokenize_sentences(sentences):
    """
    Tokenize each sentence with the pattern from get_text_tokenize_pattern.

    Sentences that yield no tokens, or whose only token is a lone
    sentence-ending sign, are dropped.

    :param sentences: iterable of sentence strings
    :return: list of token lists, one per kept sentence
    """
    pattern = get_text_tokenize_pattern()
    lone_endings = ["?", "!", ".", "...", "?!", "!?"]

    kept = []
    for sentence in sentences:
        tokens = regexp_tokenize(sentence, pattern)
        if len(tokens) == 0:
            continue
        if len(tokens) == 1 and tokens[0] in lone_endings:
            continue
        kept.append(tokens)
    return kept
    

In [None]:
# Tokenize the sentences of every tweet (one token list per sentence).
df["tokenized"] = df["sentences"].apply(tokenize_sentences)

In [None]:
# Per tweet: the number of tokens in each of its sentences.
df["tokens_count"] = df["tokenized"].apply(lambda x: [len(i) for i in x])
df.head(10)

## Stem words

In [None]:
stemmer = SnowballStemmer("english")
def stem_text(text):
    """
    Stem every token of a tokenized document.

    :param text: list of sentences, each a list of tokens
    :return: list of sentences with each token replaced by its Snowball stem
    """
    stemmed_sentences = []
    for sentence in text:
        stemmed_sentences.append(list(map(stemmer.stem, sentence)))
    return stemmed_sentences

In [None]:
# Stemmed version of every tokenized tweet.
df["stem"] = df["tokenized"].apply(stem_text)

In [None]:
# Preview the frame with the new "stem" column.
df.head()

## Lemmatize words

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """
    Lemmatize every token of a tokenized document.

    :param text: list of sentences, each a list of tokens
    :return: list of sentences with each token replaced by its WordNet lemma
    """
    lemmatized_sentences = []
    for sentence in text:
        lemmas = []
        for word in sentence:
            lemmas.append(lemmatizer.lemmatize(word))
        lemmatized_sentences.append(lemmas)
    return lemmatized_sentences

In [None]:
# Lemmatized version of every tokenized tweet.
df["lemmatized"] = df["tokenized"].apply(lemmatize_text)

In [None]:
# Preview the frame with the new "lemmatized" column.
df.head()

## Save results

In [None]:
# Create one output directory per (subset, label) pair; existing directories
# are left untouched (exist_ok=True).
for cyberbullying_type in df["cyberbullying_type"].unique():
    for subset in ["train", "test"]:
        pathlib.Path(f'../assets/annotated-corpus/{subset}/{cyberbullying_type}').mkdir(parents=True, exist_ok=True)

In [None]:
# 75/25 train/test split of the token, stem and lemma columns against the
# label; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['tokenized', 'stem', 'lemmatized']],
    df["cyberbullying_type"],
    train_size=0.75,
    random_state=42
)

In [None]:
def write_data(X: pd.DataFrame, y: pd.Series, subset):
    """
    Write one TSV file per document to
    ../assets/annotated-corpus/<subset>/<label>/<NNN>.tsv.

    Each line holds "token<TAB>stem<TAB>lemma"; an empty line is written after
    every sentence. Files are numbered by the position of the row in `X`.
    The target directories must already exist.

    :param X: frame whose three columns are, in order, the tokenized, stemmed
        and lemmatized sentences of each document
    :param y: label of each document, aligned with the rows of `X`
    :param subset: name of the subset directory (e.g. "train" or "test")
    """
    for i, ((_, (tokens_s, stems_s, lemms_s)), cyberbullying_type) in enumerate(zip(X.iterrows(), y)):
        path = f'../assets/annotated-corpus/{subset}/{cyberbullying_type}/{i:03d}.tsv'
        # Explicit UTF-8: tweets contain non-ASCII characters and the platform
        # default encoding may not be able to represent them.
        with open(path, 'w', newline='', encoding='utf-8') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
            for tokens, stems, lemms in zip(tokens_s, stems_s, lemms_s):
                for token, stem, lemm in zip(tokens, stems, lemms):
                    writer.writerow([token, stem, lemm])
                # Blank line marks the end of a sentence.
                writer.writerow([])

In [None]:
# Export the test subset.
write_data(X_test, y_test, "test")

In [None]:
# Export the train subset.
write_data(X_train, y_train, "train")

# Lab 2

In [None]:
import nltk
from nltk.collocations import *
from nltk.corpus import PlaintextCorpusReader

In [None]:
# English stop words as a set, used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

## Пункты 1-4

In [None]:
def get_text_tokenize_pattern_v2():
    """
    Build the sentence tokenization regex without the ending-punctuation
    alternative.

    Alternatives, in matching order: URLs, e-mails, phone numbers, mentions,
    hashtags, emoticons, numbers, words.

    :return: the combined pattern as a string
    """
    mention_re = r'@[\w\_\.]+'
    hashtag_re = r'#[^ !@#$%^&*(),.?":{}|<>]*'
    emoticon_re = r'(?:(?::|;|=)(?:-|_)?(?:\)|\(|D|P))|(?:[-*]_[-*])'
    number_re = r"[0-9]+\.?[0-9]*[\%\$]?"
    word_re = r"\w+(?:-\w+)*(?:\'(?:s|re|m|t))?"

    ordered = (
        normal_url_pattern,
        normal_email_pattern,
        normal_phone_pattern,
        mention_re,
        hashtag_re,
        emoticon_re,
        number_re,
        word_re,
    )
    return r'|'.join(ordered)
    

In [None]:
def tokenize_sentences_v2(sentences):
    """
    Tokenize each sentence and drop English stop words.

    A token is kept (in its original form) when its stripped, lower-cased
    form is non-empty and not in `stop_words`. Sentences left without any
    token are dropped.

    :param sentences: iterable of sentence strings
    :return: list of token lists, one per kept sentence
    """
    pattern = get_text_tokenize_pattern_v2()

    def is_content_token(token):
        normalized = token.strip().lower()
        return len(normalized) > 0 and normalized not in stop_words

    filtered_sentences = (
        [token for token in regexp_tokenize(sentence, pattern) if is_content_token(token)]
        for sentence in sentences
    )
    return [tokens for tokens in filtered_sentences if len(tokens) > 0]

In [None]:
# Independent copy of the text, label and sentence columns for Lab 2.
df_test = df[["tweet_text", "cyberbullying_type", "sentences"]].copy()

In [None]:
# Tokenize with stop words removed.
df_test["tokenized"] = df_test["sentences"].apply(tokenize_sentences_v2)

In [None]:
# Per tweet: the number of kept tokens in each sentence.
df_test["tokens_count"] = df_test["tokenized"].apply(lambda x: [len(i) for i in x])
df_test.head()

In [None]:
# Lemmatize the stop-word-filtered tokens.
df_test["lemmatized"] = df_test["tokenized"].apply(lemmatize_text)

In [None]:
# Preview the frame with the new "lemmatized" column.
df_test.head()

In [None]:
# Flatten the per-tweet sentence lists into one corpus-wide list of
# lemmatized sentences (each sentence is a list of tokens).
sentences_test = [sentence for row in df_test["lemmatized"].to_list() for sentence in row]

## Пункт 5

In [None]:
from collections import Counter

# Every contiguous 3-token window of every sentence (no padding, so
# sentences shorter than three tokens contribute nothing).
trigrams = [
    tuple(sentence[start:start + 3])
    for sentence in sentences_test
    for start in range(len(sentence) - 2)
]

In [None]:
# First ten trigrams, in corpus order.
trigrams[:10]

In [None]:
# Frequency of each distinct trigram.
trigrams_counter = Counter(trigrams)

In [None]:
# Order of the n-grams counted in this cell (trigrams, like the rest of this
# section).
ngram_size = 3
# One lazy n-gram generator per sentence; they are consumed by the Counter below.
ngram_tokens = [ngrams(sent, ngram_size) for sent in sentences_test]
ngram_counts = Counter([gram for tokens in ngram_tokens for gram in tokens])

In [None]:
def create_ngrams(sentences, n=3):
    """
    Count the contiguous n-grams of the given sentences.

    N-grams never cross sentence boundaries; a sentence shorter than `n`
    contributes nothing.

    :param sentences: iterable of token sequences
    :param n: n-gram order

    :return: Counter mapping each n-gram (a tuple of `n` tokens) to its count
    """
    counts = Counter()

    # Iterate over the argument, not over a notebook-level variable, so the
    # function counts whatever the caller passes in.
    for sentence in sentences:
        for start in range(len(sentence) - n + 1):
            counts[tuple(sentence[start:start + n])] += 1
    return counts

In [None]:
# Trigram counts and corpus-wide unigram counts. The unigram counts are the
# marginals of the MI score, so they are taken from the sentences themselves
# rather than from the set of distinct trigrams.
trigrams_cnt = create_ngrams(sentences_test, 3)
unigrams_cnt = create_ngrams(sentences_test, 1)

In [None]:
# Ten most frequent unigrams (keys are 1-tuples).
unigrams_cnt.most_common(10)

In [None]:
# Same ranking computed directly from the sentences, for comparison.
create_ngrams(sentences_test, 1).most_common(10)

In [None]:
# Number of distinct trigrams (computed two ways) next to the total number of
# trigram occurrences.
len(list(trigrams_cnt.keys())), len(trigrams_cnt), len(trigrams)

In [None]:
# Number of right-padded trigram windows when the whole corpus is flattened
# into a single token stream.
len(list(ngrams([word for sentence in sentences_test for word in sentence], 3, pad_right=True, right_pad_symbol=None)))

In [None]:
def prepare_ngrams_for_trigrams(trigrams):
    """
    Build the four FreqDist tables a trigram collocation finder needs.

    Each window is a tuple whose trailing positions may be None (padding).
    Windows whose first element is None are skipped. For every pair
    (w2, w3) taken from the rest of the window, the first word is counted;
    the bigram (w1, w2) is counted when w2 is not None; the wildcard pair
    (w1, w3) and the trigram are counted when w3 is not None either.

    :param trigrams: iterable of windows (tuples)
    :return: (word_fd, bigram_fd, wildcard_fd, ngram_fd)
    """
    word_fd, wildcard_fd, bigram_fd, ngram_fd = FreqDist(), FreqDist(), FreqDist(), FreqDist()
    for window in trigrams:
        first = window[0]
        if first is None:
            continue
        for second, third in itertools.combinations(window[1:], 2):
            word_fd[first] += 1
            if second is not None:
                bigram_fd[(first, second)] += 1
                if third is not None:
                    wildcard_fd[(first, third)] += 1
                    ngram_fd[(first, second, third)] += 1
    return word_fd, bigram_fd, wildcard_fd, ngram_fd

In [None]:
def prepare_ngrams_for_trigrams_v2(trigrams):
    """
    Counter-based variant of prepare_ngrams_for_trigrams.

    Windows starting with None are ignored. For each (w2, w3) pair from the
    remainder of a window the leading word is counted, then the bigram
    (w1, w2) if w2 is present, then the wildcard pair (w1, w3) and the full
    trigram if w3 is present as well.

    :param trigrams: iterable of windows (tuples, possibly None-padded)
    :return: (word_fd, bigram_fd, wildcard_fd, ngram_fd) as Counters
    """
    word_fd = Counter()
    bigram_fd = Counter()
    wildcard_fd = Counter()
    ngram_fd = Counter()

    usable_windows = (w for w in trigrams if w[0] is not None)
    for window in usable_windows:
        head = window[0]
        for mid, tail in itertools.combinations(window[1:], 2):
            word_fd[head] += 1
            if mid is None:
                continue
            bigram_fd[(head, mid)] += 1
            if tail is None:
                continue
            wildcard_fd[(head, tail)] += 1
            ngram_fd[(head, mid, tail)] += 1
    return word_fd, bigram_fd, wildcard_fd, ngram_fd

In [None]:
# Rebuild the list of unpadded per-sentence trigrams.
trigrams = [
    tuple(sentence[start:start + 3])
    for sentence in sentences_test
    for start in range(len(sentence) - 2)
]

In [None]:
# Right-padded trigram windows: one window per token, with None filling the
# positions that run past the end of the sentence.
strange_trigrams = list()

for sentence in sentences_test:
    padded = list(sentence) + [None, None]
    strange_trigrams.extend(
        tuple(padded[start:start + 3]) for start in range(len(sentence))
    )

In [None]:
# Frequency of each right-padded trigram window (one window per token, None
# standing in for positions past the end of the sentence).
strange_trigrams_cnt = Counter()

for sentence in sentences_test:
    padded = list(sentence) + [None, None]
    strange_trigrams_cnt.update(
        tuple(padded[start:start + 3]) for start in range(len(sentence))
    )

In [None]:
# FreqDist tables from the unpadded trigrams.
word_fd_v1, bigram_fd_v1, wildcard_fd_v1, ngram_fd_v1 = prepare_ngrams_for_trigrams(trigrams)

In [None]:
# Counter tables from the unpadded trigrams.
word_fd_v2, bigram_fd_v2, wildcard_fd_v2, ngram_fd_v2 = prepare_ngrams_for_trigrams_v2(trigrams)

In [None]:
# FreqDist tables from the right-padded windows.
word_fd_v1_s, bigram_fd_v1_s, wildcard_fd_v1_s, ngram_fd_v1_s = prepare_ngrams_for_trigrams(strange_trigrams)

In [None]:
# Counter tables from the right-padded windows.
word_fd_v2_s, bigram_fd_v2_s, wildcard_fd_v2_s, ngram_fd_v2_s = prepare_ngrams_for_trigrams_v2(strange_trigrams)

In [None]:
# Counter tables built from the distinct unpadded trigrams only, i.e. every
# trigram contributes exactly once regardless of its corpus frequency.
word_fd_v2_s2, bigram_fd_v2_s2, wildcard_fd_v2_s2, ngram_fd_v2_s2 = prepare_ngrams_for_trigrams_v2(list(Counter(trigrams).keys()))

In [None]:
# Spot-check one trigram count. Indexing the Counter directly avoids copying
# it into a dict and yields 0 (instead of a KeyError) if the trigram is absent.
ngram_fd_v2[('bullied', 'high', 'school')]

## Используя NLTK

In [None]:
# Trigram frequencies via nltk.util.ngrams; each trigram is stored as a
# single space-joined string.
trigram_counter_nltk = Counter(
    " ".join(ngram)
    for sentence in sentences_test
    for ngram in ngrams(sentence, 3)
)

In [None]:
# Twenty most frequent trigrams according to the NLTK-based counter.
trigram_counter_nltk.most_common(20)

In [None]:
# Twenty most frequent trigrams according to create_ngrams, for comparison.
trigrams_cnt.most_common(20)

In [None]:
import matplotlib.pyplot as plt

# Take the 30 most frequent trigrams.
top_30_trigrams = trigrams_cnt.most_common(30)

# Split into parallel label / count sequences. Dedicated names are used so the
# corpus-wide `trigrams` list built in an earlier cell is not overwritten by
# the plotting code.
top_trigram_labels = [" ".join(trigram) for trigram, _ in top_30_trigrams]
top_trigram_counts = [count for _, count in top_30_trigrams]

# Draw the horizontal bar chart.
plt.figure(figsize=(10, 8), dpi=180)
plt.barh(top_trigram_labels, top_trigram_counts, color='skyblue')
plt.xlabel('Частота')
plt.ylabel('Триграмма')
plt.title('Топ-30 самых популярных триграмм')
plt.gca().invert_yaxis()  # flip the Y axis so the most frequent trigram is on top
plt.show()

## Пункт 6

In [None]:
def mi_score(word, colocates, ngrams, unigrams, N, n=3, t=None):
    """
    Mutual-information score of the n-gram (word, *colocates):

        log2( count(ngram) * N**(n-1) / product(count(w) for w in ngram) )

    :param word: first word of the n-gram
    :param colocates: tuple with the remaining words
    :param ngrams: mapping from n-gram tuple to its count
    :param unigrams: mapping from 1-tuple (w,) to the count of w; only read
        when `t` is None
    :param N: corpus size used in the numerator
    :param n: n-gram order used for the exponent (must be > 1)
    :param t: optional token list; when given, word counts are taken with
        t.count(w) instead of `unigrams`

    :return: the score, or 0 when the n-gram count is 0
    """
    assert n > 1
    ngram_key = (word,) + colocates
    joint_count = ngrams[ngram_key]
    if joint_count == 0:
        return 0

    if t is None:
        member_counts = [unigrams[(member,)] for member in ngram_key]
    else:
        member_counts = [t.count(member) for member in ngram_key]

    m = 1
    for member_count in member_counts:
        m *= member_count
    return math.log2(joint_count * (N ** (n - 1)) / m)

In [None]:
# MI score of the 100 most frequent trigrams, with word counts taken directly
# from the flat token list.
mi_score_dict = dict()
words = [word for sentence in sentences_test for word in sentence]
N = len(words)
for trigram in sorted(trigrams_counter.items(), key=lambda x:x[1], reverse=True)[:100]:
    # With t=words the unigram table is not consulted, so None is passed for it.
    score = mi_score(trigram[0][0], trigram[0][1:], trigrams_counter, None, N, t=words)
    mi_score_dict[trigram[0]] = score

In [None]:
# Ten trigrams with the highest MI score (keys only: turning the sorted dict
# into a list keeps just its keys).
list({k: v for k, v in sorted(mi_score_dict.items(), key=lambda item: item[1], reverse=True)})[:10]

In [None]:
# Same top-10 listing as the previous cell.
list({k: v for k, v in sorted(mi_score_dict.items(), key=lambda item: item[1], reverse=True)})[:10]

In [None]:
# Trigram counts and corpus-wide unigram counts (the MI marginals), both taken
# from the sentences themselves.
trigrams_cnt = create_ngrams(sentences_test, 3)
unigrams_cnt = create_ngrams(sentences_test, 1)

In [None]:
# MI score of every trigram, inserted in order of decreasing frequency, with
# word counts read from the unigram table.
mi_score_dict = dict()
words = [word for sentence in sentences_test for word in sentence]
N = len(words)
for trigram_key, _ in trigrams_cnt.most_common():
    mi_score_dict[trigram_key] = mi_score(
        trigram_key[0], trigram_key[1:], trigrams_cnt, unigrams_cnt, N
    )

In [None]:
# Ten trigrams with the highest MI score (keys only).
list({k: v for k, v in sorted(mi_score_dict.items(), key=lambda item: item[1], reverse=True)})[:10]

In [None]:
def mi_score_v2(word, colocates, ngrams, unigrams, N, n=3, t=None):
    """
    Mutual-information score of the n-gram (word, *colocates), for a unigram
    table keyed by the bare word (not by a 1-tuple):

        log2( count(ngram) * N**(n-1) / product(count(w) for w in ngram) )

    :param word: first word of the n-gram
    :param colocates: tuple with the remaining words
    :param ngrams: mapping from n-gram tuple to its count
    :param unigrams: mapping from word to its count; only read when `t` is None
    :param N: corpus size used in the numerator
    :param n: n-gram order used for the exponent (must be > 1)
    :param t: optional token list; when given, word counts come from t.count(w)

    :return: the score, or 0 when the n-gram count is 0
    """
    assert n > 1
    members = (word,) + colocates
    if ngrams[members] == 0:
        return 0

    count_of = t.count if t is not None else (lambda member: unigrams[member])
    m = reduce(lambda acc, member: acc * count_of(member), members, 1)
    return math.log2(ngrams[members] * (N ** (n - 1)) / m)

In [None]:
# MI score of every trigram in the Counter tables built from the right-padded
# windows; N is the total of the word counts in that table.
words = [word for sentence in sentences_test for word in sentence]
N = sum(word_fd_v2_s.values())

mi_score_dict = {
    trigram: mi_score_v2(trigram[0], trigram[1:], ngram_fd_v2_s, word_fd_v2_s, N)
    for trigram in ngram_fd_v2_s.keys()
}
    # print(score)

In [None]:
# All (trigram, score) pairs ranked by MI score, highest first.
# NOTE(review): this displays the entire ranking; consider slicing the result.
sorted(mi_score_dict.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Spot-check the score of one specific trigram (raises KeyError if that
# trigram is not in mi_score_dict).
mi_score_dict[('#10ThousandStepsAgain', '#KillinIt', 'https://t.co/gwoaRbVSUw')]

In [None]:
# Ten trigrams with the highest MI score (keys only).
list({k: v for k, v in sorted(mi_score_dict.items(), key=lambda item: item[1], reverse=True)})[:10]

In [None]:
def pmi_v2(*marginals):
    """
    Trigram PMI computed from positional marginals.

    Only three of the positional arguments are read: the first (joint trigram
    count), the second-to-last (iterable of the three unigram counts) and the
    last (total count). The result is

        log2(joint * total**2) - log2(product of the unigram counts)

    :return: the PMI value as a float
    """
    joint_count = marginals[0]
    unigram_counts = marginals[-2]
    total = marginals[-1]

    unigram_product = reduce(lambda acc, count: acc * count, unigram_counts)
    return math.log2(joint_count * total ** (3 - 1)) - math.log2(unigram_product)

In [None]:
# Score every trigram with pmi_v2, taking the marginal counts from the
# Counter tables built from the right-padded windows (the *_v2_s tables).
mi_score_dict_2 = dict()
n_all = sum(word_fd_v2_s.values())  # total of all word counts
for trigram in sorted(ngram_fd_v2_s.items(), key=lambda x:x[1], reverse=True):
    w1_, w2_, w3_ = trigram[0]
    n_iii = ngram_fd_v2_s[(w1_, w2_, w3_)]  # joint trigram count
    if not n_iii:
        continue
    # Pair counts; pmi_v2 receives them but only reads its first,
    # second-to-last and last arguments.
    n_iix = bigram_fd_v2_s[(w1_, w2_)]
    n_ixi = wildcard_fd_v2_s[(w1_, w3_)]
    n_xii = bigram_fd_v2_s[(w2_, w3_)]
    # Single-word counts.
    n_ixx = word_fd_v2_s[w1_]
    n_xix = word_fd_v2_s[w2_]
    n_xxi = word_fd_v2_s[w3_]
    score = pmi_v2(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all)
    
    mi_score_dict_2[trigram[0]] = score

In [None]:
# Twenty trigrams with the highest pmi_v2 score (keys only).
list({k: v for k, v in sorted(mi_score_dict_2.items(), key=lambda item: item[1], reverse=True)})[:20]

In [None]:
# Same scoring as the earlier pmi_v2 loop, but with the tables built from the
# distinct trigrams only (the *_v2_s2 tables). Overwrites mi_score_dict_2.
mi_score_dict_2 = dict()
n_all = sum(word_fd_v2_s2.values())  # total of all word counts
for trigram in sorted(ngram_fd_v2_s2.items(), key=lambda x:x[1], reverse=True):
    w1, w2, w3 = trigram[0]
    n_iii = ngram_fd_v2_s2[(w1, w2, w3)]  # joint trigram count
    if not n_iii:
        continue
    # Pair counts; pmi_v2 receives them but only reads its first,
    # second-to-last and last arguments.
    n_iix = bigram_fd_v2_s2[(w1, w2)]
    n_ixi = wildcard_fd_v2_s2[(w1, w3)]
    n_xii = bigram_fd_v2_s2[(w2, w3)]
    # Single-word counts.
    n_ixx = word_fd_v2_s2[w1]
    n_xix = word_fd_v2_s2[w2]
    n_xxi = word_fd_v2_s2[w3]
    score = pmi_v2(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all)
    
    mi_score_dict_2[trigram[0]] = score

In [None]:
# Twenty trigrams with the highest pmi_v2 score (keys only).
list({k: v for k, v in sorted(mi_score_dict_2.items(), key=lambda item: item[1], reverse=True)})[:20]

## Пункт 7

In [None]:
# Rank trigrams by PMI with NLTK, feeding the finder the FreqDist tables built
# from the right-padded windows, and print the 20 best.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder_thr_1 = TrigramCollocationFinder(word_fd_v1_s, bigram_fd_v1_s, wildcard_fd_v1_s, ngram_fd_v1_s)
print(finder_thr_1.nbest(trigram_measures.pmi, 20))

In [None]:
# Print the first 20 (trigram, PMI) pairs returned by score_ngrams.
scores_nltk = finder_thr_1.score_ngrams(trigram_measures.pmi)
for score_nltk in scores_nltk[:20]:
    print(score_nltk)

In [None]:
# Let NLTK build the finder itself from the flat token stream (window size 3)
# and print the 20 best trigrams by PMI.
# NOTE(review): the sentences are flattened into a single list here, so
# trigrams can span sentence boundaries, unlike the per-sentence windows used
# for finder_thr_1 — confirm this difference is intended.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

text = [item for sublist in sentences_test for item in sublist]
finder_thr = TrigramCollocationFinder.from_words(text, 3)

print(finder_thr.nbest(trigram_measures.pmi, 20))

In [None]:
# Display the four hand-built FreqDist tables (unpadded trigrams).
word_fd_v1, bigram_fd_v1, wildcard_fd_v1, ngram_fd_v1

In [None]:
# Trigram table held by the NLTK-built finder.
finder_thr.ngram_fd

In [None]:
# Bigram table held by the NLTK-built finder.
finder_thr.bigram_fd

In [None]:
# Wildcard (w1, _, w3) table held by the NLTK-built finder.
finder_thr.wildcard_fd

In [None]:
# Word table held by the NLTK-built finder.
finder_thr.word_fd

In [None]:
# Total number of n-gram occurrences recorded in ngram_counts.
total_ngrams = sum(ngram_counts.values())
total_ngrams