In [None]:
import pandas as pd
import os
import re
import json
from math import log

def generate_ngrams(text, n):
    """Split ``text`` into lowercase word tokens and return its word n-grams.

    Tokens are the matches of ``\\b\\w+\\b`` on the lowercased text, so
    punctuation is dropped. Each n-gram is ``n`` consecutive tokens joined by
    a single space. A text with fewer than ``n`` tokens yields an empty list.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    ngrams = []
    # The last valid window starts at len(tokens) - n.
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        ngrams.append(' '.join(window))
    return ngrams

def compute_idf(protocol_collocations, total_docs):
    """Compute the inverse document frequency (IDF) of every collocation.

    Args:
        protocol_collocations: iterable with one entry per document; each
            entry is an iterable of that document's collocations (repeats
            within a document are allowed).
        total_docs: total number of documents, used as the IDF numerator.

    Returns:
        dict mapping each collocation to ``log(total_docs / doc_count)``,
        where ``doc_count`` is the number of documents that contain it.
    """
    doc_counts = {}

    # Count each collocation at most once per document. Iterating the raw
    # list would add every repeat, turning the document frequency into a
    # corpus-wide occurrence count, which understates IDF and can even make
    # it negative. dict.fromkeys de-duplicates while keeping first-seen order.
    for doc_collocations in protocol_collocations:
        for collocation in dict.fromkeys(doc_collocations):
            doc_counts[collocation] = doc_counts.get(collocation, 0) + 1

    # compute IDF for each collocation
    return {
        collocation: log(total_docs / doc_count)
        for collocation, doc_count in doc_counts.items()
    }


def compute_tf(doc_collocations):
    """Return the term frequency of each collocation within one document.

    The frequency of a collocation is its number of occurrences in
    ``doc_collocations`` divided by the length of ``doc_collocations``.
    An empty input produces an empty dict.
    """
    # Tally how many times each collocation occurs.
    occurrences = {}
    for item in doc_collocations:
        if item in occurrences:
            occurrences[item] += 1
        else:
            occurrences[item] = 1

    # Normalise every tally by the document length.
    n_items = len(doc_collocations)
    frequencies = {}
    for item, hits in occurrences.items():
        frequencies[item] = hits / n_items
    return frequencies

def compute_tfidf(tf_scores, idf_scores):
    """Multiply each collocation's TF by its IDF.

    Collocations that have no entry in ``idf_scores`` are given an IDF of 0,
    so their TF-IDF is 0. The result has the same keys as ``tf_scores``.
    """
    return {
        collocation: frequency * idf_scores.get(collocation, 0)
        for collocation, frequency in tf_scores.items()
    }

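# Input:
#   corpus_df: a dataframe containing the corpus' data
#   k: number of top collocations to return
#   n: length (in words) of each collocation
#   t: minimum number of occurrences a collocation must have to be kept
#   type: ranking method, either "frequency" or "tfidf"
# Output:
#   list of (collocation, grade) pairs from the corpus, highest grade first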
def get_k_n_t_collocations(corpus_df, k, n, t, type):
    """Return the k best-scoring n-gram collocations of the corpus.

    Side effect: a 'collocations' column holding each sentence's n-grams is
    added to (or overwritten on) ``corpus_df``.

    Args:
        corpus_df: DataFrame with a 'sentence_text' column and, for
            ``type="tfidf"``, a 'protocol_name' column identifying the
            document each sentence belongs to.
        k: maximum number of collocations to return.
        n: number of words per collocation.
        t: minimum occurrence count a collocation needs in order to be kept
            (applied to the count in both modes, not to the TF-IDF score).
        type: "frequency" to score by raw occurrence count, or "tfidf" to
            score by the sum of the collocation's per-protocol TF-IDF values.

    Returns:
        list of ``(collocation, score)`` tuples sorted by descending score,
        at most ``k`` long.

    Raises:
        ValueError: if ``type`` is neither "frequency" nor "tfidf".
    """
    # Validate the mode before doing any work; an unknown value used to fall
    # through both branches and crash later on an unbound local variable.
    if type not in ("frequency", "tfidf"):
        raise ValueError(f"type must be 'frequency' or 'tfidf', got {type!r}")

    # produce all collocations of length n
    corpus_df['collocations'] = corpus_df['sentence_text'].apply(lambda x: generate_ngrams(x, n))

    if type == "frequency":
        # place all collocations in a dictionary of structure <Collocation>: <Count>
        collocation_counts = {}
        for coll_list in corpus_df['collocations']:
            for coll in coll_list:
                collocation_counts[coll] = collocation_counts.get(coll, 0) + 1

        # only include collocations that appear at least <t> times
        filtered_collocations = {coll: count for coll, count in collocation_counts.items() if count >= t}
    else:
        # One flat collocation list per protocol (document), built once.
        # A nested comprehension is linear, unlike sum(list_of_lists, []),
        # which re-copies the accumulated list on every addition.
        protocol_collocations = [
            [coll for coll_list in sentence_collocations for coll in coll_list]
            for _, sentence_collocations in corpus_df.groupby('protocol_name')['collocations']
        ]

        # Use the number of documents actually grouped, so the IDF numerator
        # matches the documents whose collocations are counted (groupby leaves
        # out rows whose protocol_name is missing by default).
        total_docs = len(protocol_collocations)
        idf_scores = compute_idf(protocol_collocations, total_docs)

        tfidf_scores = {}
        collocation_counts = {}

        for doc_collocations in protocol_collocations:
            for coll in doc_collocations:
                collocation_counts[coll] = collocation_counts.get(coll, 0) + 1

            # TF-IDF of every collocation in this document, accumulated
            # across documents into one corpus-level score per collocation.
            tfidf = compute_tfidf(compute_tf(doc_collocations), idf_scores)
            for coll, score in tfidf.items():
                tfidf_scores[coll] = tfidf_scores.get(coll, 0) + score

        # Only include collocations that occur at least <t> times overall
        filtered_collocations = {coll: score for coll, score in tfidf_scores.items() if collocation_counts.get(coll, 0) >= t}

    sorted_collocations = sorted(filtered_collocations.items(), key=lambda x: x[1], reverse=True)[:k]
    return sorted_collocations

In [None]:
corpus_path = 'knesset_corpus.jsonl'
output_file = 'top_collocations.txt'
k = 10  # top 10 collocations
n = 2   # bigrams
t = 5   # minimum of <t> counts for an n-gram
# Deliberately not named `type`: a notebook-global `type` would shadow the
# builtin type() for every cell executed afterwards.
ranking_type = 'tfidf'

# load corpus to df: the file holds one JSON object per line; blank lines
# are skipped because json.loads raises on an empty string
with open(corpus_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]
corpus_df = pd.DataFrame(data)

result = get_k_n_t_collocations(corpus_df, k, n, t, ranking_type)

# save the output into a file, one "<collocation>: <score>" line per result
with open(output_file, 'w', encoding='utf-8') as f:
    for collocation, score in result:
        f.write(f"{collocation}: {score}\n")

print("Complete")

In [None]:
import random
# Input: list of strings and a percentage x
# Output: list of strings after masking x% of the tokens
def mask_tokens_in_sentences(sentences, x):
    """Replace a random x% of the tokens of each sentence with "[*]".

    Args:
        sentences: iterable of sentence strings. Each sentence is split on
            whitespace into tokens.
        x: percentage of tokens to mask in each sentence; the number of
            masked tokens is rounded down.

    Returns:
        list with one masked sentence per input sentence, in input order,
        with tokens re-joined by single spaces.

    Raises:
        ValueError: raised by ``random.sample`` when ``x`` yields a negative
            token count or one larger than the sentence's token count.
    """
    masked_sentences = []

    for sentence in sentences:
        tokens = sentence.split()
        # Multiply before dividing: len(tokens) * (x / 100) loses a token to
        # float rounding for some inputs (100 tokens at 29% came out as 28).
        num_tokens_to_mask = int(len(tokens) * x / 100)
        # A set keeps the per-token membership test below constant-time.
        tokens_to_mask = set(random.sample(range(len(tokens)), num_tokens_to_mask))

        masked_tokens = ["[*]" if i in tokens_to_mask else token for i, token in enumerate(tokens)]
        masked_sentences.append(" ".join(masked_tokens))
    return masked_sentences

In [None]:
# Input: a dataframe, amount of entries to mask with [*], and a percentage x
# Output: the dataframe after applying the mask
def mask_corpus(corpus_df, amount_to_mask, x):
    """Mask x% of the tokens in a random selection of 'sentence_text' rows.

    The frame is modified in place and also returned.

    Args:
        corpus_df: DataFrame with a 'sentence_text' column of strings.
        amount_to_mask: number of rows to mask; capped at the number of rows.
        x: percentage of tokens to replace with "[*]" in each selected row.

    Returns:
        ``corpus_df``, with the selected rows' 'sentence_text' masked.
    """
    # Cannot mask more rows than the frame has.
    amount_to_mask = min(amount_to_mask, len(corpus_df))

    # These are row positions, not index labels, so they are used with
    # .iloc below; .loc would break on any frame without a default index.
    mask_positions = random.sample(range(len(corpus_df)), amount_to_mask)
    if not mask_positions:
        return corpus_df

    text_column = corpus_df.columns.get_loc('sentence_text')

    # mask_tokens_in_sentences expects a list of sentences. Handing it one
    # string at a time made it iterate over characters and return a list of
    # single characters instead of a masked sentence.
    selected_sentences = corpus_df.iloc[mask_positions, text_column].tolist()
    corpus_df.iloc[mask_positions, text_column] = mask_tokens_in_sentences(selected_sentences, x)
    return corpus_df