In [6]:
import json
import os
import re
from collections import Counter
from math import log

import pandas as pd

def generate_ngrams(text, n):
    """Return all runs of ``n`` consecutive words in ``text``.

    The text is lower-cased and split into word tokens with a regular
    expression; each n-gram is returned as a single space-joined string,
    in order of appearance. A text with fewer than ``n`` words yields an
    empty list.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    ngrams = []
    # Slide a window of n tokens across the token list.
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        ngrams.append(' '.join(window))
    return ngrams

def compute_tfidf(collocations, doc_collocations, total_docs, doc_freqs=None):
    """Score collocations for a single document with TF-IDF.

    Args:
        collocations: Iterable of collocation strings to score.
        doc_collocations: List of every collocation occurrence in the
            document (duplicates included).
        total_docs: Number of documents in the corpus.
        doc_freqs: Optional mapping of collocation to the number of
            documents that contain it. When given, the inverse document
            frequency is ``log(total_docs / doc_freqs[collocation])``.
            When omitted, the historical denominator is kept for backward
            compatibility: the number of entries of ``doc_collocations``
            that contain the collocation as a substring. That value is not
            a real document frequency, so pass ``doc_freqs`` for a correct
            IDF.

    Returns:
        A dict mapping each requested collocation to ``tf * idf``, where tf
        is the collocation's share of the document's occurrences.
        Collocations that do not occur in the document score ``0.0``.
    """
    # Count once instead of calling list.count() for every collocation.
    occurrences = Counter(doc_collocations)
    doc_size = len(doc_collocations)

    tfidf_scores = {}
    for collocation in collocations:
        count = occurrences.get(collocation, 0)
        if count == 0:
            # tf is 0, so the score is 0 whatever the idf is. Stopping here
            # also avoids dividing by an empty document or a zero frequency.
            tfidf_scores[collocation] = 0.0
            continue

        if doc_freqs is None:
            # Legacy denominator (see docstring); at least `count` here.
            containing = sum(collocation in entry for entry in doc_collocations)
        else:
            # The collocation occurs in this document, so at least one
            # document contains it even if the mapping has no entry for it.
            containing = max(doc_freqs.get(collocation, 0), 1)

        tfidf_scores[collocation] = (count / doc_size) * log(total_docs / containing)
    return tfidf_scores


def get_k_n_t_collocations(corpus_df, k, n, t, threshold, type):
    """Return the top-k collocations of length n found in the corpus.

    Args:
        corpus_df: DataFrame holding the corpus. The ``sentence_text``
            column is read in every mode and ``protocol_number`` is read in
            ``"tfidf"`` mode. A ``collocations`` column (the n-grams of
            each sentence) is added to the frame as a side effect.
        k: Number of top collocations to return.
        n: Length (in words) of the collocations.
        t: Minimum number of occurrences in the corpus for a collocation
            to be kept.
        threshold: Minimum summed tf-idf score for a collocation to be
            kept; only used when ``type`` is ``"tfidf"``.
        type: ``"frequency"`` to rank by occurrence count, or ``"tfidf"``
            to rank by the tf-idf score summed over all protocols, each
            protocol being treated as one document.

    Returns:
        A list of at most ``k`` ``(collocation, score)`` tuples sorted by
        descending score.

    Raises:
        ValueError: If ``type`` is neither ``"frequency"`` nor ``"tfidf"``.
    """
    if type not in ("frequency", "tfidf"):
        raise ValueError("Invalid type. Use 'frequency' or 'tfidf'.")

    # Produce all collocations of length n for every sentence.
    corpus_df['collocations'] = corpus_df['sentence_text'].apply(
        lambda text: generate_ngrams(text, n)
    )

    # <collocation>: <number of occurrences in the whole corpus>
    collocation_counts = Counter()
    for coll_list in corpus_df['collocations']:
        collocation_counts.update(coll_list)

    if type == "frequency":
        # Only keep collocations that appear at least <t> times.
        filtered_collocations = {
            coll: count for coll, count in collocation_counts.items() if count >= t
        }
    else:
        # First pass: gather each protocol's collocations and count in how
        # many protocols every collocation appears (document frequency).
        documents = []
        doc_freqs = Counter()
        for _, group in corpus_df.groupby('protocol_number'):
            doc_collocations = [
                coll for coll_list in group['collocations'] for coll in coll_list
            ]
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # which keeps tie-breaking in the final sort deterministic.
            distinct = list(dict.fromkeys(doc_collocations))
            doc_freqs.update(distinct)
            documents.append((doc_collocations, distinct))
        # Use the number of groups actually iterated so the IDF numerator
        # matches the documents that feed doc_freqs.
        total_docs = len(documents)

        # Second pass: sum each collocation's tf-idf over all protocols.
        # Only the collocations present in a protocol are scored for it;
        # every other collocation would contribute 0.
        tfidf_scores = dict.fromkeys(collocation_counts, 0.0)
        for doc_collocations, distinct in documents:
            doc_tfidf = compute_tfidf(distinct, doc_collocations, total_docs, doc_freqs)
            for coll, score in doc_tfidf.items():
                tfidf_scores[coll] += score

        # Keep collocations that appear at least <t> times and whose summed
        # tf-idf score reaches <threshold>.
        filtered_collocations = {
            coll: score
            for coll, score in tfidf_scores.items()
            if collocation_counts[coll] >= t and score >= threshold
        }

    # Return the top-k collocations by score.
    ranked = sorted(filtered_collocations.items(), key=lambda item: item[1], reverse=True)
    return ranked[:k]

In [None]:
# Run configuration.
corpus_path = 'knesset_corpus.jsonl'
output_file = 'top_collocations.txt'
k = 10  # top 10 collocations
n = 2   # bigrams
t = 5   # minimum of <t> counts for an n-gram
threshold = 0.1  # Minimum tf-idf score
# NOTE(review): this name shadows the builtin `type` for the rest of the
# session; it is kept because other cells may read it - consider renaming.
type = 'tfidf'

# Load the corpus (one JSON object per line) into a DataFrame. Blank lines,
# such as a trailing empty line, are skipped so json.loads does not raise.
with open(corpus_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]
corpus_df = pd.DataFrame(data)

result = get_k_n_t_collocations(corpus_df, k, n, t, threshold, type)

# Save the output, one "<collocation>: <score>" line per result.
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{collocation}: {score}\n" for collocation, score in result)

print("Complete")