In [None]:
import pandas as pd
import os
import re
import json
from math import log

def generate_ngrams(text, n):
    """Return all n-grams of ``text`` as space-joined, lower-cased strings.

    The text is lower-cased and split into runs of word characters; every
    window of ``n`` consecutive tokens becomes one entry, in order of
    appearance. Texts with fewer than ``n`` tokens yield an empty list.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    window_count = len(tokens) - n + 1

    ngrams = []
    for start in range(window_count):
        window = tokens[start:start + n]
        ngrams.append(' '.join(window))
    return ngrams

def compute_idf(corpus_collocations, total_docs):
    """Compute a smoothed inverse document frequency for every collocation.

    Args:
        corpus_collocations: iterable of documents, each an iterable of
            collocations (repeats within a document are counted once).
        total_docs: document count used as the numerator of the IDF ratio.

    Returns:
        dict mapping each collocation to ``log(total_docs / (1 + df))``,
        where ``df`` is the number of documents containing it.
    """
    document_frequency = {}
    for document in corpus_collocations:
        # set() collapses repeats so a document contributes at most 1 per term.
        for term in set(document):
            if term in document_frequency:
                document_frequency[term] += 1
            else:
                document_frequency[term] = 1

    return {
        term: log(total_docs / (1 + frequency))
        for term, frequency in document_frequency.items()
    }


def compute_tf(doc_collocations, collocations):
    """Compute the term frequency of each requested collocation in one document.

    Args:
        doc_collocations: sequence of every collocation occurrence in the
            document (repeats included).
        collocations: iterable of collocations to score.

    Returns:
        dict mapping each entry of ``collocations`` to its occurrence count
        divided by ``len(doc_collocations)``. When the document is empty,
        every requested collocation maps to 0.0.
    """
    total_count = len(doc_collocations)
    if total_count == 0:
        # A document with no n-grams (e.g. only very short sentences) would
        # otherwise divide by zero; nothing occurs in it, so every TF is 0.
        return {coll: 0.0 for coll in collocations}

    # Count once up front instead of calling list.count() per collocation,
    # which rescans the whole document for every requested term.
    occurrences = {}
    for coll in doc_collocations:
        occurrences[coll] = occurrences.get(coll, 0) + 1

    return {coll: occurrences.get(coll, 0) / total_count for coll in collocations}

def compute_tfidf(tf_scores, idf_scores):
    """Multiply each term frequency by the matching IDF weight.

    Collocations present in ``tf_scores`` but missing from ``idf_scores``
    are treated as having an IDF of 0.
    """
    weighted = {}
    for term, frequency in tf_scores.items():
        weighted[term] = frequency * idf_scores.get(term, 0)
    return weighted

# def compute_tfidf(collocations, doc_collocations, total_docs):
#         tfidf_scores = {}
#         for collocation in collocations:
#             tf = doc_collocations.count(collocation) / len(doc_collocations)
#             idf = log(total_docs / 1 + sum(collocation in doc for doc in doc_collocations))
#             tfidf_scores[collocation] = tf * idf
#         return tfidf_scores

def get_k_n_t_collocations(corpus_df, k, n, t, threshold, type):
    """Return the top-k collocations (n-grams) of a corpus, ranked by count or TF-IDF.

    Args:
        corpus_df: DataFrame with a 'sentence_text' column; "tfidf" mode also
            reads 'protocol_number'. A 'collocations' column holding each
            row's n-grams is added to (or overwritten in) this frame.
        k: maximum number of collocations to return.
        n: length of each collocation, in words.
        t: minimum corpus-wide occurrence count; applied when
            ``type == "frequency"``.
        threshold: minimum summed TF-IDF score; applied when
            ``type == "tfidf"``.
        type: ranking method, either "frequency" or "tfidf".

    Returns:
        List of ``(collocation, score)`` tuples sorted by descending score,
        at most ``k`` long. The score is the occurrence count in "frequency"
        mode and the TF-IDF summed over all protocols in "tfidf" mode.

    Raises:
        ValueError: if ``type`` is neither "frequency" nor "tfidf".
    """
    # Fail fast, before the (potentially expensive) n-gram extraction.
    if type not in ("frequency", "tfidf"):
        raise ValueError("Invalid type. Use 'frequency' or 'tfidf'.")

    # Produce all collocations of length n for every sentence.
    corpus_df['collocations'] = corpus_df['sentence_text'].apply(lambda x: generate_ngrams(x, n))

    # Corpus-wide occurrence counts, structured as <collocation>: <count>.
    collocation_counts = {}
    for coll_list in corpus_df['collocations']:
        for coll in coll_list:
            collocation_counts[coll] = collocation_counts.get(coll, 0) + 1

    if type == "frequency":
        # Only include collocations that appear at least <t> times.
        filtered_collocations = {coll: count for coll, count in collocation_counts.items() if count >= t}
    else:
        # A "document" is one protocol: pool the n-grams of all its sentences.
        protocol_collocations = [
            [coll for coll_list in group['collocations'] for coll in coll_list]
            for _, group in corpus_df.groupby('protocol_number')
        ]

        # Document frequency must use the same unit as TF and total_docs
        # (protocols, not individual sentences); otherwise a collocation's
        # document count can exceed the number of documents.
        total_docs = len(protocol_collocations)
        idf_scores = compute_idf(protocol_collocations, total_docs)

        # Seed every collocation with 0 so that equal-score ties in the final
        # stable sort are broken by first appearance in the corpus.
        tfidf_scores = dict.fromkeys(collocation_counts, 0.0)

        for doc_collocations in protocol_collocations:
            if not doc_collocations:
                # No n-grams in this protocol: it contributes nothing, and TF
                # would divide by zero.
                continue

            # Collocations absent from this protocol have TF 0 and add nothing
            # to the sum, so only the ones actually present are scored.
            tf_scores = compute_tf(doc_collocations, set(doc_collocations))

            # Accumulate this protocol's TF-IDF into the corpus-wide totals.
            for coll, score in compute_tfidf(tf_scores, idf_scores).items():
                tfidf_scores[coll] += score

        # Only include collocations whose summed score reaches <threshold>.
        filtered_collocations = {coll: score for coll, score in tfidf_scores.items() if score >= threshold}

    sorted_collocations = sorted(filtered_collocations.items(), key=lambda x: x[1], reverse=True)[:k]
    return sorted_collocations

In [None]:
# --- Run configuration ---
corpus_path = 'knesset_corpus.jsonl'
output_file = 'top_collocations.txt'
k = 10  # keep the 10 highest-ranked collocations
n = 2   # collocation length in words (2 = bigrams)
t = 5   # minimum of <t> counts for an n-gram
threshold = 0.1  # minimum tf-idf score
type = 'tfidf'  # NOTE: this module-level name shadows the builtin `type`

# Read the JSON-lines corpus (one JSON object per line) into a DataFrame.
with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
    data = list(map(json.loads, corpus_file))
corpus_df = pd.DataFrame(data)

result = get_k_n_t_collocations(corpus_df, k, n, t, threshold, type)

# Write one "<collocation>: <score>" line per result to the output file.
with open(output_file, 'w', encoding='utf-8') as out:
    out.writelines(f"{collocation}: {score}\n" for collocation, score in result)

print("Complete")