In [None]:
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

# Fetch the "punkt" resource through NLTK's downloader before any tokenizing.
# NOTE(review): presumably this is what word_tokenize needs at runtime -
# confirm the resource name against the installed NLTK version.
nltk.download("punkt")

def determine_text_length_bins(texts, num_bins=5):
    """Plot a histogram of text lengths measured in NLTK word tokens.

    The observed range of word counts is divided into ``num_bins`` bins of
    equal integer width, starting at the shortest text. The width is rounded
    up so that the last bin edge is never below the longest text.

    Args:
        texts: Iterable of strings; each one is tokenized with
            ``word_tokenize`` and its token count is used as its length.
        num_bins: Number of histogram bins; must be a positive integer.

    Returns:
        None. The figure is drawn with matplotlib and shown via ``plt.show()``.

    Raises:
        ValueError: If ``texts`` is empty or ``num_bins`` is smaller than 1.
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # Tokenize texts and count words
    word_counts = [len(word_tokenize(text)) for text in texts]
    if not word_counts:
        raise ValueError("texts must contain at least one text")

    min_word_count = min(word_counts)
    max_word_count = max(word_counts)

    # Ceiling division, with a floor of 1. Rounding the width down would leave
    # the last edge below max_word_count (dropping the longest texts), and a
    # width of 0 would produce bin edges that do not increase.
    span = max_word_count - min_word_count
    bin_size = max(1, -(-span // num_bins))

    # num_bins + 1 edges describe num_bins consecutive (low, high) ranges.
    bin_edges = [min_word_count + i * bin_size for i in range(num_bins + 1)]
    bin_ranges = list(zip(bin_edges[:-1], bin_edges[1:]))

    # Draw the histogram over the explicit edges
    plt.figure(figsize=(10, 6))
    plt.hist(word_counts, bins=bin_edges, rwidth=0.8)

    # Label each bin at its left edge with its "low-high" range
    plt.xticks(bin_edges[:-1], [f"{low}-{high}" for low, high in bin_ranges])
    plt.xlabel("Text Length Range (Number of Words)")
    plt.ylabel("Number of Texts")
    plt.title("Text Length Distribution")

    # Show the plot
    plt.show()


In [None]:
from datasets import load_from_disk

In [None]:
# Choose the dataset to analyse: exactly one load_from_disk line is active,
# the commented-out lines are alternative datasets to swap in.
# ds = load_from_disk('/data/shared/datasets/nlp/superseg_test_summarized.hf/')
# ds = load_from_disk('/data/shared/datasets/nlp/superseg_test_summarized_bart.hf/')
# ds = load_from_disk('/data/shared/datasets/nlp/superseg_train_summarized.hf/')
ds = load_from_disk('/data/shared/datasets/nlp/ami_summarized_bart.hf/')
# ds = load_from_disk('/data/shared/datasets/nlp/dialseg711_summarized.hf/')
# ds = load_from_disk('/data/shared/datasets/nlp/tiage_train_summarized.hf/')
# ds = load_from_disk('/data/shared/datasets/nlp/wiki727test_summarized.hf/').select(range(1000))

In [None]:
# ds = ds.filter(lambda x: len(word_tokenize(' '.join(x['sections']))) > 1460)

In [None]:
# Display the loaded dataset object to check what was read from disk.
ds

In [None]:
# One string per dataset row: the items of the row's 'sections' entry joined
# with single spaces. Later cells reuse `texts` alongside columns of `ds`, so
# it has to stay aligned (same rows, same order) with the dataset.
texts = [' '.join(a) for a in ds['sections']]

In [None]:
# Plot the distribution of document lengths (in word tokens) over 10 bins.
determine_text_length_bins(texts, num_bins=10)

In [None]:
# Extend the module search path with the parent directory and its
# lib/pipelines folders. The entries are relative paths, so they resolve
# against the notebook's current working directory.
import sys
sys.path.append('..')
sys.path.append('../lib/pipelines/')
sys.path.append('../lib/pipelines/utils/')

In [None]:
import numpy as np
from utilities.tiling import TopicTilingModel, classify_borders
from utilities.general import calc_metric

In [None]:
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

# Repeats the "punkt" download already requested in the first cell.
nltk.download("punkt")

def determine_text_length_bins_with_custom_values(texts, custom_values, num_bins=5):
    """Group per-text values by text length and plot the per-bin averages.

    Text length is the number of NLTK word tokens. The observed length range
    is divided into ``num_bins`` bins of equal integer width, starting at the
    shortest text. Each text falls into exactly one bin: bins are half-open
    ``[low, high)``, except the last one, which also includes its upper edge.

    Args:
        texts: Sequence of strings to tokenize.
        custom_values: Sequence of numbers, one per text and in the same
            order (``custom_values[i]`` belongs to ``texts[i]``).
        num_bins: Number of length bins; must be a positive integer.

    Returns:
        dict mapping each ``(low, high)`` bin range to the list of custom
        values of the texts whose length falls into that bin. Bins without
        texts map to an empty list.

    Raises:
        ValueError: If ``texts`` is empty, ``num_bins`` is smaller than 1, or
            ``custom_values`` does not have one entry per text.

    Side effects:
        Prints the average of the per-bin means, taken over non-empty bins
        only, and shows a horizontal bar chart of the per-bin means (empty
        bins are drawn as 0).
    """
    if num_bins < 1:
        raise ValueError("num_bins must be a positive integer")

    # Tokenize texts and count words
    word_counts = [len(word_tokenize(text)) for text in texts]
    if not word_counts:
        raise ValueError("texts must contain at least one text")
    if len(custom_values) != len(word_counts):
        raise ValueError("custom_values must contain exactly one value per text")

    min_word_count = min(word_counts)
    max_word_count = max(word_counts)

    # Ceiling division, with a floor of 1. A rounded-down width would leave
    # the longest texts outside every bin, and a width of 0 would make all
    # bin ranges identical (collapsing them into a single dict key).
    span = max_word_count - min_word_count
    bin_size = max(1, -(-span // num_bins))

    bin_ranges = [
        (min_word_count + i * bin_size, min_word_count + (i + 1) * bin_size)
        for i in range(num_bins)
    ]
    bin_custom_values = {bin_range: [] for bin_range in bin_ranges}

    # Compute the bin index directly so that a text sitting on a shared edge
    # lands in one bin only; the clamp puts the upper edge into the last bin.
    for word_count, value in zip(word_counts, custom_values):
        index = min((word_count - min_word_count) // bin_size, num_bins - 1)
        bin_custom_values[bin_ranges[index]].append(value)

    # Exact per-bin means; an empty bin is plotted as 0.
    bin_means = []
    non_empty_means = []
    for bin_range in bin_ranges:
        values = bin_custom_values[bin_range]
        if values:
            bin_mean = sum(values) / len(values)
            non_empty_means.append(bin_mean)
        else:
            bin_mean = 0.0
        bin_means.append(bin_mean)

    # Summary figure: empty bins are left out so they cannot pull the
    # average towards 0. At least one bin is non-empty because texts is not.
    print(sum(non_empty_means) / len(non_empty_means))

    # Create a bar plot using the mean custom value of each bin
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(bin_ranges)), bin_means)

    # Label the plot
    plt.yticks(range(len(bin_ranges)), [f"{low}-{high}" for low, high in bin_ranges])
    plt.xlabel("Metric")
    plt.ylabel("Text Length Range (Number of Words)")
    plt.title("Text Length Distribution with Custom Values")

    # Show the plot
    plt.show()

    # Return the custom values for each bin based on text length
    return bin_custom_values

In [None]:
def get_scores(example):
    """Attach segmentation scores to one dataset example.

    Borders are predicted by ``classify_borders`` from the example's
    ``'embeddings'`` entry (with the fixed arguments 30, 0.6, 1, 30) and
    passed to ``calc_metric`` together with the example. The three values
    returned by ``calc_metric`` are stored, converted to ``float``, under the
    keys ``'wd'``, ``'pk'`` and ``'f1'``.

    Args:
        example: Mapping with at least an ``'embeddings'`` entry; it is
            modified in place.

    Returns:
        The same ``example`` object, now carrying the three score entries.
    """
    predicted_borders = classify_borders(example['embeddings'], 30, 0.6, 1, 30)
    wd, pk, f1 = calc_metric(example, predicted_borders)
    for column, score in (('wd', wd), ('pk', pk), ('f1', f1)):
        example[column] = float(score)
    return example
# Apply get_scores to the dataset and rebind `ds` to the result, which
# carries the 'wd', 'pk' and 'f1' entries written by get_scores.
ds = ds.map(get_scores)

# Bar chart of the per-length-bin average of 'wd'. `texts` comes from an
# earlier cell and must still line up row-for-row with `ds`, because values
# are matched to texts by position.
bin_custom_values = determine_text_length_bins_with_custom_values(texts, ds['wd'], num_bins=10)

# Print the custom values for each bin based on text length
for bin_range, bin_values in bin_custom_values.items():
    print(f"Bin {bin_range}: {bin_values}")

In [None]:
def _get_borders_sumseg(example, tiling_model, plot):
    probabilities = example['probs']
    boundaries = tiling_model.transform(probabilities, gold_boundaries=example['boundaries'], plot=plot)
    return boundaries

# Keyword arguments used to build the TopicTilingModel shared by the scoring
# function in this cell.
# NOTE(review): the meaning of each setting is defined by TopicTilingModel,
# which is not visible here - check its signature before tuning.
params = {
    'window_size': 15,
    'threshold': 0.6,
    'smoothing_passes': 0,
    'smoothing_window': 0,
    'n_smooth_savgol': 3,
    'savgol_k': 1/3,
    'polyorder': 3,
}
tiling_model = TopicTilingModel(**params)

def get_scores(example):
    """Attach segmentation scores computed from the tiling model's output.

    This definition reuses the name ``get_scores`` from an earlier cell, so
    running this cell replaces that earlier function. Boundaries come from
    ``_get_borders_sumseg`` using the module-level ``tiling_model`` with
    plotting disabled. The three values returned by ``calc_metric`` are
    stored as returned (no ``float`` conversion) under ``'wd'``, ``'pk'``
    and ``'f1'``.

    Args:
        example: Mapping for one dataset row; it is modified in place.

    Returns:
        The same ``example`` object, now carrying the three score entries.
    """
    # Earlier embedding-based predictor, kept for reference:
    # pred = classify_borders(example['embeddings'], 27, 0.65, 2, 6)
    predicted_borders = _get_borders_sumseg(example, tiling_model, False)
    wd, pk, f1 = calc_metric(example, predicted_borders)
    for column, score in (('wd', wd), ('pk', pk), ('f1', f1)):
        example[column] = score
    return example

# Re-score the dataset with the get_scores defined in this cell; the
# 'wd', 'pk' and 'f1' entries are written again by that function.
ds = ds.map(get_scores)

# Same per-length-bin chart as before, now for the tiling-model scores.
# `texts` must still line up row-for-row with `ds`.
bin_custom_values = determine_text_length_bins_with_custom_values(texts, ds['wd'], num_bins=10)
# Print the raw values collected in each length bin.
for bin_range, bin_values in bin_custom_values.items():
    print(f"Bin {bin_range}: {bin_values}")

# Statistics

In [None]:
# Display the dataset object that the statistics below are computed on.
ds

In [None]:
from nltk.tokenize import word_tokenize


def _split_into_sections(utterances, boundaries):
    """Group one document's utterances into space-joined section texts.

    A flag other than ``'0'`` at position ``i`` closes the section collected
    so far and makes ``utterances[i]`` the first utterance of the next one.
    The section still open after the last flag is emitted as well, and no
    empty section is produced when the very first flag is not ``'0'``.
    """
    sections = []
    current = []
    for i, flag in enumerate(boundaries):
        if flag != '0' and current:
            sections.append(' '.join(current).strip())
            current = []
        current.append(utterances[i])
    if current:
        # Flush the trailing section; without this the last section of every
        # document would be missing from the statistics.
        sections.append(' '.join(current).strip())
    return sections


def calculate_statistics(ds, verbose=True):
    """Compute, and optionally print, size statistics of a segmentation dataset.

    Word counts are numbers of NLTK word tokens.

    Args:
        ds: Dataset-like object. ``ds['sections']`` must yield one list of
            utterance strings per document and ``ds['boundaries']`` one
            boundary sequence per document; ``len(ds)`` is the document count.
            NOTE(review): the code compares flags with the strings '0' and
            '1', so each boundary sequence is assumed to be a string (or list)
            of those characters with one flag per utterance - confirm against
            the dataset schema.
        verbose: When true, print the document count, min / avg / max words
            per document, avg words per section, avg utterances per document
            and avg utterances per section.

    Returns:
        float: Average number of utterances per section, computed per
        document as ``len(boundaries) / (boundaries.count('1') + 1)`` and
        then averaged over documents.

    Raises:
        ValueError: If an average has to be taken over an empty collection
            (for example when the dataset contains no documents).
    """
    def mean(array):
        if len(array):
            return sum(array) / len(array)
        else:
            raise ValueError('Failed to calculate mean metric, check dataset!')

    doc_utterances = ds['sections']
    doc_texts = [' '.join(text) for text in doc_utterances]
    boundaries = ds['boundaries']

    # Rebuild the text of every section of every document.
    sections = []
    for utterances, doc_boundaries in zip(doc_utterances, boundaries):
        sections.extend(_split_into_sections(utterances, doc_boundaries))

    # avg # utterances in section
    avg_segment_length = mean([len(b) / (b.count('1') + 1) for b in boundaries])

    # avg # words in section
    avg_n_words_in_section = mean([len(word_tokenize(section)) for section in sections])

    # min / max / avg # words in doc
    n_words_in_doc = [len(word_tokenize(text)) for text in doc_texts]
    avg_n_words_in_doc = mean(n_words_in_doc)
    min_n_words_in_doc = min(n_words_in_doc)
    max_n_words_in_doc = max(n_words_in_doc)

    # avg # utterances in doc
    avg_n_utterances_in_doc = mean([len(doc_utterance) for doc_utterance in doc_utterances])

    if verbose:
        print(f'# docs: {len(ds)}')
        print(f'min / avg / max # words in doc: {min_n_words_in_doc:.0f} / {avg_n_words_in_doc:.0f} / {max_n_words_in_doc:.0f}')
        print(f'avg # words in section: {avg_n_words_in_section:.0f}')
        print(f'avg # utterances in doc: {avg_n_utterances_in_doc:.0f}')
        print(f'avg # utterances in section: {avg_segment_length:.0f}')

    return avg_segment_length

In [None]:
# Print the dataset statistics; the cell's output value is the returned
# average number of utterances per section.
calculate_statistics(ds)

# Correlation between the number of utterances and the number of sentences in the summary

In [None]:
from datasets import load_from_disk
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [None]:
def correlation_utt_sent_sum(example):
    """Add utterance and summary-sentence counts to one example.

    Stores ``len(example['sections'])`` under ``'n_utterances'`` and
    ``len(example['splitted_summary'])`` under ``'n_sentences_summary'``.

    Args:
        example: Mapping with ``'sections'`` and ``'splitted_summary'``
            entries; it is modified in place.

    Returns:
        The same ``example`` object with the two count entries added.
    """
    count_columns = (
        ('n_utterances', 'sections'),
        ('n_sentences_summary', 'splitted_summary'),
    )
    for target, source in count_columns:
        example[target] = len(example[source])
    return example

In [None]:
# Choose the dataset for the correlation analysis: exactly one
# load_from_disk line is active. This rebinds `ds`, replacing the dataset
# used in the cells above. The trailing comments name the source corpus.
# ds = load_from_disk('/data/shared/datasets/nlp/ami_summarized_bart.hf/')
# ds = load_from_disk('/home/user/clustering/neuroclustering/lib/pipelines/runs/sumseg_run_20231015_224151/test_embedded_dataset') #qmsum
# ds = load_from_disk('/home/user/clustering/neuroclustering/lib/pipelines/runs/sumseg_run_20231015_225338/test_embedded_dataset') # superdialseg
# ds = load_from_disk('/home/user/clustering/neuroclustering/lib/pipelines/runs/sumseg_run_20231015_225758/test_embedded_dataset') # qmsum
ds = load_from_disk('/home/user/clustering/neuroclustering/lib/pipelines/runs/sumseg_run_20231015_230135/test_embedded_dataset') # dialseg

In [None]:
# Add the two count entries to every example, then pull them out as
# per-document sequences for the correlation and the scatter plot.
ds = ds.map(correlation_utt_sent_sum)
n_utterances = ds['n_utterances']
n_sentences_summary = ds['n_sentences_summary']

In [None]:
# Pearson correlation between utterance count and summary-sentence count;
# the result is shown as the cell output.
pearsonr(n_utterances, n_sentences_summary)

In [None]:
# One point per document: utterance count against summary-sentence count.
# Axis labels and a title are set so the figure can be read on its own.
plt.scatter(n_utterances, n_sentences_summary)
plt.xlabel("Number of utterances")
plt.ylabel("Number of sentences in summary")
plt.title("Utterances vs. summary sentences per document")
plt.show()