In [1]:
import pandas as pd
import re
import math
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import os
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
choi_folder_path = "data/choi"

# Function to extract segments from a file
def extract_segments(file_path):
    with open(file_path, "r") as file:
        segmented_text = file.read()

    # Split the text by "==========" 
    segments = segmented_text.strip().split("==========")

    # Remove any  whitespace from each segment and remove empty segments
    segments = [segment.strip() for segment in segments if segment.strip()]
    
    return segments

data = []

# Walk through all subdirectories of choi folder
for root, _, files in os.walk(choi_folder_path):
    for file in files:
        if file.endswith(".ref"):
            file_path = os.path.join(root, file)
            segments = extract_segments(file_path)
            united_text = " ".join(segments)  # Combine segments into a single text
            data.append({
                "File": file_path,
                "Number of segments": len(segments),
                "segments": segments,
                "united_text": united_text
            })

df = pd.DataFrame(data)


# df.to_csv("segments_data_with_united_text.csv", index=False)

# Display the DataFrame
df.head()

Unnamed: 0,File,Number of segments,segments,united_text
0,data/choi\1\3-11\0.ref,10,[Santa Barbara -- `` The present recovery move...,Santa Barbara -- `` The present recovery movem...
1,data/choi\1\3-11\1.ref,10,[The vast Central Valley of California is one ...,The vast Central Valley of California is one o...
2,data/choi\1\3-11\10.ref,10,[The bronchus and pulmonary artery in this lun...,The bronchus and pulmonary artery in this lung...
3,data/choi\1\3-11\11.ref,10,[The Fulton County Grand Jury said Friday an i...,The Fulton County Grand Jury said Friday an in...
4,data/choi\1\3-11\12.ref,10,[Temperature of the wash and rinse waters is m...,Temperature of the wash and rinse waters is ma...


In [3]:
def preprocess_text(text, remPunct=True):
    text = text.lower()
    if remPunct:
        text = re.sub(r'[^\w\s]', '', text)
    return text

def calculate_cohesion_gaps(text, w):
    word_freqs = {}
    words = text.split()

    for i in range(len(words) - w + 1):
        window = words[i:i + w]
        word_freq = sum(window.count(word) for word in window)
        word_freqs[i] = word_freq

    cohesion_gaps = {}
    for i in range(len(words) - w):
        if i == 0:
            cohesion_gaps[i] = 0
        else:
            cohesion_gaps[i] = abs(word_freqs[i] - word_freqs[i - 1])

    return cohesion_gaps

def calculate_cohesion_score(cohesion_gaps, k):
    sorted_gaps = sorted(cohesion_gaps.values(), reverse=True)
    k_percentile = int(math.ceil(k * len(sorted_gaps) / 100))
    return sum(sorted_gaps[:k_percentile]) / k_percentile

def topic_segmentation(text, w=100, k=10, s=20, num_topics=10):
    text = preprocess_text(text)
    words = text.split()

    cohesion_gaps = calculate_cohesion_gaps(text, w)
    cohesion_score = calculate_cohesion_score(cohesion_gaps, k)
# this part of the code calculates and generates a list of candidate boundary positions based on the desired number of topics
# and the minimum gap requirement. 
# These candidate boundary positions will be used to segment the text into topics during the algorithm's execution.


    min_boundary_gap = len(words) // num_topics
    boundaries = list(range(min_boundary_gap, len(words) - w + 1, min_boundary_gap))

    topics = []
    start = 0
    for boundary in boundaries:
        topics.append(' '.join(words[start:boundary]))
        start = boundary

    topics.append(' '.join(words[start:]))

    return topics

# sample_text = """Text segmentation, also known as text splitting, is the process of dividing a continuous text into segments or sections based on some patterns or criteria. These segments are intended to represent different topics or themes present in the text. Text segmentation is a common technique used in natural language processing (NLP) and information retrieval tasks.
# There are several methods and algorithms for text segmentation. One such method is called TextTiling. TextTiling is a technique developed by Marti Hearst in 1994. It is mainly used for segmenting longer texts, such as essays, articles, or documents. TextTiling relies on finding patterns in word frequencies and co-occurrences to identify boundaries between different topics.
# In this example, we will implement a basic Python code to perform topic segmentation without using NLTK's TextTilingTokenizer. Instead, we will implement the segmentation from scratch based on the TextTiling algorithm.
# Let's get started with the implementation."""

# predicted_segments = topic_segmentation(sample_text, num_topics=10)
# for i, topic in enumerate(predicted_segments, start=1):
#     print(f"Topic {i}:")
#     print(topic.strip())
#     print("-----------\n")

In [4]:
predicted_segments = topic_segmentation(df['united_text'][0])
for i, topic in enumerate(predicted_segments, start=1):
    print(f"Topic {i}:")
    print(topic.strip())
    print("-----------\n")

Topic 1:
santa barbara the present recovery movement will gather steady momentum to lift the economy to a new historic peak by this autumn beryl w sprinkel economist of harris trust savings bank chicago predicted at the closing session here tuesday of investment bankers assn california group conference another speaker william h draper jr former under secretary of the army and now with the palo alto venture capital firm of draper gaither anderson urged the us to throw down the gauntlet of battle to communism and tell moscow bluntly we wo nt be pushed around any more he urged support for president kennedy s requests for both defense and foreign aid appropriations not flash in pan sprinkel told conferees that the recent improvement in economic activity was not a temporary flash in the pan but the beginning of a substantial cyclical expansion that will carry the economy back to full employment levels and witness a renewal of our traditional growth pattern
-----------

Topic 2:
in view of t

In [11]:
len(predicted_segments)

10

In [6]:
df['predicted_segments'] = df['united_text'].apply(topic_segmentation)

In [7]:
df['predicted_segments_length'] = df['predicted_segments'].apply(len)

In [8]:
df.head()

Unnamed: 0,File,Number of segments,segments,united_text,predicted_segments,predicted_segments_length
0,data/choi\1\3-11\0.ref,10,[Santa Barbara -- `` The present recovery move...,Santa Barbara -- `` The present recovery movem...,[santa barbara the present recovery movement w...,10
1,data/choi\1\3-11\1.ref,10,[The vast Central Valley of California is one ...,The vast Central Valley of California is one o...,[the vast central valley of california is one ...,10
2,data/choi\1\3-11\10.ref,10,[The bronchus and pulmonary artery in this lun...,The bronchus and pulmonary artery in this lung...,[the bronchus and pulmonary artery in this lun...,10
3,data/choi\1\3-11\11.ref,10,[The Fulton County Grand Jury said Friday an i...,The Fulton County Grand Jury said Friday an in...,[the fulton county grand jury said friday an i...,10
4,data/choi\1\3-11\12.ref,10,[Temperature of the wash and rinse waters is m...,Temperature of the wash and rinse waters is ma...,[temperature of the wash and rinse waters is m...,10


In [9]:
def calculate_boundaries(text):
    boundaries = []
    current_index = 0
    for segment in text:
        current_index += len(segment) + 1  # Add 1 for the period at the end of the sentence
        boundaries.append(current_index - 1)  # Subtract 1 to get the index of the period
    return boundaries

In [10]:
predicted_boundaries = calculate_boundaries(df['predicted_segments'][0])
predicted_boundaries

[956, 1828, 2740, 3568, 4524, 5393, 6359, 7338, 8263, 9215]

In [12]:
actual_boundaries = calculate_boundaries(df['segments'][0])
actual_boundaries

[1297, 1934, 2615, 3426, 4580, 5165, 6145, 7604, 8204, 9816]

In [13]:
df['predicted_boundaries'] = df['predicted_segments'].apply(calculate_boundaries)

In [14]:
df['actual_boundaries'] = df['segments'].apply(calculate_boundaries)

In [15]:
df = df[df['predicted_boundaries'].apply(len) == df['actual_boundaries'].apply(len)]

In [16]:
def calculate_percentage_error(predicted_values, actual_values):
    if len(predicted_values) != len(actual_values):
        raise ValueError("The predicted_values and actual_values lists must have the same length.")

    total_percentage_error = 0

    for i in range(1, len(predicted_values)):
        predicted = predicted_values[i]
        actual = actual_values[i]
        actual_previous = actual_values[i - 1]

        if actual_previous == 0:
            raise ValueError("Actual value cannot be zero.")

        segment_length = actual - actual_previous
        percentage_error = abs(predicted - actual) / segment_length * 100
        total_percentage_error += percentage_error

    average_percentage_error = total_percentage_error / (len(predicted_values) - 1)  # Subtract 1 for segments
    return average_percentage_error

# Example usage:
average_error = calculate_percentage_error(predicted_boundaries, actual_boundaries)
print(f"Average Percentage Error: {average_error:.2f}%")


Average Percentage Error: 20.39%


In [24]:
df['average_percentage_error'] = df.apply(lambda row: calculate_percentage_error(row['predicted_boundaries'], row['actual_boundaries']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['average_percentage_error'] = df.apply(lambda row: calculate_percentage_error(row['predicted_boundaries'], row['actual_boundaries']), axis=1)


In [37]:
df['predicted_boundaries'][1]

[1260, 2516, 3700, 4903, 6172, 7498, 8818, 10101, 11330, 12591]

In [38]:
df['actual_boundaries'][1]

[1402, 2547, 4022, 5974, 7209, 8401, 9904, 11071, 12017, 13342]

In [39]:
calculate_percentage_error(df['predicted_boundaries'][1], df['actual_boundaries'][1])

58.200309760020495

In [28]:
def evaluate_segmentation_with_placement(actual_boundaries, predicted_boundaries, tolerance=100):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for pred_boundary in predicted_boundaries:
        closest_true_boundary = min(actual_boundaries, key=lambda x: abs(x - pred_boundary))
        if abs(closest_true_boundary - pred_boundary) <= tolerance:
            true_positives += 1
            actual_boundaries.remove(closest_true_boundary)
        else:
            false_positives += 1
 
    false_negatives = len(actual_boundaries)

    accuracy = true_positives / len(predicted_boundaries)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    return accuracy, precision, recall

accuracy, precision, recall = evaluate_segmentation_with_placement(actual_boundaries, predicted_boundaries)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.20
Precision: 0.20
Recall: 0.20


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(predicted_segments, true_segments):
    # Combine all segments into a single list for vectorization
    all_segments = predicted_segments + true_segments

    # Initialize a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Calculate TF-IDF vectors for all segments
    tfidf_matrix = vectorizer.fit_transform(all_segments)

    # Calculate cosine similarity between predicted and true segments
    similarity_matrix = cosine_similarity(tfidf_matrix[:len(predicted_segments)], tfidf_matrix[len(predicted_segments):])

    return similarity_matrix
# Example usage
predicted_segments = predicted_segments
true_segments = df['segments'][0]

similarity_matrix = calculate_similarity(predicted_segments, true_segments)


# Create a DataFrame in the desired format
similarity_df = pd.DataFrame(similarity_matrix, columns=[f"True {i}" for i in range(1, len(true_segments) + 1)],
                              index=[f"Pred {i}" for i in range(1, len(predicted_segments) + 1)])

# Print the DataFrame on a single line with indexes
print(similarity_df.to_string(header=True, float_format="{:.4f}".format, index_names=False))

         True 1  True 2  True 3  True 4  True 5  True 6  True 7  True 8  True 9  True 10
Pred 1   0.9084  0.1100  0.1568  0.1733  0.2376  0.0848  0.2243  0.2225  0.1670   0.2810
Pred 2   0.3982  0.8412  0.2634  0.1578  0.2472  0.1173  0.2523  0.1785  0.1216   0.2375
Pred 3   0.2409  0.1962  0.8695  0.4297  0.2403  0.0869  0.2289  0.1639  0.1404   0.2548
Pred 4   0.2557  0.1754  0.1766  0.7209  0.5351  0.1537  0.2312  0.2366  0.1666   0.3026
Pred 5   0.2166  0.1271  0.1368  0.1665  0.7426  0.4055  0.2262  0.2087  0.1796   0.2893
Pred 6   0.2739  0.1529  0.1477  0.1675  0.2920  0.5564  0.7176  0.2338  0.2043   0.3158
Pred 7   0.2478  0.1407  0.1487  0.1652  0.2455  0.0882  0.5060  0.6137  0.1693   0.2898
Pred 8   0.2329  0.1146  0.1109  0.1578  0.2478  0.1280  0.2234  0.6983  0.4607   0.2465
Pred 9   0.2703  0.1091  0.1594  0.1762  0.2732  0.1173  0.3017  0.2147  0.5624   0.6348
Pred 10  0.3029  0.1549  0.1843  0.1918  0.3482  0.1189  0.3273  0.2591  0.2153   0.8767


In [30]:
# Get the minimum length between true_segments and predicted_segments
min_length = min(len(true_segments), len(predicted_segments))

# Iterate through the range of indices up to min_length to print similarity scores
for i in range(min_length):
    true_segment_label = f"True {i + 1}"
    pred_segment_label = f"Pred {i + 1}"
    similarity_score = similarity_df.loc[pred_segment_label, true_segment_label]
    print(f"Similarity between {true_segment_label} and {pred_segment_label}: {similarity_score:.4f}")


Similarity between True 1 and Pred 1: 0.9084
Similarity between True 2 and Pred 2: 0.8412
Similarity between True 3 and Pred 3: 0.8695
Similarity between True 4 and Pred 4: 0.7209
Similarity between True 5 and Pred 5: 0.7426
Similarity between True 6 and Pred 6: 0.5564
Similarity between True 7 and Pred 7: 0.5060
Similarity between True 8 and Pred 8: 0.6983
Similarity between True 9 and Pred 9: 0.5624
Similarity between True 10 and Pred 10: 0.8767


In [31]:
# Get the minimum length between true_segments and predicted_segments
min_length = min(len(true_segments), len(predicted_segments))
all_similarity = []
# Iterate through the range of indices up to min_length to print similarity scores
for i in range(min_length):
    true_segment_label = f"True {i + 1}"
    pred_segment_label = f"Pred {i + 1}"
    similarity_score = similarity_df.loc[pred_segment_label, true_segment_label]
    all_similarity.append(similarity_score)
Avg_similarity = round(np.mean(all_similarity),4)
    
#     print(f"Similarity between {true_segment_label} and {pred_segment_label}: {similarity_score:.4f}")
print(Avg_similarity)

0.7282


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

def calculate_similarity(predicted_segments, true_segments):
    # Combine all segments into a single list for vectorization
    all_segments = predicted_segments + true_segments

    # Initialize a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Calculate TF-IDF vectors for all segments
    tfidf_matrix = vectorizer.fit_transform(all_segments)

    # Calculate cosine similarity between predicted and true segments
    similarity_matrix = cosine_similarity(tfidf_matrix[:len(predicted_segments)], tfidf_matrix[len(predicted_segments):])

    return similarity_matrix


# Calculate average similarity for each row and add 'avg_similarity' column
df['avg_similarity'] = df.apply(lambda row: calculate_similarity(row['predicted_segments'], row['segments']), axis=1)
df['avg_similarity'] = df['avg_similarity'].apply(lambda matrix: round(np.mean(np.diag(matrix)), 4))

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_similarity'] = df.apply(lambda row: calculate_similarity(row['predicted_segments'], row['segments']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_similarity'] = df['avg_similarity'].apply(lambda matrix: round(np.mean(np.diag(matrix)), 4))


Unnamed: 0,File,Number of segments,segments,united_text,predicted_segments,predicted_segments_length,predicted_boundaries,actual_boundaries,average_percentage_error,avg_similarity
0,data/choi\1\3-11\0.ref,10,[Santa Barbara -- `` The present recovery move...,Santa Barbara -- `` The present recovery movem...,[santa barbara the present recovery movement w...,10,"[956, 1828, 2740, 3568, 4524, 5393, 6359, 7338...","[1297, 1934, 2615, 3426, 4580, 5165, 6145, 760...",20.39,0.73
1,data/choi\1\3-11\1.ref,10,[The vast Central Valley of California is one ...,The vast Central Valley of California is one o...,[the vast central valley of california is one ...,10,"[1260, 2516, 3700, 4903, 6172, 7498, 8818, 101...","[1402, 2547, 4022, 5974, 7209, 8401, 9904, 110...",58.2,0.83
2,data/choi\1\3-11\10.ref,10,[The bronchus and pulmonary artery in this lun...,The bronchus and pulmonary artery in this lung...,[the bronchus and pulmonary artery in this lun...,10,"[1025, 2107, 3143, 4275, 5324, 6338, 7422, 853...","[1644, 2083, 3350, 5123, 5789, 7114, 7741, 955...",53.62,0.81
3,data/choi\1\3-11\11.ref,10,[The Fulton County Grand Jury said Friday an i...,The Fulton County Grand Jury said Friday an in...,[the fulton county grand jury said friday an i...,10,"[728, 1403, 2107, 2822, 3602, 4268, 4960, 5679...","[1337, 1725, 2351, 2910, 3897, 4896, 5457, 652...",67.19,0.72
4,data/choi\1\3-11\12.ref,10,[Temperature of the wash and rinse waters is m...,Temperature of the wash and rinse waters is ma...,[temperature of the wash and rinse waters is m...,10,"[1081, 2314, 3458, 4662, 5863, 7063, 8300, 941...","[945, 1352, 2195, 4033, 5774, 7277, 8334, 9504...",54.67,0.63


In [33]:
len(df['segments'][0][0])

1297

In [34]:
df['avg_similarity'].mean()

0.7262494712103407

In [40]:
df['average_percentage_error'].mean()

60.395165384593604

In [41]:
df['average_percentage_error'].corr(df['avg_similarity'])

-0.7860333282447101

In [36]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# def calculate_similarity(predicted_segments, true_segments):
#     # Combine all segments into a single list for vectorization
#     all_segments = predicted_segments + true_segments

#     # Initialize a TF-IDF vectorizer
#     vectorizer = TfidfVectorizer()

#     # Calculate TF-IDF vectors for all segments
#     tfidf_matrix = vectorizer.fit_transform(all_segments)

#     # Calculate cosine similarity between predicted and true segments
#     similarity_matrix = cosine_similarity(tfidf_matrix[:len(predicted_segments)], tfidf_matrix[len(predicted_segments):])

#     return similarity_matrix

# def calculate_metrics_for_pairs(similarity_matrix, threshold=0.7):
#     metrics = []

#     for i in range(similarity_matrix.shape[0]):
#         similarity_score = similarity_matrix[i, i]
#         true_positive = 1 if similarity_score >= threshold else 0
#         false_positive = 1 - true_positive
#         false_negative = 0  # We're only considering diagonal elements

#         metrics.append((true_positive, false_positive, false_negative))

#     return metrics
# # Example usage
# predicted_segments = predicted_segments
# true_segments = df['segments'][0]

# # Set the threshold for similarity scores
# threshold = 0.7

# # Calculate similarity matrix
# similarity_matrix = calculate_similarity(predicted_segments, true_segments)

# # Calculate metrics for each pair of "True i" and "Pred i"
# metrics = calculate_metrics_for_pairs(similarity_matrix, threshold)

# # Calculate overall metrics
# true_positives = sum(tp for tp, _, _ in metrics)
# false_positives = sum(fp for _, fp, _ in metrics)
# false_negatives = sum(fn for _, _, fn in metrics)

# accuracy = true_positives / (true_positives + false_positives + false_negatives)

# precision_denominator = true_positives + false_positives
# precision = true_positives / precision_denominator if precision_denominator != 0 else 0

# recall_denominator = true_positives + false_negatives
# recall = true_positives / recall_denominator if recall_denominator != 0 else 0

# # Print the metrics for each pair and overall metrics
# for i, (tp, fp, fn) in enumerate(metrics, start=1):
#     print(f"For True {i} and Pred {i}:")
#     print(f"True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}\n")

# print("Overall Metrics:")
# print("Accuracy:", accuracy)
# print("Precision:", precision)
# print("Recall:", recall)
