In [None]:
import math
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


# Method 3: Using Embedding Model

In [None]:
# Root folder that is walked recursively to find the ".ref" segmentation files
choi_folder_path = "data/choi"

# Function to extract segments from a file
def extract_segments(file_path):
    """Read a segmented file and return its non-empty segments.

    The file content is split on the "==========" delimiter. Every piece is
    stripped of surrounding whitespace, and pieces that end up empty are
    discarded.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: The cleaned segments, in file order.
    """
    with open(file_path, "r") as handle:
        raw_text = handle.read()

    # Strip lazily, then keep only the pieces that still contain text
    stripped_pieces = (piece.strip() for piece in raw_text.strip().split("=========="))
    return [piece for piece in stripped_pieces if piece]

data = []

# Walk through all subdirectories of the choi folder.
# os.walk yields entries in arbitrary (filesystem-dependent) order, so both the
# directory list and the file list are sorted; this keeps the row order of `df`
# (and therefore "document 0") identical across machines and runs.
for root, dirs, files in os.walk(choi_folder_path):
    dirs.sort()  # in-place sort controls the order in which os.walk descends
    for file in sorted(files):
        if file.endswith(".ref"):
            file_path = os.path.join(root, file)
            segments = extract_segments(file_path)
            united_text = " ".join(segments)  # Combine segments into a single text
            data.append({
                "File": file_path,
                "Number of segments": len(segments),
                "segments": segments,
                "united_text": united_text
            })

# One row per ".ref" file: path, segment count, segment list, and joined text
df = pd.DataFrame(data)


# df.to_csv("segments_data_with_united_text.csv", index=False)

# Display the DataFrame
df.head()

In [None]:
def preprocess_text(text, remPunct = True):
    """Normalise `text` for segmentation.

    The text is always lowercased. When `remPunct` is true, every character
    that is neither a word character nor whitespace is removed as well.

    Args:
        text: Input string.
        remPunct: Whether to strip punctuation after lowercasing.

    Returns:
        str: The normalised text.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered) if remPunct else lowered

# Already-loaded SentenceTransformer instances, keyed by model name.
_SENTENCE_MODELS = {}


def sentence_embed(sentences, model_name='paraphrase-MiniLM-L6-v2'):
    """Encode `sentences` with a SentenceTransformer model.

    Constructing the model is expensive, so each model name is instantiated
    only once and reused by later calls instead of being reloaded every time.

    Args:
        sentences: Sentences to encode; passed straight to `model.encode`.
        model_name: Name handed to `SentenceTransformer(...)`.

    Returns:
        Whatever `model.encode(sentences)` returns (one embedding per sentence).
    """
    model = _SENTENCE_MODELS.get(model_name)
    if model is None:
        # First request for this model name: load it and remember it
        model = SentenceTransformer(model_name)
        _SENTENCE_MODELS[model_name] = model

    return model.encode(sentences)
def calculate_sentence_similarity(embeddings):
    """Cosine similarity of every pair of consecutive embeddings.

    Args:
        embeddings: Indexable sequence of vectors.

    Returns:
        list: One value fewer than `embeddings`; entry k compares
        embedding k with embedding k + 1.
    """
    def cosine(left, right):
        return np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))

    return [cosine(embeddings[k], embeddings[k + 1]) for k in range(len(embeddings) - 1)]

def topic_segmentation_embed(text, threshold=0.5):
    """Split `text` into topics wherever adjacent sentences are dissimilar.

    The text is lowercased (punctuation kept), split into sentences on '.',
    and each sentence is embedded. A new topic starts after sentence i
    whenever the similarity between sentence i and sentence i + 1 is below
    `threshold`.

    Args:
        text: Raw input text.
        threshold: Similarity below which two neighbouring sentences are
            considered to belong to different topics.

    Returns:
        list[str]: Topic strings, each the '.'-joined sentences of that topic.
        Empty list when the text contains no non-blank sentence.
    """
    text = preprocess_text(text, remPunct = False)

    # Step 1: Tokenize sentences. Blank fragments (e.g. the empty piece after
    # the final period) carry no content, so they are not embedded.
    sentences = [fragment for fragment in text.split('.') if fragment.strip()]
    if not sentences:
        return []

    # Step 2: Vectorize sentences using SentenceTransformer
    embeddings = sentence_embed(sentences)

    # Step 3: Calculate sentence similarity
    sentence_similarity = calculate_sentence_similarity(embeddings)

    # Step 4: Identify boundaries. sentence_similarity[i] compares sentence i
    # with sentence i + 1, so a low value means the next topic starts at i + 1.
    boundaries = [0]
    for i, similarity in enumerate(sentence_similarity):
        if similarity < threshold:
            boundaries.append(i + 1)

    # Close the last topic at the end of the sentence list so that the final
    # sentence is part of the output.
    boundaries.append(len(sentences))

    # Step 5: Segment the text into topics based on the identified boundaries
    topics = ['.'.join(sentences[start:end])
              for start, end in zip(boundaries, boundaries[1:])]

    return topics



In [None]:
# Small hand-written example used to try out the segmenter
sample_text = """
Text segmentation, also known as text splitting, is the process of dividing a continuous text into segments or sections based on some patterns or criteria. These segments are intended to represent different topics or themes present in the text. Text segmentation is a common technique used in natural language processing (NLP) and information retrieval tasks.

There are several methods and algorithms for text segmentation. One such method is called TextTiling. TextTiling is a technique developed by Marti Hearst in 1994. It is mainly used for segmenting longer texts, such as essays, articles, or documents. TextTiling relies on finding patterns in word frequencies and co-occurrences to identify boundaries between different topics.

In this example, we will implement a basic Python code to perform topic segmentation using SentenceTransformer to vectorize sentences. SentenceTransformer provides pre-trained models for sentence embeddings, which allow us to capture semantic meaning and similarity between sentences.

Let's get started with the implementation.
"""

segmented_topics = topic_segmentation_embed(sample_text, threshold=0.4)

# Print each topic under a numbered header, followed by a separator line
for topic_number, topic_text in enumerate(segmented_topics, start=1):
    print(f"Topic {topic_number}:", topic_text.strip(), "-----------\n", sep="\n")


In [None]:
# Sweep the similarity threshold from 0.05 to 1.0 in steps of 0.05 and record
# how many topics the segmenter produces for `sample_text` at each setting.
threshold = []
num_topics = []

for i in range(20):
    thres = (i + 1) / 20.0
    segmented_topics = topic_segmentation_embed(sample_text, threshold=thres)
    threshold.append(thres)
    num_topics.append(len(segmented_topics))

# Labelled figure so the plot can be read without the surrounding code
fig, ax = plt.subplots()
ax.plot(threshold, num_topics)
ax.set_xlabel("Similarity threshold")
ax.set_ylabel("Number of topics")
ax.set_title("Number of topics vs. similarity threshold")
plt.show()

In [None]:
# Run the segmenter on the joined text of the first row of df
sample_text = df['united_text'][0]
segmented_topics = topic_segmentation_embed(sample_text, threshold=0.4)

# Print each topic under a numbered header, followed by a separator line
for topic_number, topic_text in enumerate(segmented_topics, start=1):
    print(f"Topic {topic_number}:", topic_text.strip(), "-----------\n", sep="\n")

In [None]:
# Sweep the similarity threshold from 0.05 to 1.0 in steps of 0.05 and record
# how many topics the segmenter produces for `sample_text` at each setting.
threshold = []
num_topics = []

for i in range(20):
    thres = (i + 1) / 20.0
    segmented_topics = topic_segmentation_embed(sample_text, threshold=thres)
    threshold.append(thres)
    num_topics.append(len(segmented_topics))

# Labelled figure so the plot can be read without the surrounding code
fig, ax = plt.subplots()
ax.plot(threshold, num_topics)
ax.set_xlabel("Similarity threshold")
ax.set_ylabel("Number of topics")
ax.set_title("Number of topics vs. similarity threshold")
plt.show()

In [None]:
def preprocess_text(text, remPunct = True):
    """Lowercase `text` and optionally strip punctuation.

    Args:
        text: Input string.
        remPunct: When true, characters that are neither word characters nor
            whitespace are removed after lowercasing.

    Returns:
        str: The processed text.
    """
    text = text.lower()
    if not remPunct:
        return text
    return re.sub(r'[^\w\s]', '', text)

# Already-loaded SentenceTransformer instances, keyed by model name.
_SENTENCE_MODELS = {}


def sentence_embed(sentences, model_name='paraphrase-MiniLM-L6-v2'):
    """Encode `sentences` with a SentenceTransformer model.

    Constructing the model is expensive, so each model name is instantiated
    only once and reused by later calls instead of being reloaded every time.

    Args:
        sentences: Sentences to encode; passed straight to `model.encode`.
        model_name: Name handed to `SentenceTransformer(...)`.

    Returns:
        Whatever `model.encode(sentences)` returns (one embedding per sentence).
    """
    model = _SENTENCE_MODELS.get(model_name)
    if model is None:
        # First request for this model name: load it and remember it
        model = SentenceTransformer(model_name)
        _SENTENCE_MODELS[model_name] = model

    return model.encode(sentences)
def calculate_sentence_similarity(embeddings):
    """Cosine similarity between each embedding and the one that follows it.

    Args:
        embeddings: Sequence of vectors (sliceable, e.g. a list or array).

    Returns:
        list: `len(embeddings) - 1` similarity values (empty when fewer than
        two embeddings are given).
    """
    neighbour_pairs = zip(embeddings, embeddings[1:])
    return [
        np.dot(current, following) / (np.linalg.norm(current) * np.linalg.norm(following))
        for current, following in neighbour_pairs
    ]

def calculate_cosine_similarity(sentence1, sentence2):
    """Bag-of-words cosine similarity between two strings.

    Each string is split on whitespace and turned into a word-count vector;
    the result is the cosine of the angle between the two count vectors.

    Args:
        sentence1: First text.
        sentence2: Second text.

    Returns:
        float: Similarity in [0, 1], or 0 when either text has no words.
    """
    # Counter tallies the words in a single pass (the previous
    # words.count(...) per distinct word was quadratic in the text length).
    counts1 = Counter(sentence1.split())
    counts2 = Counter(sentence2.split())

    # Only words present in both texts contribute to the dot product
    shared_words = counts1.keys() & counts2.keys()
    dot_product = sum(counts1[word] * counts2[word] for word in shared_words)

    magnitude1 = math.sqrt(sum(count ** 2 for count in counts1.values()))
    magnitude2 = math.sqrt(sum(count ** 2 for count in counts2.values()))

    denominator = magnitude1 * magnitude2
    if denominator == 0:
        # At least one text is empty: define the similarity as 0
        return 0
    return dot_product / denominator

def topic_segmentation_embed(text, threshold=0.5, num_topics=10):
    """Split `text` into at most `num_topics` topics using sentence embeddings.

    The text is lowercased (punctuation kept), split into sentences on '.',
    embedded, and the similarity of neighbouring sentences is computed. For
    each topic, candidate end positions start `len(sentences) // num_topics`
    sentences after the previous boundary; the boundary is chosen from the
    highest neighbour similarity seen in the window so far, compared against
    `threshold`.

    Args:
        text: Raw input text.
        threshold: Similarity level the window maximum is compared against.
        num_topics: Upper bound on the number of topics produced.

    Returns:
        list[str]: Topic strings, each the '.'-joined sentences of that topic.
    """
    text = preprocess_text(text, remPunct=False)

    # Step 1: Tokenize sentences
    sentences = text.split('.')

    # Step 2: Vectorize sentences using SentenceTransformer
    embeddings = sentence_embed(sentences)

    # Step 3: Calculate sentence similarity
    sentence_similarity = calculate_sentence_similarity(embeddings)

    # Number of sentences per topic. Forced to at least 1: a stride of 0 makes
    # the first similarity window empty, and max() of an empty slice raises.
    sentences_per_topic = max(1, len(sentences) // num_topics)

    # Step 4: Identify boundaries
    boundaries = [0]

    for _ in range(num_topics - 1):
        start = boundaries[-1]
        end_candidates = range(start + sentences_per_topic, len(sentence_similarity))
        if not end_candidates:
            # Too few sentences left for another topic
            break

        def window_peak(i, start=start):
            # Highest neighbour similarity between `start` and candidate `i`
            return max(sentence_similarity[start:i])

        max_dissimilarity_index = max(end_candidates, key=window_peak)

        # Check if the maximum similarity in the selected range is below the threshold
        if window_peak(max_dissimilarity_index) < threshold:
            boundaries.append(max_dissimilarity_index)
        else:
            # Otherwise cut just before the first candidate whose window
            # reaches the threshold; never place the cut at `start` itself,
            # which would produce an empty topic.
            for i in end_candidates:
                if window_peak(i) >= threshold:
                    boundaries.append(max(i - 1, start + 1))
                    break

    # Close the last topic. A blank final fragment (text ending in '.') is
    # left out; a real trailing sentence without a final period is kept.
    last = len(sentences) if sentences[-1].strip() else len(sentences) - 1
    boundaries.append(last)

    # Step 5: Segment the text into topics based on the identified boundaries
    topics = []
    for i in range(len(boundaries) - 1):
        start = boundaries[i]
        end = boundaries[i + 1]
        topic_text = '.'.join(sentences[start:end])
        topics.append(topic_text)

    return topics


In [None]:
# Segment the joined text of the first row of df with the default settings
sample_text = df['united_text'][0]
segmented_topics = topic_segmentation_embed(sample_text)

# Print each topic under a numbered header, followed by a separator line
for topic_number, topic_text in enumerate(segmented_topics, start=1):
    print(f"Topic {topic_number}:", topic_text.strip(), "-----------\n", sep="\n")

In [None]:
# Segment list stored in the first row of df (the row sample_text was taken from)
actual_segments = df['segments'][0]

In [None]:
import re
import math
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_text(text, remPunct=True):
    """Return `text` lowercased, with punctuation removed when requested.

    Args:
        text: Input string.
        remPunct: If true, delete every character that is not a word
            character or whitespace.

    Returns:
        str: The cleaned text.
    """
    normalized = text.lower()
    if remPunct:
        normalized = re.compile(r'[^\w\s]').sub('', normalized)
    return normalized

# Already-loaded SentenceTransformer instances, keyed by model name.
_SENTENCE_MODELS = {}


def sentence_embed(sentences, model_name='paraphrase-MiniLM-L6-v2'):
    """Encode `sentences` with a SentenceTransformer model.

    Constructing the model is expensive, so each model name is instantiated
    only once and reused by later calls instead of being reloaded every time.

    Args:
        sentences: Sentences to encode; passed straight to `model.encode`.
        model_name: Name handed to `SentenceTransformer(...)`.

    Returns:
        Whatever `model.encode(sentences)` returns (one embedding per sentence).
    """
    model = _SENTENCE_MODELS.get(model_name)
    if model is None:
        # First request for this model name: load it and remember it
        model = SentenceTransformer(model_name)
        _SENTENCE_MODELS[model_name] = model

    return model.encode(sentences)

def calculate_sentence_similarity(embeddings):
    """Compute the cosine similarity of each adjacent pair of embeddings.

    Args:
        embeddings: Indexable sequence of vectors.

    Returns:
        list: Entry k is the similarity between embedding k and embedding
        k + 1, so the list is one shorter than the input.
    """
    pair_count = len(embeddings) - 1
    scores = []
    position = 0
    while position < pair_count:
        first, second = embeddings[position], embeddings[position + 1]
        scale = np.linalg.norm(first) * np.linalg.norm(second)
        scores.append(np.dot(first, second) / scale)
        position += 1
    return scores

def calculate_cosine_similarity(sentence1, sentence2):
    """Bag-of-words cosine similarity between two strings.

    Each string is split on whitespace and turned into a word-count vector;
    the result is the cosine of the angle between the two count vectors.

    Args:
        sentence1: First text.
        sentence2: Second text.

    Returns:
        float: Similarity in [0, 1], or 0 when either text has no words.
    """
    # Counter tallies the words in a single pass (the previous
    # words.count(...) per distinct word was quadratic in the text length).
    counts1 = Counter(sentence1.split())
    counts2 = Counter(sentence2.split())

    # Only words present in both texts contribute to the dot product
    shared_words = counts1.keys() & counts2.keys()
    dot_product = sum(counts1[word] * counts2[word] for word in shared_words)

    magnitude1 = math.sqrt(sum(count ** 2 for count in counts1.values()))
    magnitude2 = math.sqrt(sum(count ** 2 for count in counts2.values()))

    denominator = magnitude1 * magnitude2
    if denominator == 0:
        # At least one text is empty: define the similarity as 0
        return 0
    return dot_product / denominator

def topic_segmentation_embed(text, threshold=0.5, num_topics=10):
    """Split `text` into at most `num_topics` topics using sentence embeddings.

    The text is lowercased (punctuation kept), split into sentences on '.',
    embedded, and the similarity of neighbouring sentences is computed. For
    each topic, candidate end positions start `len(sentences) // num_topics`
    sentences after the previous boundary; the topic is cut just before the
    first candidate whose window contains a neighbour similarity of at least
    `threshold`. The search stops as soon as no candidate qualifies.

    NOTE(review): the cut is triggered by a *high* similarity (>= threshold)
    although the step comment talks about dissimilarity -- confirm the intended
    direction of the comparison.

    Args:
        text: Raw input text.
        threshold: Similarity level that triggers a boundary.
        num_topics: Upper bound on the number of topics produced.

    Returns:
        list[str]: Topic strings, each the '.'-joined sentences of that topic.
    """
    text = preprocess_text(text, remPunct=False)

    # Step 1: Tokenize sentences
    sentences = text.split('.')

    # Step 2: Vectorize sentences using SentenceTransformer
    embeddings = sentence_embed(sentences)

    # Step 3: Calculate sentence similarity
    sentence_similarity = calculate_sentence_similarity(embeddings)

    # Number of sentences per topic. Forced to at least 1: a stride of 0 makes
    # the first similarity window empty, and max() of an empty slice raises.
    sentences_per_topic = max(1, len(sentences) // num_topics)

    # Step 4: Identify boundaries
    boundaries = [0]

    for _ in range(num_topics - 1):
        start = boundaries[-1]
        end_candidates = range(start + sentences_per_topic, len(sentence_similarity))
        max_dissimilarity_index = None

        for i in end_candidates:
            if max(sentence_similarity[start:i]) >= threshold:
                # Never place the cut at `start` itself: that would add an
                # empty topic and stall every following iteration.
                max_dissimilarity_index = max(i - 1, start + 1)
                break

        if max_dissimilarity_index is None:
            # If no suitable index was found, break the loop
            break

        boundaries.append(max_dissimilarity_index)

    # Close the last topic. A blank final fragment (text ending in '.') is
    # left out; a real trailing sentence without a final period is kept.
    last = len(sentences) if sentences[-1].strip() else len(sentences) - 1
    boundaries.append(last)

    # Step 5: Segment the text into topics based on the identified boundaries
    topics = []
    for i in range(len(boundaries) - 1):
        start = boundaries[i]
        end = boundaries[i + 1]
        topic_text = '.'.join(sentences[start:end])
        topics.append(topic_text)

    return topics

# Candidate similarity thresholds to evaluate
threshold_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# For every threshold: segment sample_text, then score each predicted segment
# by its best word-count cosine match among actual_segments.
for threshold in threshold_range:
    predicted_segments = topic_segmentation_embed(sample_text, threshold=threshold)

    if len(predicted_segments) != 10:
        # Not exactly 10 segments: report an all-zero score list instead
        similarity_scores = [0] * len(predicted_segments)
        print(f"Threshold: {threshold}, Number of Predicted Segments: {len(predicted_segments)}, Similarity Scores: {similarity_scores}")
        continue

    # Leading 0 mirrors the starting value of the running maximum
    similarity_scores = [
        max([0] + [calculate_cosine_similarity(obtained_segment, actual_segment)
                   for actual_segment in actual_segments])
        for obtained_segment in predicted_segments
    ]

    print(f"Threshold: {threshold}, Similarity Scores: {similarity_scores}")


In [None]:
# Candidate similarity thresholds to evaluate
threshold_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# For every threshold: segment sample_text and print the mean of the
# best-match scores of its predicted segments against actual_segments.
for threshold in threshold_range:
    predicted_segments = topic_segmentation_embed(sample_text, threshold=threshold)

    if len(predicted_segments) != 10:
        # Not exactly 10 segments: the average is reported as 0
        print(f"Threshold: {threshold}, Number of Predicted Segments: {len(predicted_segments)}, Average Similarity Score: 0")
        continue

    # Leading 0 mirrors the starting value of the running maximum
    similarity_scores = [
        max([0] + [calculate_cosine_similarity(obtained_segment, actual_segment)
                   for actual_segment in actual_segments])
        for obtained_segment in predicted_segments
    ]
    avg_similarity = sum(similarity_scores) / len(similarity_scores)

    print(f"Threshold: {threshold}, Average Similarity Score: {avg_similarity}")


In [None]:
import pandas as pd

# Similarity thresholds to evaluate; each value also becomes the suffix of the
# per-threshold DataFrame column names built from it in this cell
threshold_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Calculate the average similarity score for a given threshold
def calculate_average_similarity(predicted_segments, actual_segments, expected_segments=10):
    """Average best-match similarity of predicted segments against a reference.

    Each predicted segment is scored by its highest
    `calculate_cosine_similarity` value over all `actual_segments`; the
    function returns the mean of those scores.

    Args:
        predicted_segments: Segments produced by the segmenter.
        actual_segments: Reference segments to match against.
        expected_segments: Required number of predicted segments; a prediction
            with any other count scores 0. Pass None to disable the check.
            Defaults to 10, the count that was previously hard-coded.

    Returns:
        float: Mean best-match similarity, or 0 when the segment count does
        not match or there are no predicted segments.
    """
    if not predicted_segments:
        # Nothing to score (also avoids dividing by zero below)
        return 0
    if expected_segments is not None and len(predicted_segments) != expected_segments:
        return 0

    # Leading 0 is the floor of each best-match score
    similarity_scores = [
        max([0] + [calculate_cosine_similarity(obtained_segment, actual_segment)
                   for actual_segment in actual_segments])
        for obtained_segment in predicted_segments
    ]

    return sum(similarity_scores) / len(similarity_scores)

# For each threshold, add two columns to df: the predicted segments of every
# document and the average similarity of those segments to the reference ones.
for threshold in threshold_range:
    segments_column = f'Predicted_segments_{threshold}'
    score_column = f'Avg_score_{threshold}'

    def segment_at_threshold(text):
        return topic_segmentation_embed(text, threshold=threshold)

    def score_row(row):
        return calculate_average_similarity(row[segments_column], row['segments'])

    df[segments_column] = df['united_text'].apply(segment_at_threshold)
    df[score_column] = df.apply(score_row, axis=1)

In [None]:
# Preview the first rows of df, including the per-threshold prediction and score columns
df.head()

In [None]:
# Per-threshold score columns whose mean over all rows of df is reported
columns_to_average = [
    'Avg_score_0.1',
    'Avg_score_0.2',
    'Avg_score_0.3',
    'Avg_score_0.4',
    'Avg_score_0.5',
    'Avg_score_0.6',
    'Avg_score_0.7',
    'Avg_score_0.8',
    'Avg_score_0.9',
    'Avg_score_1.0',
]

# Column-wise mean of the selected score columns
score_columns = df[columns_to_average]
average_values = score_columns.mean()

# Show one mean per threshold column
print(average_values)