Authors: Naomi Baes and ChatGPT

Description: This script processes a collection of text files, each containing sentences grouped by time interval (e.g., 5-year periods such as 1970-1974). For each file, it generates sentence embeddings with a pre-trained model, calculates pairwise cosine dissimilarity scores (1 - cosine similarity) between those embeddings, and saves the results, along with any detected issues such as NaN values, to output files. Additionally, the script logs information about NaN values to a designated folder for further analysis and debugging.

Model url: https://huggingface.co/sentence-transformers/all-mpnet-base-v2

Note: This script takes a while to run (~60 minutes).

In [2]:
from sentence_transformers import SentenceTransformer
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import codecs  # Import the 'codecs' module for file I/O with specific encodings

# Load the pre-trained sentence transformer by model name
# (see sbert.net for the list of available models).
model = SentenceTransformer("all-mpnet-base-v2")

# Display the model; it should list a transformer model and a pooling layer.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [3]:
def calculate_dissimilarity_scores(embeddings):
    """Return the pairwise cosine dissimilarities of ``embeddings`` as a flat array.

    Dissimilarity is computed as ``1 - cosine_similarity``. Only the entries
    strictly above the diagonal of the resulting square matrix are kept, so
    each unordered pair of embeddings contributes exactly one score and
    self-comparisons are left out. Scores are ordered row by row.
    """
    distance_matrix = 1 - cosine_similarity(embeddings)

    # Row/column indices of every cell above the main diagonal (k=1 skips
    # the diagonal itself).
    size = distance_matrix.shape[0]
    rows, cols = np.triu_indices(size, k=1)

    return distance_matrix[rows, cols]

def main(data_folder, output_folder, model, encoding=None):
    """Compute and save pairwise dissimilarity scores for every sample file.

    Scans ``data_folder`` for regular files whose names end in ``.1`` to
    ``.10`` (e.g. ``mental_health.1970-1974.cohacoca.1``), embeds each line
    of a file with ``model``, and writes the scores returned by
    ``calculate_dissimilarity_scores`` to
    ``<output_folder>/<file_name>_cds_mpnet``, one value per line with six
    decimals. The number of NaN scores is printed for each file.

    A failure while processing one file is printed and does not stop the
    remaining files from being processed.

    Args:
        data_folder: Directory containing the sample files.
        output_folder: Directory that receives the score files; it is
            created if it does not exist yet.
        model: Object whose ``encode`` method accepts a list of sentences
            and returns one embedding per sentence (e.g. a
            ``SentenceTransformer``).
        encoding: Text encoding used to read the sample files. ``None``
            (the default) keeps the platform default, as before.
    """
    sample_suffixes = tuple(f".{i}" for i in range(1, 11))

    # List the inputs first so that a wrong ``data_folder`` still raises,
    # even when it is the same path as ``output_folder``.
    file_names = sorted(os.listdir(data_folder))  # sorted: deterministic order
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for file_name in file_names:
        if not file_name.endswith(sample_suffixes):
            continue
        file_path = os.path.join(data_folder, file_name)
        if not os.path.isfile(file_path):
            continue
        print(f"Processing file: {file_path}")

        # Expected name layout: <term>.<year_range>.<corpus>.<sample>
        parts = file_name.split('.')
        if len(parts) < 3:
            print(f"Error processing file {file_name}: unexpected file name format")
            continue
        year_range = parts[1]

        # Per-file boundary: report the failure and carry on with the
        # remaining files instead of aborting a long run.
        try:
            # The context manager guarantees the handle is closed.
            with open(file_path, "r", encoding=encoding) as handle:
                sentences = [line.strip() for line in handle]

            if not sentences:
                print(f"Skipping file {file_name}: no sentences found")
                continue

            # Encode the whole file in one call so the model can batch the
            # sentences, which is much faster than one call per sentence.
            # NOTE: batched inference may differ from per-sentence encoding
            # at floating-point rounding level.
            embeddings = model.encode(sentences)
            print(f"Number of embeddings generated: {len(embeddings)}")

            # Calculate dissimilarity scores directly from embeddings
            dissimilarity_scores = calculate_dissimilarity_scores(embeddings)

            # Output name is the original file name plus "_cds_mpnet"
            output_filename = os.path.join(output_folder, f"{file_name}_cds_mpnet")
            np.savetxt(output_filename, dissimilarity_scores, fmt='%.6f')
            print(f"Saved the dissimilarity scores to: {output_filename}")

            # Report how many scores are NaN and the year range they belong to
            nan_count = int(np.isnan(dissimilarity_scores).sum())
            print(f"Number of NaN values in the output file ({file_name}): {nan_count}")
            if nan_count > 0:
                nan_years = [year_range] * nan_count
                print(f"Years where NaN values appeared: {nan_years}")

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

# Sample files are read from, and score files written to, the same directory.
data_folder = output_folder = "output/5-year.cosine"

# Run the pipeline with the model loaded at the top of the script.
main(data_folder, output_folder, model)

print("Processing completed.")


Processing file: output/5-year.cosine\mental_health.1970-1974.cohacoca.1
Number of embeddings generated: 23
Saved the dissimilarity scores to: output/5-year.cosine\mental_health.1970-1974.cohacoca.1_cds_mpnet
Number of NaN values in the output file (mental_health.1970-1974.cohacoca.1): 0
Processing file: output/5-year.cosine\mental_health.1970-1974.cohacoca.10
Number of embeddings generated: 23
Saved the dissimilarity scores to: output/5-year.cosine\mental_health.1970-1974.cohacoca.10_cds_mpnet
Number of NaN values in the output file (mental_health.1970-1974.cohacoca.10): 0
Processing file: output/5-year.cosine\mental_health.1970-1974.cohacoca.2
Number of embeddings generated: 23
Saved the dissimilarity scores to: output/5-year.cosine\mental_health.1970-1974.cohacoca.2_cds_mpnet
Number of NaN values in the output file (mental_health.1970-1974.cohacoca.2): 0
Processing file: output/5-year.cosine\mental_health.1970-1974.cohacoca.3
Number of embeddings generated: 23
Saved the dissimilarit