In [1]:
import json
import string
import os
import numpy as np
import pandas as pd
from itertools import product

# Pearson, Spearman
import scipy

# NLP
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag
import nltk

# Word2Vec
from gensim.models import Word2Vec

# Explicit representation
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize

DEBUG = False

In [2]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/luigi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/luigi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/luigi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/luigi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Create the results directory if it does not exist
if not os.path.exists('results'):
    os.mkdir('results')

# Create the model directory if it does not exist
if not os.path.exists('model'):
    os.mkdir('model')    

In [93]:
!unzip -n dataset.zip -d ./data
!unzip -n ./data/sample_annotated_sentences.zip -d ./data/sample_annotated_sentences
!unzip -n ./data/semantic_simlex_v0.1.zip -d ./data/semantic_simlex_v0.1
! unzip -n ./data/student_prediciton_example.zip -d ./data/student_prediciton_example

Archive:  dataset.zip
Archive:  ./data/sample_annotated_sentences.zip
Archive:  ./data/semantic_simlex_v0.1.zip
Archive:  ./data/student_prediciton_example.zip


# MOSAICO Dataset Preprocessing

This section of the notebook is dedicated to preprocessing the MOSAICO dataset, a substantial collection of words with assigned WordNet meanings. The primary objective is to refine the dataset for semantic analysis by standardizing word forms and filtering for semantically relevant content.

The preprocessing begins by defining custom lemmas. This step is crucial to ensure words in different forms, such as 'people' and 'stolen', are uniformly represented as 'person' and 'steal', respectively. It aids in maintaining consistency across the dataset.

In [3]:
# Define some custom lemmas
custom_lemmas = {
    'people': 'person',
    'stolen': 'steal',
    'quicker': 'quick',
    'shrunk': 'shrink',
    'leaf': 'leave',
    'hung': 'hang',
}

Conversion of the part-of-speech tags from the Penn Treebank tagset to the WordNet tagset is provided, which is essential for accurate WordNet-based lemmatization. 

In [4]:
#Convert part-of-speech tags from the Penn Treebank tagset to the WordNet tagset.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

The `process_jsonl_file` function is at the core of our preprocessing workflow. It processes a JSONL file of the dataset, where each line is tokenized, POS-tagged, and lemmatized using both custom and standard WordNet lemmatization. 
A critical aspect of this function is the evaluation of sentences based on their annotations. In this step, we aim to replace specific words with their corresponding senses (as specified in the annotations). However, to ensure the reliability of the data, we first verify that the lemmatized form of a word is indeed present in the intended sense. If the lemma does not match the expected sense, the sentence is discarded. This procedure helps in filtering out potential errors in the dataset, ensuring that only accurately annotated sentences are retained for semantic analysis.

The processed data is segregated into two categories: semantic and non-semantic sentences. Semantic sentences comprise those that have been successfully lemmatized and annotated with the correct senses, whereas non-semantic sentences include those filtered out due to mismatches or stopwords.

In [5]:
# Process a JSONL file and return the semantic and non-semantic sentences tokens
def process_jsonl_file(filepath, custom_lemmas, limit=None):
    # Initialize a lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Initialize a spacy model
    nlp = spacy.load("en_core_web_sm")

    # Initialize some variables
    semantic_sentences = []
    non_semantic_sentences = []
    discarded_indices = []
    line_counter = 0

    # Define a set of stopwords for performance
    stop_words = set(stopwords.words('english'))
    # Include punctuation in the set of stopwords
    stop_words.update(set(string.punctuation))

    with open(filepath, 'r') as file:
        for i, line in enumerate(file):
            # Stop if limit is reached
            if limit is not None and line_counter >= limit: break
                
            # Load the JSON line and tokenize the sentence
            data = json.loads(line)
            tokens = data['text'].lower().split(' ')
            # Get the annotations
            annotations = data.get('annotations', [])
            
            # POS tagging
            pos_tags = pos_tag(tokens)
            # Lemmatize the tokens and prepare for annotations
            lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag) or wordnet.NOUN) for token, tag in pos_tags]

            # Initialize a flag to indicate if the sentence should be discarded
            discard_sentence = False

            # Iterate through the annotations
            for annotation in annotations:
                # Get the token span and label
                token_span = annotation['token_span']
                label = annotation['label']

                # Check if the token span is valid
                if token_span[1] <= len(lemmatized_tokens):
                    # Check if the lemma at the token span matches the label
                    lemma = lemmatized_tokens[token_span[0]]

                    # If the label is a substring of the lemma, it is valid
                    if lemma in label:
                        lemmatized_tokens[token_span[0]] = label

                    # Try with spacy lemmatization
                    elif nlp(tokens[token_span[0]])[0].lemma_ in label:
                        lemmatized_tokens[token_span[0]] = label

                    # Try with custom lemmatization
                    elif lemma in custom_lemmas and custom_lemmas[lemma] in label:
                        lemmatized_tokens[token_span[0]] = label
                        
                    # If the lemma does not match the label, discard the sentence
                    else:
                        if DEBUG:
                            print(f"\n{data['text']}' at line {i+1}")
                            print(lemmatized_tokens)
                            print(f"Error at index {token_span[0]}, label '{label}', token '{lemma}'")
                        discard_sentence = True
                        discarded_indices.append(i)
                        # Break the loop (stop checking the annotations)
                        break

            # If the sentence should not be discarded
            if not discard_sentence:
                # Append the semantic sentences
                semantic_sentences.append([token for token in lemmatized_tokens if token not in stop_words])
            
            # Append the non-semantic sentences
            non_semantic_sentences.append([token for token in tokens if token not in stop_words])

            # Increment the line counter
            line_counter += 1

    # Save the discarded indices
    with open('./results/discarded_indices.json', 'w') as file:
        json.dump(discarded_indices, file)
    
    # Return the processed sentences
    return semantic_sentences, non_semantic_sentences

The system also checks for existing processed data, loading it if available, or processes and saves new data if necessary. This approach enhances efficiency by avoiding repetitive processing.

In [6]:
semantic_data_path = './results/semantic_data.json'
non_semantic_data_path = './results/non_semantic_data.json'

# Load the processed data if it exists
if os.path.exists(semantic_data_path) and os.path.exists(non_semantic_data_path):
    with open(semantic_data_path, 'r') as file:
        semantic_sentences = json.load(file)
    with open(non_semantic_data_path, 'r') as file:
        non_semantic_sentences = json.load(file)
    print(f"Loaded processed data from {non_semantic_data_path}")

# Otherwise, process the sample annotated sentences and save to a JSON file
else:
    semantic_sentences, non_semantic_sentences = process_jsonl_file('data/sample_annotated_sentences/500000.jsonl', custom_lemmas, limit=None)
    with open(semantic_data_path, 'w') as file:
        json.dump(semantic_sentences, file)
    with open(non_semantic_data_path, 'w') as file:
        json.dump(non_semantic_sentences, file)
    print(f"Saved processed data to {non_semantic_data_path}")


# Print some statistics
print(f"Total discarded semantic sentences: {534300 - len(semantic_sentences)} ({(534300 - len(semantic_sentences)) / 534300 * 100:.2f}%)")

Saved processed data to ./results/non_semantic_data.json
Total discarded semantic sentences: 41500 (7.77%)
Total discarded non semantic sentences: 0 (0.00%)


# Word2Vec Training and Word Similarity Analysis

This section of the notebook focuses on training Word2Vec models using the preprocessed data for both non-semantic and semantic sentences. The models are tailored with specific parameters, including vector size, window size, and number of epochs, to effectively capture the intricate relationships within the datasets.


In [8]:
# Define the model
nsm_model = Word2Vec(
    non_semantic_sentences,
    vector_size=100,  # Vector size
    window=5,         # A larger window for more context
    min_count=1,      # Ignore words that appear less than 1 time
    workers=4,        # Match to the number of available CPU cores
    epochs=50,        # More epochs for better convergence
)

# Train the model
nsm_model.train(non_semantic_sentences, total_examples=nsm_model.corpus_count, epochs=nsm_model.epochs)

# Save the model and the embeddings
nsm_model_path = f"model/nsm_embeddings_{nsm_model.epochs}ep.model"
nsm_model.save(nsm_model_path)

In [9]:
# Define the model
sm_model = Word2Vec(
    semantic_sentences,
    vector_size=100,  # Consider trying larger sizes
    window=5,         # A larger window for more context
    min_count=1,      # Ignore words that appear less than 1 times
    workers=4,        # Match to the number of available CPU cores
    epochs=50,        # More epochs for better convergence
)

# Train the model
sm_model.train(semantic_sentences, total_examples=nsm_model.corpus_count, epochs=nsm_model.epochs)

# Save the model and the embeddings
sm_model_path = f"model/sm_embeddings_{sm_model.epochs}ep.model"
sm_model.save(sm_model_path)

Post-training, a unified function `compute_correlation_scores` is employed to compute correlation scores between word pairs. This function is a key component in evaluating the models' ability to replicate human-like word similarity assessments. It operates in two modes: 'non_semantic' and 'semantic', each addressing a distinct aspect of language understanding.

In 'non_semantic' mode, the function evaluates word pairs in their plain form, directly comparing their similarity against human-rated scores. This mode is crucial for understanding how the model perceives relationships between words without the influence of semantic context.

Conversely, in 'semantic' mode, the function acknowledges the multiple senses of words. It computes the maximum similarity across all possible combinations of senses for each word pair. This is particularly important for assessing the model's proficiency in capturing the subtleties of semantic meanings.

For both modes, the function iterates over word pairs, checks their presence in the model's vocabulary, and calculates similarity scores. It handles cases where valid word pairs are not found by appending a default score and can issue warnings for such instances.

The performance of the models in both semantic and non-semantic contexts is quantified using Pearson's r and Spearman's rho correlation coefficients. These metrics provide a comprehensive view of the models' alignment with human perception of word similarity, thus offering insights into the complexities of language understanding.

In [14]:
# Compute the correlation scores
def compute_correlation_scores(model, word_pair2score, mode, print_warning=DEBUG):
    # Initialize lists to store the scores
    human_scores = []
    system_scores = []

    # Iterate through the word pairs
    for word_pair, score in word_pair2score.items():
        # Initialize variables
        max_similarity = -1
        found_pair = False

        # Get the word pairs
        if mode == 'non_semantic':
            w1, w2 = word_pair          
            word_pairs = [(w1, w2)]
        elif mode == 'semantic':
            w1_senses, w2_senses = word_pair
            word_pairs = product(w1_senses.split(','), w2_senses.split(','))
        else:
            raise ValueError("Invalid mode. Choose 'non_semantic' or 'semantic'.")

        # Iterate through the word pairs
        for w1, w2 in word_pairs:
            # Check if the word pair is in the model
            if w1 in model.wv.key_to_index and w2 in model.wv.key_to_index:
                # Compute the similarity
                system_similarity = model.wv.similarity(w1, w2)
                # Update the max similarity
                max_similarity = max(max_similarity, system_similarity)
                # Update the flag
                found_pair = True

        # If the word pair was not found, append -1
        if not found_pair:
            if print_warning: 
                print(f"WARNING: No valid word pairs for {word_pair} were found in the embedding model.")
            system_scores.append(-1)    
        # Otherwise, append the max similarity if in semantic mode       
        elif found_pair and mode == 'semantic':
            system_scores.append(max_similarity)
        # Otherwise, append the system similarity if in non-semantic mode
        elif found_pair and mode == 'non_semantic':
            system_scores.append(system_similarity)
        # Append the human score
        human_scores.append(score)

    # Convert lists to numpy arrays for the pairs that were found
    if human_scores and system_scores:
        human_scores = np.array(human_scores)
        system_scores = np.array(system_scores)

        # Compute correlation
        pearson_r, _ = scipy.stats.pearsonr(human_scores, system_scores)
        spearman_rho = scipy.stats.spearmanr(human_scores, system_scores).correlation
    else:
        pearson_r, spearman_rho = float('nan'), float('nan')

    # Return the correlation scores
    return pearson_r, spearman_rho


Ultimately, this analysis provides a dual perspective on word similarity: one that considers the plain form of words and another that delves into their deeper semantic meanings. By comparing these two approaches, we gain valuable insights into the complexities of language understanding and the efficacy of Word2Vec models in different linguistic contexts.

Through this dual-mode analysis, we gain a deeper understanding of word similarities, both in the context of plain word forms and their deeper semantic meanings. This approach allows for a more nuanced evaluation of the Word2Vec models in capturing the essence of word relationships.

In [16]:
# Load the TSV file
simlex999_path = './data/semantic_simlex_v0.1/semantic_simlex_v0.1.tsv'
simlex999 = pd.read_csv(simlex999_path, delimiter='\t')

# Extract sense-annotated word pairs and their scores
nsm_word_pair2score = {(row['word1'], row['word2']): row['SimLex999'] for _, row in simlex999.iterrows()}

# Compute the correlation score and print the results
nsm_pearson_r, nsm_spearman_rho = compute_correlation_scores(nsm_model, nsm_word_pair2score, mode='non_semantic')
print(f"NON SEMANTIC score \nPearson's r: {nsm_pearson_r:.4f}, Spearman's rho: {nsm_spearman_rho:.4f}\n")


# Extract sense-annotated word pairs and their scores
sm_word_pair2score = {(row['Senses(w1)'], row['Senses(w2)']): row['SimLex999'] for _, row in simlex999.iterrows()}

# Compute the correlation score and print the results
sm_pearson_r, sm_spearman_rho = compute_correlation_scores(sm_model, sm_word_pair2score, mode='semantic')
print(f"SEMANTIC score: \nPearson's r: {sm_pearson_r:.4f}, Spearman's rho: {sm_spearman_rho:.4f}")

NON SEMANTIC score 
Pearson's r: 0.3726, Spearman's rho: 0.3592

SEMANTIC score: 
Pearson's r: 0.3390, Spearman's rho: 0.3216


# Maximum Similarity Computation and Analysis on SimLex-999 Dataset

This part of the notebook focuses on computing the maximum similarity between word pairs from the SimLex-999 dataset, using the previously trained Word2Vec models for semantic and non-semantic contexts. The aim is to assess how well these models capture the relationships between words as reflected in human-rated similarity scores.

The `compute_max_similarity` function is at the core of this analysis. It operates in two modes: 'semantic' and 'non-semantic'. In the 'semantic' mode, the function considers multiple senses of words, iterating through all possible sense combinations to find the highest similarity score. In contrast, the 'non-semantic' mode evaluates the similarity between words in their plain form, without considering different senses.

The function iterates through each word pair, checking if they exist in the model's vocabulary. If both words are found, it computes their similarity and updates the maximum similarity score. A warning is printed if a word pair is not found in the model, ensuring transparency in the analysis process.

In [28]:
# Comute max similarity
def compute_max_similarity(model, row, mode='semantic', print_warning=DEBUG):
    # Initialize the max similarity
    max_similarity = -1

    # Get the word pairs
    if mode == 'semantic':
        w1_senses = row['Senses(w1)'].split(',')
        w2_senses = row['Senses(w2)'].split(',')
        pairs = product(w1_senses, w2_senses)
    elif mode == 'non_semantic':
        pairs = [(row['word1'], row['word2'])]
    else:
        raise ValueError("Invalid mode. Choose 'semantic' or 'non_semantic'.")

    # Iterate through the word pairs
    for w1, w2 in pairs:
        # Check if the word pair is in the model
        if w1 in model.wv.key_to_index and w2 in model.wv.key_to_index:
            # Compute the similarity
            similarity = model.wv.similarity(w1, w2)
            # Update the max similarity
            max_similarity = max(max_similarity, similarity)

        # Otherwise, print a warning
        elif print_warning:
            print(f"WARNING: ({w1} and {w2}) are not present in the embedding model.")
            
    # Return the max similarity 
    return max_similarity

Following the similarity computation, the `process_simlex` function processes the SimLex-999 dataset. It calculates the maximum similarity for each word pair in the dataset, in either 'semantic' or 'non-semantic' mode as specified. The results are stored in a DataFrame, capturing the predicted similarity scores for each word pair.

In [29]:
# Process SimLex-999
def process_simlex(file_path, model, mode='semantic'):
    # Load the TSV file
    df = pd.read_csv(file_path, delimiter='\t')
    # Initialize a list to store the results
    results = []

    # Iterate through the rows
    for index, row in df.iterrows():
        # Compute the max similarity
        max_similarity = compute_max_similarity(model, row, mode=mode)
        # Append the results
        results.append({
            'Word1': row['word1'],
            'Word2': row['word2'],
            'Predicted_Similarity': max_similarity if max_similarity != -1 else 'N/A'
        })

    # Return the results
    return pd.DataFrame(results)

Finally, the results are saved into separate TSV files for non-semantic and semantic analyses. This step provides an organized and accessible record of the model's performance in both contexts. The separate files, 'non_semantic.tsv' and 'semantic.tsv', offer a clear comparison of how the models perform in understanding word similarities in different linguistic scenarios.

By analyzing the maximum similarity in these two distinct contexts, we gain valuable insights into the capabilities and limitations of our Word2Vec models in capturing the essence of word relationships as perceived by humans.

In [94]:
# Process SimLex-999 for non-semantic
nsm_df = process_simlex(simlex999_path, nsm_model, mode='non_semantic')
nsm_df.to_csv('./results/non_semantic.tsv', sep='\t', index=False, header=False)
print("Saved the results to non_semantic.tsv")

# Process SimLex-999 for semantic
sm_df = process_simlex(simlex999_path, sm_model, mode='semantic')
sm_df.to_csv('./results/semantic.tsv', sep='\t', index=False, header=False)
print("Saved the results to semantic.tsv")

Saved the results to non_semantic.tsv
Saved the results to semantic.tsv


We finally proceed to export the 5 most difficult pairs imo.

In [102]:
# Define the pairs and comments
lines = [
    ("happy", "young"),
    ("'happy' is an emotional state, while 'young' is a descriptor of age or development stage.", ""),
    ("cent", "size"),
    ("'cent' is a unit of currency, and 'size' is a dimension of physical space or volume.", ""),
    ("chapter", "choice"),
    ("'chapter' refers to a segment of a sequence, often used in the context of books, while 'choice' is about the act of selecting from available options.", ""),
    ("loop", "belt"),
    ("Both relate to circular forms, but different objects.", ""),
    ("sly", "tough"),
    ("Both can describe a person, but with different implications of return.", ""),
]

# Create a DataFrame from the pairs and comments
df = pd.DataFrame(lines, columns=['Word1', 'Word2'])

# Save the DataFrame to a TSV file
df.to_csv('./results/top_5_difficult.tsv', sep='\t', index=False, header=False)

# Computing explicit representations
In this section, we demonstrate the process of computing explicit word embeddings using Latent Semantic Analysis (LSA). The LSA method leverages a co-occurrence matrix and dimensionality reduction via Singular Value Decomposition (SVD) to produce dense word vectors.

First of all, to use the previously defined functions (i.e. to compute similarity scores), since our word_vectors is a NumPy array and the functions expect a model with a .wv attribute (like those from Gensim), we need to create a wrapper class that mimics this structure.

In [53]:
# Wrapper for the explicit representation
class WordEmbeddingModel:
    def __init__(self, word_vectors, vocab):
        self.word_vectors = word_vectors
        self.vocab = vocab
        self.wv = WordVectors(self.word_vectors, self.vocab)

class WordVectors:
    def __init__(self, word_vectors, vocab):
        self.word_vectors = word_vectors
        self.vocab = vocab
        self.key_to_index = vocab  # Mimicking Gensim's structure

    def similarity(self, word1, word2):
        if word1 in self.vocab and word2 in self.vocab:
            vec1 = self.word_vectors[self.vocab[word1], :]
            vec2 = self.word_vectors[self.vocab[word2], :]
            return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
        else:
            return None

We need to construct a co-occurrence matrix that encapsulates the frequency with which words appear in the vicinity of one another within a specified window size throughout the corpus. This matrix is fundamentally large and sparse.

In [54]:
# Create the co-occurrence matrix
def create_cooccurrence_matrix(corpus, window_size):
    # Create the vocabulary
    vocab = set(word for sentence in corpus for word in sentence)
    vocab = dict(zip(vocab, range(len(vocab))))
    
    # Initialize a co-occurrence matrix
    cooc_mat = defaultdict(Counter)

    # Iterate through the corpus
    for sentence in corpus:
        # Iterate through the words in the sentence
        for i, word in enumerate(sentence):
            # Get the start and end indices
            start = max(0, i - window_size)
            end = min(len(sentence), i + window_size + 1)
            # Iterate through the words in the window
            for j in range(start, end):
                # Update the co-occurrence matrix
                if i != j:
                    cooc_mat[vocab[word]][vocab[sentence[j]]] += 1
    
    # Initialize lists to store the data
    data, row_indices, col_indices = [], [], []

    # Iterate through the co-occurrence matrix
    for word_idx, counts in cooc_mat.items():
        # Iterate through the counts
        for col_idx, count in counts.items():
            # Append the data
            data.append(count)
            # Append the row and column indices
            row_indices.append(word_idx)
            # Append the column index
            col_indices.append(col_idx)
    
    # Create a sparse matrix
    cooc_sparse = csr_matrix((data, (row_indices, col_indices)), shape=(len(vocab), len(vocab)), dtype=np.float32)
    
    # Return the sparse matrix and the vocabulary
    return cooc_sparse, vocab

Next, we apply Singular Value Decomposition (SVD) to the co-occurrence matrix. This step reduces the dimensionality of the vectors while maintaining the most significant aspects of the data. We also normalize the vectors to unit length.

In [55]:
# Reduce the dimensions of the co-occurrence matrix
def reduce_dimensions(cooc_matrix, n_components):
    # Using Sparse SVD for dimensionality reduction
    u, _, _ = svds(cooc_matrix, k=n_components)
    # Normalize the vectors to unit length
    u = normalize(u, norm='l2', axis=1)
    return u

Finally, we can retrieve the dense vector representation for any word in our vocabulary by indexing into our matrix of word vectors.

In [56]:
# Create a co-occurrence matrix
nsm_cooc_matrix, nsm_vocab = create_cooccurrence_matrix(non_semantic_sentences, window_size=5)
sm_cooc_matrix, sm_vocab = create_cooccurrence_matrix(semantic_sentences, window_size=5)

# Reduce the dimensions of the co-occurrence matrix and return dense word vectors
nsm_word_vectors = reduce_dimensions(nsm_cooc_matrix, n_components=100)
sm_word_vectors = reduce_dimensions(sm_cooc_matrix, n_components=100)

# Create a word embedding model
nsm_model_explicit = WordEmbeddingModel(nsm_word_vectors, nsm_vocab)
sm_model_explicit = WordEmbeddingModel(sm_word_vectors, sm_vocab)

# Compute correlation
nsm_exp_pearson_r, nsm_spearman_exp_rho = compute_correlation_scores(nsm_model_explicit, nsm_word_pair2score, mode='non_semantic')
sm_exp_pearson_r, sm_spearman_exp_rho = compute_correlation_scores(sm_model_explicit, sm_word_pair2score, mode='semantic')

# Print the results
print(f"NON SEMANTIC score \nPearson's r: {nsm_exp_pearson_r:.4f}, Spearman's rho: {nsm_spearman_exp_rho:.4f}\n")
print(f"SEMANTIC score \nPearson's r: {sm_exp_pearson_r:.4f}, Spearman's rho: {sm_spearman_exp_rho:.4f}\n")

NON SEMANTIC score 
Pearson's r: 0.2145, Spearman's rho: 0.2278

SEMANTIC score 
Pearson's r: 0.2020, Spearman's rho: 0.2067

