IN THIS NOTEBOOK, CORRELATION ANALYSES ARE CONDUCTED BETWEEN EACH OF THE EXTRACTED SPEECH FEATURES (MFCC, X-VECTOR AND EMBEDDING) AND THE COSINE SIMILARITIES

STEP 1: load and preprocess the data

In [None]:
# Load the segment-level feature table from an HDF5 file.
# Later cells read the columns 'file', 'Speaker', 'mfcc', 'embedding' and
# 'xvector' from this frame.
import pandas as pd

loaded_segment_df = pd.read_hdf('FILE_PATH_THAT_STORES_THE_DATA')

In [None]:
# Regroup the segment rows so that there is one record per speaker.

# Speaker labels are only unique within a file, so prefix them with the file.
loaded_segment_df['unique_speaker'] = loaded_segment_df['file'] + "_" + loaded_segment_df['Speaker']

grouped = loaded_segment_df.groupby(['file', 'unique_speaker'])

# One dict per (file, speaker) group; each feature column becomes a list with
# one entry per segment of that speaker.
data_per_speaker = [
    {
        'file': file_name,
        'unique_speaker': unique_speaker,
        'mfccs': group['mfcc'].tolist(),
        'xvectors': group['xvector'].tolist(),
        'embeddings': group['embedding'].tolist(),
    }
    for (file_name, unique_speaker), group in grouped
]

grouped_df = pd.DataFrame(data_per_speaker)

In [None]:
# Persist the per-speaker table (grouped_df) as a pickle file.
store_path = 'path_to_store_the_orgnised_data'
grouped_df.to_pickle(store_path)

print(f"DataFrame saved to {store_path}")

STEP 2: prepare the parameters that will be used for correlation analysis, including:
- the cosine similarities obtained from different speech features (MFCC, x-vector or embedding) and averaged based on different granularities (segment level and speaker level)
- the MFCCs averaged based on different granularities
- the x-vectors averaged based on different granularities
- the embeddings averaged based on different granularities

STEP 2.1: prepare the parameters that will be used in analysis 1, which is the analysis with cosine similarities of embeddings.

We analyse the cosine similarities of embeddings in the first analysis because the referenced research (mentioned in the README) also uses this parameter, so we assume this analysis perspective is the more standard one.

STEP 2.1.1: preprocess the embeddings for analysis 1

In [None]:
# obtain the cosine similarity using embeddings

# calculate the speaker centroid based on embedding for obtaining cosine similarities
import numpy as np

# Attach an embedding centroid to every speaker record; the centroid is the
# reference vector for the embedding cosine similarities computed later.
for data in data_per_speaker:
    # data['embeddings'] holds one entry per segment, and each segment is
    # iterated embedding by embedding (the original notes describe 512
    # coefficients per embedding). Gather all of them into one stack.
    stacked_embeddings = np.vstack([
        embedding
        for embedding_per_segment in data['embeddings']
        for embedding in embedding_per_segment
    ])

    # Averaging over the stacked rows yields one vector per speaker.
    data['speaker_centroid'] = np.mean(stacked_embeddings, axis=0)

add_centroid_df = pd.DataFrame(data_per_speaker)

STEP 2.1.2: preprocess mfccs for analysis 1

In [None]:
#calculate the segment based mean mfcc vectors

def calculate_mean_mfcc(speaker_mfccs):
    """Return the axis-1 mean of every segment's MFCC array.

    Args:
        speaker_mfccs: iterable with one 2-D MFCC array per segment
            (presumably laid out as coefficients x frames; confirm upstream).

    Returns:
        list with one mean vector per segment, in input order.
    """
    return [np.mean(segment_mfcc, axis=1) for segment_mfcc in speaker_mfccs]

# New column: for each speaker, the list of per-segment mean MFCC vectors.
add_centroid_df['mean_mfcc_segments'] = add_centroid_df['mfccs'].apply(calculate_mean_mfcc)

# Re-wrap the frame under the name used by the following MFCC cells.
add_mean_mfcc_df = pd.DataFrame(add_centroid_df)

In [None]:
#calculate the segment based mean mfcc values

def calculate_mean_mfcc_value(mean_mfccs_vectors):
    """Collapse every per-segment mean MFCC vector to its axis-0 mean.

    Args:
        mean_mfccs_vectors: iterable of per-segment mean MFCC vectors.

    Returns:
        list with one mean value per segment, in input order.
    """
    return [np.mean(segment_vector, axis=0) for segment_vector in mean_mfccs_vectors]

# New column: for each speaker, the list of per-segment scalar MFCC summaries.
add_mean_mfcc_df['mean_mfcc_seg_value'] = add_mean_mfcc_df['mean_mfcc_segments'].apply(calculate_mean_mfcc_value)

# NOTE(review): add_mean_mfcc_value_df is not referenced again in the visible
# cells; the following cells keep working on add_mean_mfcc_df.
add_mean_mfcc_value_df = pd.DataFrame(add_mean_mfcc_df)

In [None]:
#calculate the speaker based mfcc vectors
def speaker_mean_mfcc_vector(mean_mfccs_vectors):
    """Average a speaker's per-segment mean MFCC vectors along axis 0.

    Args:
        mean_mfccs_vectors: sequence of equally sized per-segment vectors.

    Returns:
        The element-wise mean vector across segments.
    """
    return np.mean(mean_mfccs_vectors, axis=0)

# New column: one speaker-level mean MFCC vector per row.
add_mean_mfcc_df['speaker_mfcc_vector'] = add_mean_mfcc_df['mean_mfcc_segments'].apply(speaker_mean_mfcc_vector)

# Re-wraps the frame under its own name; no columns are added or removed here.
add_mean_mfcc_df = pd.DataFrame(add_mean_mfcc_df)

In [None]:
#calculate the speaker based mean mfcc values

def speaker_mean_mfcc_value(speaker_mean_mfcc_vector):
    """Reduce a speaker-level mean MFCC vector to its axis-0 mean.

    Args:
        speaker_mean_mfcc_vector: the speaker's mean MFCC vector.

    Returns:
        The mean of the vector's entries.
    """
    return np.mean(speaker_mean_mfcc_vector, axis=0)

# New column: one scalar MFCC summary per speaker.
add_mean_mfcc_df['speaker_mfcc_value'] = add_mean_mfcc_df['speaker_mfcc_vector'].apply(speaker_mean_mfcc_value)

# Re-wraps the frame under its own name; no columns are added or removed here.
add_mean_mfcc_df = pd.DataFrame(add_mean_mfcc_df)

STEP 2.1.3: preprocess xvectors for analysis 1

In [None]:
#calculate the segment based mean xvector values

def calculate_mean_xvector_seg(xvectors):
    """Return the axis-1 mean of every segment's x-vector array.

    Args:
        xvectors: iterable with one 2-D x-vector array per segment
            (presumably shaped (1, n_coefficients); confirm upstream).

    Returns:
        list with one axis-1 mean array per segment, in input order.
    """
    return [np.mean(segment_xvector, axis=1) for segment_xvector in xvectors]

# New column: for each speaker, the list of per-segment x-vector means.
add_mean_mfcc_df['seg_mean_xvector_value'] = add_mean_mfcc_df['xvectors'].apply(calculate_mean_xvector_seg)

# Re-wrap the frame under the name used by the remaining preprocessing cells.
add_mean_xvector_df = pd.DataFrame(add_mean_mfcc_df)

In [None]:
#calculate the speaker based mean xvector values

def speaker_mean_xvector_value(seg_mean_xvector_value):
    """Average a speaker's per-segment x-vector means along axis 0.

    Args:
        seg_mean_xvector_value: sequence of per-segment x-vector means.

    Returns:
        The axis-0 mean across segments.
    """
    return np.mean(seg_mean_xvector_value, axis=0)

# New column: one speaker-level x-vector mean per row.
add_mean_xvector_df['speaker_mean_xvector_value'] = add_mean_xvector_df['seg_mean_xvector_value'].apply(speaker_mean_xvector_value)

# Re-wraps the frame under its own name; no columns are added or removed here.
add_mean_xvector_df = pd.DataFrame(add_mean_xvector_df)

STEP 2.1.4: preprocess embeddings for analysis 1

In [None]:
#calculate the segment based mean embedding values

def seg_mean_embedding_value(embeddings):
    """Return one scalar per segment: the mean over all values of its embedding.

    Args:
        embeddings: iterable of per-segment embeddings (anything accepted by
            ``np.mean``).

    Returns:
        list with one ``np.mean`` result per segment, in input order.
    """
    # The mean is taken over every axis at once. A previous axis=0 mean was
    # computed and immediately overwritten, so that dead step (and the
    # commented-out per-frame draft) has been removed.
    return [np.mean(embedding_per_seg) for embedding_per_seg in embeddings]

# New column: for each speaker, the list of per-segment scalar embedding means.
add_mean_xvector_df['seg_embedding_value'] = add_mean_xvector_df['embeddings'].apply(seg_mean_embedding_value)

# Re-wraps the frame under its own name; no columns are added or removed here.
add_mean_xvector_df = pd.DataFrame(add_mean_xvector_df)

In [None]:
#calculate the speaker based mean embedding values

def speaker_mean_embedding_value(embeddings):
    """Return the overall mean of a speaker's per-segment embedding values.

    Args:
        embeddings: the speaker's per-segment embedding means.

    Returns:
        A single mean over all supplied values.
    """
    return np.mean(embeddings)

# New column: one scalar embedding summary per speaker.
add_mean_xvector_df['speaker_embedding_value'] = add_mean_xvector_df['seg_embedding_value'].apply(speaker_mean_embedding_value)

# Re-wraps the frame under its own name; no columns are added or removed here.
add_mean_xvector_df = pd.DataFrame(add_mean_xvector_df)

STEP 2.1.5: prepare cosine similarities for analysis 1

In [None]:
#calculate the segment based mean cosine similarities of embeddings

#calculate the cosine similarities of each embedding of selected segments
from sklearn.metrics.pairwise import cosine_similarity

# Placeholder column; each cell is filled below with a list of per-segment
# mean cosine similarities.
add_mean_xvector_df['seg_cos_sim'] = None

for index, data in add_mean_xvector_df.iterrows():
    centroid = data['speaker_centroid']

    per_segment_scores = []
    for embedding_per_segment in data['embeddings']:
        # Similarity of every embedding in the segment to the speaker
        # centroid; cosine_similarity returns a 1x1 matrix, hence [0][0].
        frame_scores = [
            cosine_similarity([embedding_per_frame], [centroid])[0][0]
            for embedding_per_frame in embedding_per_segment
        ]
        per_segment_scores.append(np.mean(frame_scores))

    add_mean_xvector_df.at[index, 'seg_cos_sim'] = per_segment_scores

In [None]:
# Speaker-level cosine similarity score: the mean of the speaker's per-segment
# similarities. Rows whose cell is not a list become NaN.
add_mean_xvector_df['speaker_cos_sim'] = add_mean_xvector_df['seg_cos_sim'].apply(
    lambda seg_scores: np.nan if not isinstance(seg_scores, list) else np.mean(seg_scores)
)

In [None]:
# Write the frame with all derived columns to CSV for later reference.
# NOTE(review): several columns hold Python lists / numpy arrays (e.g. 'mfccs',
# 'seg_cos_sim'); CSV stores only their text form. Confirm this file is for
# inspection only, or use to_pickle (as for grouped_df) if it must be reloaded.

output_file_path = 'your_output_file_path'

add_mean_xvector_df.to_csv(output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")

STEP 3: conduct correlation analysis 1, which is the analysis between speech features and the mean cosine similarities of embeddings

In [None]:
# Build flat Python lists for plotting.
# Segment-level columns store one list per speaker and are flattened to one
# entry per segment; speaker-level columns already hold one entry per row.
# NOTE: several names below rebind helper functions defined in earlier cells
# (e.g. speaker_mean_mfcc_vector, seg_mean_embedding_value), so those cells
# must be re-run before the helpers can be called again.
seg_mean_mfcc_vector = [vec for per_speaker in add_mean_xvector_df['mean_mfcc_segments'] for vec in per_speaker]
seg_mean_mfcc_value = [val for per_speaker in add_mean_xvector_df['mean_mfcc_seg_value'] for val in per_speaker]

speaker_mean_mfcc_vector = list(add_mean_xvector_df['speaker_mfcc_vector'])
speaker_mean_mfcc_value = list(add_mean_xvector_df['speaker_mfcc_value'])

seg_mean_xvector_value = [val for per_speaker in add_mean_xvector_df['seg_mean_xvector_value'] for val in per_speaker]
speaker_mean_xvector_value = list(add_mean_xvector_df['speaker_mean_xvector_value'])

seg_mean_embedding_value = [val for per_speaker in add_mean_xvector_df['seg_embedding_value'] for val in per_speaker]
speaker_mean_embedding_value = list(add_mean_xvector_df['speaker_embedding_value'])

seg_cos_sim = [score for per_speaker in add_mean_xvector_df['seg_cos_sim'] for score in per_speaker]
speaker_cos_sim = list(add_mean_xvector_df['speaker_cos_sim'])

STEP 3.1: conduct correlation analysis between MFCCs and the mean cosine similarities of embeddings

In [None]:
#segment based analysis with mfcc 1
#coefficientwise analysis: use the mean mfcc vector of each segment
#13 plots for 13 coefficients

import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

# Segment level, per coefficient: one scatter plot and one pair of correlation
# figures for each of the 13 MFCC coefficients.
seg_mean_mfcc_vector = np.array(seg_mean_mfcc_vector)
seg_cos_sim = np.array(seg_cos_sim)

for coeff_idx in range(13):
    # Column with this coefficient's mean value for every segment.
    coeff_values = seg_mean_mfcc_vector[:, coeff_idx]

    plt.figure(figsize=(8, 6))
    plt.scatter(seg_cos_sim, coeff_values, color='b', s=50)
    plt.title(f"MFCC Coefficient {coeff_idx + 1} vs Embedding Cosine Similarity", fontsize=16)
    plt.xlabel("Cosine Similarity (Segment-based)", fontsize=12)
    plt.ylabel(f"MFCC Coefficient {coeff_idx + 1} (Segment-based)", fontsize=12)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Pearson correlation (linear relationship)
    pearson_corr, pearson_p_value = pearsonr(coeff_values, seg_cos_sim)
    print(f'Linear correlation: {pearson_corr}')

    # Spearman's rank correlation
    spearman_corr, spearman_p_value = spearmanr(coeff_values, seg_cos_sim)
    print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Segment level, scalar MFCC summary (mean of each segment's mean MFCC vector)
# against the embedding cosine similarity.
seg_mean_mfcc_value = np.array(seg_mean_mfcc_value)
seg_cos_sim = np.array(seg_cos_sim)

plt.figure(figsize=(8, 6))
plt.scatter(seg_cos_sim, seg_mean_mfcc_value, color='b', s=50)
plt.title("MFCC vs Embedding Cosine Similarity", fontsize=16)
plt.xlabel("Embedding Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel("MFCC (Segment-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(seg_mean_mfcc_value, seg_cos_sim)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(seg_mean_mfcc_value, seg_cos_sim)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level, per coefficient: the speaker's mean MFCC vector against the
# embedding cosine similarity score; 13 plots, one per coefficient.
speaker_mean_mfcc_vector = np.array(speaker_mean_mfcc_vector)
speaker_cos_sim = np.array(speaker_cos_sim)

for coeff_idx in range(13):
    # Column with this coefficient's mean value for every speaker.
    coeff_values = speaker_mean_mfcc_vector[:, coeff_idx]

    plt.figure(figsize=(8, 6))
    plt.scatter(speaker_cos_sim, coeff_values, color='b', s=50)
    plt.title(f"MFCC Coefficient {coeff_idx + 1} vs Embedding Cosine Similarity Score", fontsize=16)
    plt.xlabel("Embedding Cosine Similarity Score", fontsize=12)
    plt.ylabel(f"MFCC Coefficient {coeff_idx + 1} (Speaker-based)", fontsize=12)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Pearson correlation (linear relationship)
    pearson_corr, pearson_p_value = pearsonr(coeff_values, speaker_cos_sim)
    print(f'Linear correlation: {pearson_corr}')

    # Spearman's rank correlation
    spearman_corr, spearman_p_value = spearmanr(coeff_values, speaker_cos_sim)
    print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level, scalar MFCC summary against the embedding cosine similarity
# score.
speaker_mean_mfcc_value = np.array(speaker_mean_mfcc_value)
speaker_cos_sim = np.array(speaker_cos_sim)

plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim, speaker_mean_mfcc_value, color='b', s=50)
plt.title("MFCC vs Embedding Cosine Similarity Score", fontsize=16)
plt.xlabel("Embedding Cosine Similarity Score", fontsize=12)
plt.ylabel("MFCC (Speaker-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(speaker_mean_mfcc_value, speaker_cos_sim)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(speaker_mean_mfcc_value, speaker_cos_sim)
print(f"Non-linear correlation: {spearman_corr}")

STEP 3.2: conduct correlation analysis between xvectors and the mean cosine similarities of embeddings

In [None]:
# Segment level: mean x-vector value per segment against the embedding cosine
# similarity. No per-coefficient plots because of the large number of
# coefficients (512).
seg_mean_xvector_value = np.array(seg_mean_xvector_value)
seg_cos_sim = np.array(seg_cos_sim)

plt.figure(figsize=(8, 6))
plt.scatter(seg_cos_sim, seg_mean_xvector_value, color='b', s=50)
plt.title("X-vector vs Embedding Cosine Similarity", fontsize=16)
plt.xlabel("Embedding Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel("X-vector (Segment-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Flatten both inputs to 1-D before the correlation calls (the x-vector means
# are presumably stored as length-1 arrays; confirm upstream).
seg_mean_xvector_value = seg_mean_xvector_value.ravel()
seg_cos_sim = seg_cos_sim.ravel()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(seg_mean_xvector_value, seg_cos_sim)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(seg_mean_xvector_value, seg_cos_sim)
print(f"Non-linear correlation: {spearman_corr}")



In [None]:
# Speaker level: mean x-vector value per speaker against the embedding cosine
# similarity score.
speaker_mean_xvector_value = np.array(speaker_mean_xvector_value)
speaker_cos_sim = np.array(speaker_cos_sim)

plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim, speaker_mean_xvector_value, color='b', s=50)
plt.title("X-vector vs Embedding Cosine Similarity Score", fontsize=16)
plt.xlabel("Embedding Cosine Similarity Score", fontsize=12)
plt.ylabel("X-vector (Speaker-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Flatten both inputs to 1-D before the correlation calls.
speaker_mean_xvector_value = speaker_mean_xvector_value.ravel()
speaker_cos_sim = speaker_cos_sim.ravel()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(speaker_mean_xvector_value, speaker_cos_sim)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(speaker_mean_xvector_value, speaker_cos_sim)
print(f"Non-linear correlation: {spearman_corr}")

STEP 3.3: conduct correlation analysis between embeddings and the mean cosine similarities of embeddings

In [None]:
# Segment level: mean embedding value per segment against the embedding
# cosine similarity.

seg_mean_embedding_value = np.array(seg_mean_embedding_value)
seg_cos_sim = np.array(seg_cos_sim)

plt.figure(figsize=(8, 6))
# Plot the embedding values on the y-axis so the figure matches its
# "Embedding" labels and the correlations printed below (the MFCC values were
# plotted here by mistake).
plt.scatter(seg_cos_sim, seg_mean_embedding_value, color='b', s=50)

plt.title(f"Embedding vs Embedding Cosine Similarity", fontsize=16)
plt.xlabel("Embedding Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel(f"Embedding (Segment-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(seg_mean_embedding_value, seg_cos_sim)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(seg_mean_embedding_value, seg_cos_sim)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level: mean embedding value per speaker against the embedding cosine
# similarity score.
speaker_mean_embedding_value = np.array(speaker_mean_embedding_value)
speaker_cos_sim = np.array(speaker_cos_sim)

plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim, speaker_mean_embedding_value, color='b', s=50)
plt.title("Embedding vs Embedding Cosine Similarity Score", fontsize=16)
plt.xlabel("Embedding Cosine Similarity Score", fontsize=12)
plt.ylabel("Embedding (Speaker-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(speaker_mean_embedding_value, speaker_cos_sim)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(speaker_mean_embedding_value, speaker_cos_sim)
print(f"Non-linear correlation: {spearman_corr}")

STEP 4: prepare data for correlation analysis 2 and 3

Analysis 2 is the analysis between speech features and the mean cosine similarities of MFCCs
Analysis 3 is the analysis between speech features and the mean cosine similarities of x-vectors

STEP 4.1: prepare the MFCC parameters that will be used for analysis

In [None]:
# MFCC-based speaker centroid.
# Following how the speaker centroid is calculated in Chowdhury et al. (2024),
# the MFCC-based centroid is simply the mean MFCC vector of each speaker, so
# the existing 'speaker_mfcc_vector' column is reused.

mfcc_speaker_centroid = np.array(list(add_mean_xvector_df["speaker_mfcc_vector"]))

add_mean_xvector_df["speaker_centroid_mfcc"] = add_mean_xvector_df["speaker_mfcc_vector"]

In [None]:
# Segment-level MFCC cosine similarity: compare every MFCC frame of a segment
# with the speaker's MFCC centroid and average the frame scores per segment.

# Placeholder column; each cell is filled below with a list of per-segment
# mean similarities.
add_mean_xvector_df['seg_cos_sim_mfcc'] = None

for index, data in add_mean_xvector_df.iterrows():
    mfcc_centroid = data['speaker_centroid_mfcc']

    seg_cos_sim_mfcc_list = []
    for mfcc_per_segment in data['mfccs']:
        # Transposing iterates frame by frame; this assumes each segment's
        # MFCC array is laid out as (coefficients, frames). TODO confirm.
        frame_similarities = [
            cosine_similarity([mfcc_per_frame], [mfcc_centroid])[0][0]
            for mfcc_per_frame in mfcc_per_segment.T
        ]
        # The temporaries deliberately avoid the name `seg_cos_sim`: that
        # global holds the flattened embedding similarities used by the
        # analysis-1 plots and was previously overwritten here with a scalar.
        seg_cos_sim_mfcc_list.append(np.mean(frame_similarities))

    add_mean_xvector_df.at[index, 'seg_cos_sim_mfcc'] = seg_cos_sim_mfcc_list

In [None]:
# Speaker-level mean cosine similarity of MFCCs (also defined as the cosine
# similarity score by Chowdhury et al. (2024)). Rows whose cell is not a list
# become NaN.
add_mean_xvector_df['speaker_cos_sim_mfcc'] = add_mean_xvector_df['seg_cos_sim_mfcc'].apply(
    lambda seg_scores: np.nan if not isinstance(seg_scores, list) else np.mean(seg_scores)
)

STEP 4.2: prepare x-vector parameters that will be used for analysis

In [None]:
# X-vector-based speaker centroid.
# According to Chowdhury et al. (2024), the speaker centroid is the average of
# the embeddings associated with all of the segments assigned to a speaker; in
# this case the x-vector-based centroid is the mean x-vector of each speaker.

speaker_centroid_xvector = [
    # Drop the singleton axis 1 of the stacked per-segment x-vectors, then
    # average over segments.
    np.mean(np.squeeze(row["xvectors"], axis=1), axis=0)
    for _, row in add_mean_xvector_df.iterrows()
]

add_mean_xvector_df["speaker_centroid_xvector"] = speaker_centroid_xvector

In [None]:
# Segment-level x-vector cosine similarity: similarity of each segment's
# x-vector to the speaker's x-vector centroid.

# Placeholder column; each cell is filled below with a list of per-segment
# similarities.
add_mean_xvector_df['seg_cos_sim_xvector'] = None

for index, data in add_mean_xvector_df.iterrows():
    # cosine_similarity needs 2-D inputs. reshape(1, -1) turns the centroid
    # into a single row for any vector length instead of hard-coding 512.
    xvector_centroid = np.reshape(data['speaker_centroid_xvector'], (1, -1))

    # Must stay a plain list: the speaker-level score cell checks
    # isinstance(x, list) before averaging.
    add_mean_xvector_df.at[index, 'seg_cos_sim_xvector'] = [
        cosine_similarity(xvector_per_segment, xvector_centroid)[0][0]
        for xvector_per_segment in data['xvectors']
    ]

In [None]:
# Speaker-level x-vector cosine similarity (cosine similarity score): the mean
# of the per-segment similarities. Rows whose cell is not a list become NaN.
add_mean_xvector_df['speaker_cos_sim_xvector'] = add_mean_xvector_df['seg_cos_sim_xvector'].apply(
    lambda seg_scores: np.nan if not isinstance(seg_scores, list) else np.mean(seg_scores)
)

STEP 5: conduct correlation analysis 2, which is the analysis between speech features and the mean cosine similarities of MFCCs

In [None]:
# Build flat Python lists for plotting analyses 2 and 3.
# Segment-level columns store one list per speaker and are flattened to one
# entry per segment; speaker-level columns already hold one entry per row.
seg_mean_mfcc_vector = [vec for per_speaker in add_mean_xvector_df['mean_mfcc_segments'] for vec in per_speaker]
seg_mean_mfcc_value = [val for per_speaker in add_mean_xvector_df['mean_mfcc_seg_value'] for val in per_speaker]

speaker_mean_mfcc_vector = list(add_mean_xvector_df['speaker_mfcc_vector'])
speaker_mean_mfcc_value = list(add_mean_xvector_df['speaker_mfcc_value'])

seg_mean_xvector_value = [val for per_speaker in add_mean_xvector_df['seg_mean_xvector_value'] for val in per_speaker]
speaker_mean_xvector_value = list(add_mean_xvector_df['speaker_mean_xvector_value'])

seg_mean_embedding_value = [val for per_speaker in add_mean_xvector_df['seg_embedding_value'] for val in per_speaker]
speaker_mean_embedding_value = list(add_mean_xvector_df['speaker_embedding_value'])

# Cosine similarities computed from MFCCs
seg_cos_sim_mfcc = [score for per_speaker in add_mean_xvector_df['seg_cos_sim_mfcc'] for score in per_speaker]
speaker_cos_sim_mfcc = list(add_mean_xvector_df['speaker_cos_sim_mfcc'])

# Cosine similarities computed from x-vectors
seg_cos_sim_xvector = [score for per_speaker in add_mean_xvector_df['seg_cos_sim_xvector'] for score in per_speaker]
speaker_cos_sim_xvector = list(add_mean_xvector_df['speaker_cos_sim_xvector'])

STEP 5.1: conduct correlation analysis between MFCCs and the mean cosine similarities of MFCCs

In [None]:
# Segment level, per coefficient: each of the 13 MFCC coefficients against the
# MFCC cosine similarity.
seg_mean_mfcc_vector = np.array(seg_mean_mfcc_vector)
seg_cos_sim_mfcc = np.array(seg_cos_sim_mfcc)

for coeff_idx in range(13):
    # Column with this coefficient's mean value for every segment.
    coeff_values = seg_mean_mfcc_vector[:, coeff_idx]

    plt.figure(figsize=(8, 6))
    plt.scatter(seg_cos_sim_mfcc, coeff_values, color='b', s=50)
    plt.title(f"MFCC Coefficient {coeff_idx + 1} vs MFCC Cosine Similarity", fontsize=16)
    plt.xlabel("MFCC Cosine Similarity (Segment-based)", fontsize=12)
    plt.ylabel(f"MFCC Coefficient {coeff_idx + 1} (Segment-based)", fontsize=12)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Pearson correlation (linear relationship)
    pearson_corr, pearson_p_value = pearsonr(coeff_values, seg_cos_sim_mfcc)
    print(f'Linear correlation: {pearson_corr}')

    # Spearman's rank correlation
    spearman_corr, spearman_p_value = spearmanr(coeff_values, seg_cos_sim_mfcc)
    print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level, per coefficient: each of the 13 MFCC coefficients against the
# MFCC cosine similarity score.
speaker_mean_mfcc_vector = np.array(speaker_mean_mfcc_vector)
speaker_cos_sim_mfcc = np.array(speaker_cos_sim_mfcc)

for coeff_idx in range(13):
    # Column with this coefficient's mean value for every speaker.
    coeff_values = speaker_mean_mfcc_vector[:, coeff_idx]

    plt.figure(figsize=(8, 6))
    plt.scatter(speaker_cos_sim_mfcc, coeff_values, color='b', s=50)
    plt.title(f"MFCC Coefficient {coeff_idx + 1} vs MFCC Cosine Similarity Score", fontsize=16)
    plt.xlabel("MFCC Cosine Similarity Score", fontsize=12)
    plt.ylabel(f"MFCC Coefficient {coeff_idx + 1} (Speaker-based)", fontsize=12)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Pearson correlation (linear relationship)
    pearson_corr, pearson_p_value = pearsonr(coeff_values, speaker_cos_sim_mfcc)
    print(f'Linear correlation: {pearson_corr}')

    # Spearman's rank correlation
    spearman_corr, spearman_p_value = spearmanr(coeff_values, speaker_cos_sim_mfcc)
    print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Segment level: scalar MFCC summary (mean of each segment's mean MFCC vector)
# against the MFCC cosine similarity.
seg_mean_mfcc_value = np.array(seg_mean_mfcc_value)
seg_cos_sim_mfcc = np.array(seg_cos_sim_mfcc)

plt.figure(figsize=(8, 6))
plt.scatter(seg_cos_sim_mfcc, seg_mean_mfcc_value, color='b', s=50)
plt.title("MFCC vs MFCC Cosine Similarity", fontsize=16)
plt.xlabel("MFCC Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel("MFCC (Segment-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(seg_mean_mfcc_value, seg_cos_sim_mfcc)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(seg_mean_mfcc_value, seg_cos_sim_mfcc)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level: scalar MFCC summary per speaker against the MFCC cosine
# similarity score.
speaker_mean_mfcc_value = np.array(speaker_mean_mfcc_value)
speaker_cos_sim_mfcc = np.array(speaker_cos_sim_mfcc)

plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim_mfcc, speaker_mean_mfcc_value, color='b', s=50)
plt.title("MFCC vs MFCC Cosine Similarity Score", fontsize=16)
plt.xlabel("MFCC Cosine Similarity Score", fontsize=12)
plt.ylabel("MFCC (Speaker-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(speaker_mean_mfcc_value, speaker_cos_sim_mfcc)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(speaker_mean_mfcc_value, speaker_cos_sim_mfcc)
print(f"Non-linear correlation: {spearman_corr}")

STEP 5.2: conduct correlation analysis between x-vectors and the mean cosine similarities of MFCCs

In [None]:
# Segment level: mean x-vector value per segment against the MFCC cosine
# similarity. No per-coefficient plots because of the large number of
# coefficients (512).

seg_mean_xvector_value = np.array(seg_mean_xvector_value)
seg_cos_sim_mfcc = np.array(seg_cos_sim_mfcc)

plt.figure(figsize=(8, 6))
# The x-axis must be the MFCC cosine similarity to match the labels and the
# correlations below (the x-vector similarities were plotted here by mistake).
plt.scatter(seg_cos_sim_mfcc, seg_mean_xvector_value, color='b', s=50)

plt.title(f"X-vector vs MFCC Cosine Similarity", fontsize=16)
plt.xlabel("MFCC Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel(f"X-vector (Segment-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Flatten the two arrays that are actually correlated (previously the
# unrelated `seg_cos_sim` was flattened instead of `seg_cos_sim_mfcc`).
seg_mean_xvector_value = np.ravel(seg_mean_xvector_value)
seg_cos_sim_mfcc = np.ravel(seg_cos_sim_mfcc)

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(seg_mean_xvector_value, seg_cos_sim_mfcc)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(seg_mean_xvector_value, seg_cos_sim_mfcc)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level: mean x-vector value per speaker against the MFCC cosine
# similarity score.

speaker_mean_xvector_value = np.array(speaker_mean_xvector_value)
speaker_cos_sim_mfcc = np.array(speaker_cos_sim_mfcc)

plt.figure(figsize=(8, 6))
# The x-axis must be the MFCC cosine similarity score to match the labels and
# the correlations below (the embedding-based `speaker_cos_sim` was plotted
# here by mistake).
plt.scatter(speaker_cos_sim_mfcc, speaker_mean_xvector_value, color='b', s=50)

plt.title(f"X-vector vs MFCC Cosine Similarity Score", fontsize=16)
plt.xlabel("MFCC Cosine Similarity Score", fontsize=12)
plt.ylabel(f"X-vector (Speaker-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Flatten the two arrays that are actually correlated (previously the
# unrelated `speaker_cos_sim` was flattened instead of `speaker_cos_sim_mfcc`).
speaker_mean_xvector_value = np.ravel(speaker_mean_xvector_value)
speaker_cos_sim_mfcc = np.ravel(speaker_cos_sim_mfcc)

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(speaker_mean_xvector_value, speaker_cos_sim_mfcc)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(speaker_mean_xvector_value, speaker_cos_sim_mfcc)
print(f"Non-linear correlation: {spearman_corr}")

STEP 5.3: conduct correlation analysis between embeddings and the mean cosine similarities of MFCCs

In [None]:
# Segment level: mean embedding value per segment against the MFCC cosine
# similarity.
seg_mean_embedding_value = np.array(seg_mean_embedding_value)
seg_cos_sim_mfcc = np.array(seg_cos_sim_mfcc)

plt.figure(figsize=(8, 6))
plt.scatter(seg_cos_sim_mfcc, seg_mean_embedding_value, color='b', s=50)
plt.title("Embedding vs MFCC Cosine Similarity", fontsize=16)
plt.xlabel("MFCC Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel("Embedding (Segment-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(seg_mean_embedding_value, seg_cos_sim_mfcc)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(seg_mean_embedding_value, seg_cos_sim_mfcc)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker level: mean embedding value per speaker against the MFCC cosine
# similarity score.
speaker_mean_embedding_value = np.array(speaker_mean_embedding_value)
speaker_cos_sim_mfcc = np.array(speaker_cos_sim_mfcc)

plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim_mfcc, speaker_mean_embedding_value, color='b', s=50)
plt.title("Embedding vs MFCC Cosine Similarity ", fontsize=16)
plt.xlabel("MFCC Cosine Similarity Score", fontsize=12)
plt.ylabel("Embedding (Speaker-based)", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (linear relationship)
pearson_corr, pearson_p_value = pearsonr(speaker_mean_embedding_value, speaker_cos_sim_mfcc)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation
spearman_corr, spearman_p_value = spearmanr(speaker_mean_embedding_value, speaker_cos_sim_mfcc)
print(f"Non-linear correlation: {spearman_corr}")

STEP 6: conduct correlation analysis 3, which is the analysis between speech features and the mean cosine similarities of x-vectors

STEP 6.1: conduct correlation analysis between MFCCs and the mean cosine similarities of x-vectors

In [None]:
# Segment level, per coefficient: each of the 13 MFCC coefficients against the
# x-vector cosine similarity.
seg_mean_mfcc_vector = np.array(seg_mean_mfcc_vector)
seg_cos_sim_xvector = np.array(seg_cos_sim_xvector)

for coeff_idx in range(13):
    # Column with this coefficient's mean value for every segment.
    coeff_values = seg_mean_mfcc_vector[:, coeff_idx]

    plt.figure(figsize=(8, 6))
    plt.scatter(seg_cos_sim_xvector, coeff_values, color='b', s=50)
    plt.title(f"MFCC Coefficient {coeff_idx + 1} vs X-vector Cosine Similarity ", fontsize=16)
    plt.xlabel("X-vector Cosine Similarity (Segment-based)", fontsize=12)
    plt.ylabel(f"MFCC Coefficient {coeff_idx + 1} (Segment-based)", fontsize=12)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Pearson correlation (linear relationship)
    pearson_corr, pearson_p_value = pearsonr(coeff_values, seg_cos_sim_xvector)
    print(f'Linear correlation: {pearson_corr}')

    # Spearman's rank correlation
    spearman_corr, spearman_p_value = spearmanr(coeff_values, seg_cos_sim_xvector)
    print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# MFCC analysis 1 (speaker level): each of the 13 MFCC coefficients is compared
# separately against the x-vector cosine similarity.
# presumably speaker_mean_mfcc_vector has one row per speaker - verify upstream

speaker_cos_sim_xvector = np.array(speaker_cos_sim_xvector)
speaker_mean_mfcc_vector = np.array(speaker_mean_mfcc_vector)

for coeff_idx in range(13):
    # Values of the current coefficient (one column of the 2-D array).
    coeff_values = speaker_mean_mfcc_vector[:, coeff_idx]

    # One figure per coefficient; titles/labels use 1-based numbering.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(speaker_cos_sim_xvector, coeff_values, color='b', s=50)
    ax.set_title(f"MFCC Coefficient {coeff_idx + 1} vs X-vector Cosine Similarity Score", fontsize=16)
    ax.set_xlabel("X-vector Cosine Similarity Score", fontsize=12)
    ax.set_ylabel(f"MFCC Coefficient {coeff_idx + 1} (Speaker-based)", fontsize=12)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    # Pearson correlation: strength of the linear relationship.
    pearson_corr, pearson_p_value = pearsonr(coeff_values, speaker_cos_sim_xvector)
    print(f'Linear correlation: {pearson_corr}')

    # Spearman's rank correlation: strength of a monotonic (possibly
    # non-linear) relationship.
    spearman_corr, spearman_p_value = spearmanr(coeff_values, speaker_cos_sim_xvector)
    print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# MFCC analysis 2 (segment level): a single scalar per segment (the mean of the
# segment's mean MFCC vector) is compared against the x-vector cosine
# similarity.

seg_cos_sim_xvector = np.array(seg_cos_sim_xvector)
seg_mean_mfcc_value = np.array(seg_mean_mfcc_value)

# Scatter plot: cosine similarity on the x-axis, mean MFCC value on the y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(seg_cos_sim_xvector, seg_mean_mfcc_value, color='b', s=50)
ax.set_title(f"MFCC vs X-vector Cosine Similarity ", fontsize=16)
ax.set_xlabel("X-vector Cosine Similarity (Segment-based)", fontsize=12)
ax.set_ylabel(f"MFCC (Segment-based)", fontsize=12)
ax.grid(True)
fig.tight_layout()
plt.show()

# Pearson correlation: strength of the linear relationship.
pearson_corr, pearson_p_value = pearsonr(seg_mean_mfcc_value, seg_cos_sim_xvector)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation: strength of a monotonic (possibly non-linear)
# relationship.
spearman_corr, spearman_p_value = spearmanr(seg_mean_mfcc_value, seg_cos_sim_xvector)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# MFCC analysis 2 (speaker level): a single mean MFCC value per speaker is
# compared against the x-vector cosine similarity.

speaker_cos_sim_xvector = np.array(speaker_cos_sim_xvector)
speaker_mean_mfcc_value = np.array(speaker_mean_mfcc_value)

# Scatter plot: cosine similarity on the x-axis, mean MFCC value on the y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(speaker_cos_sim_xvector, speaker_mean_mfcc_value, color='b', s=50)
ax.set_title(f"MFCC vs X-vector Cosine Similarity", fontsize=16)
ax.set_xlabel("X-vector Cosine Similarity (Speaker-based)", fontsize=12)
ax.set_ylabel(f"MFCC (Speaker-based)", fontsize=12)
ax.grid(True)
fig.tight_layout()
plt.show()

# Pearson correlation: strength of the linear relationship.
pearson_corr, pearson_p_value = pearsonr(speaker_mean_mfcc_value, speaker_cos_sim_xvector)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation: strength of a monotonic (possibly non-linear)
# relationship.
spearman_corr, spearman_p_value = spearmanr(speaker_mean_mfcc_value, speaker_cos_sim_xvector)
print(f"Non-linear correlation: {spearman_corr}")

STEP 6.2: conduct correlation analysis between x-vectors and the mean cosine similarities of x-vectors

In [None]:
# Segment-level analysis: mean x-vector value vs. x-vector cosine similarity.
# No coefficient-wise analysis here because of the large number of x-vector
# coefficients (512); the mean vector value of each segment is used instead.

# Flatten both series to 1-D up front so that the plot and the correlation
# functions receive inputs of matching shape. np.ravel also converts
# list inputs to arrays.
# (The original flattened an unrelated `seg_cos_sim` variable instead of the
# x-vector cosine similarities that are actually correlated below.)
seg_mean_xvector_value = np.ravel(seg_mean_xvector_value)
seg_cos_sim_xvector = np.ravel(seg_cos_sim_xvector)

# Scatter plot: cosine similarity on the x-axis, mean x-vector value on the y-axis.
plt.figure(figsize=(8, 6))
plt.scatter(seg_cos_sim_xvector, seg_mean_xvector_value, color='b', s=50)

plt.title(f"X-vector vs X-vector Cosine Similarity ", fontsize=16)
plt.xlabel("X-vector Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel(f"X-vector (Segment-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Pearson correlation: strength of the linear relationship.
pearson_corr, pearson_p_value = pearsonr(seg_mean_xvector_value, seg_cos_sim_xvector)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation: strength of a monotonic (possibly non-linear)
# relationship.
spearman_corr, spearman_p_value = spearmanr(seg_mean_xvector_value, seg_cos_sim_xvector)
print(f"Non-linear correlation: {spearman_corr}")



In [None]:
# Speaker-level analysis: mean x-vector value vs. x-vector cosine similarity.
# The mean x-vector value of each speaker is used.

# Flatten both series to 1-D up front so that the plot and the correlation
# functions receive inputs of matching shape. np.ravel also converts
# list inputs to arrays.
# (The original flattened an unrelated `speaker_cos_sim` variable instead of
# the x-vector cosine similarities that are actually correlated below.)
speaker_mean_xvector_value = np.ravel(speaker_mean_xvector_value)
speaker_cos_sim_xvector = np.ravel(speaker_cos_sim_xvector)

# Scatter plot: cosine similarity on the x-axis, mean x-vector value on the y-axis.
plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim_xvector, speaker_mean_xvector_value, color='b', s=50)

plt.title(f"X-vector vs X-vector Cosine Similarity Score ", fontsize=16)
plt.xlabel("X-vector Cosine Similarity Score", fontsize=12)
plt.ylabel(f"X-vector (Speaker-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Pearson correlation: strength of the linear relationship.
pearson_corr, pearson_p_value = pearsonr(speaker_mean_xvector_value, speaker_cos_sim_xvector)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation: strength of a monotonic (possibly non-linear)
# relationship.
spearman_corr, spearman_p_value = spearmanr(speaker_mean_xvector_value, speaker_cos_sim_xvector)
print(f"Non-linear correlation: {spearman_corr}")

STEP 6.3: conduct correlation analysis between embeddings and the mean cosine similarities of x-vectors

In [None]:
# Segment-level analysis: mean embedding value vs. x-vector cosine similarity.

# Convert both series to NumPy arrays before plotting and correlating.
seg_mean_embedding_value = np.array(seg_mean_embedding_value)
seg_cos_sim_xvector = np.array(seg_cos_sim_xvector)

# Scatter plot: cosine similarity on the x-axis, embedding value on the y-axis.
# The y-series must be the embedding values (the original plotted
# seg_mean_mfcc_value here, which contradicted the labels and the
# correlations computed below).
plt.figure(figsize=(8, 6))
plt.scatter(seg_cos_sim_xvector, seg_mean_embedding_value, color='b', s=50)

plt.title(f"Embedding vs  X-vector Cosine Similarity", fontsize=16)
plt.xlabel(" X-vector Cosine Similarity (Segment-based)", fontsize=12)
plt.ylabel(f"Embedding (Segment-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Pearson correlation: strength of the linear relationship.
pearson_corr, pearson_p_value = pearsonr(seg_mean_embedding_value, seg_cos_sim_xvector)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation: strength of a monotonic (possibly non-linear)
# relationship.
spearman_corr, spearman_p_value = spearmanr(seg_mean_embedding_value, seg_cos_sim_xvector)
print(f"Non-linear correlation: {spearman_corr}")

In [None]:
# Speaker-level analysis: mean embedding value vs. x-vector cosine similarity.

# Convert both series to NumPy arrays before plotting and correlating.
speaker_mean_embedding_value = np.array(speaker_mean_embedding_value)
speaker_cos_sim_xvector = np.array(speaker_cos_sim_xvector)

# Scatter plot: cosine similarity on the x-axis, embedding value on the y-axis.
plt.figure(figsize=(8, 6))
plt.scatter(speaker_cos_sim_xvector, speaker_mean_embedding_value, color='b', s=50)

plt.title(f"Embedding vs X-vector Cosine Similarity ", fontsize=16)
# Axis label fixed: the original had a stray, unmatched ")" at the end.
plt.xlabel("X-vector Cosine Similarity Score", fontsize=12)
plt.ylabel(f"Embedding (Speaker-based)", fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

# Pearson correlation: strength of the linear relationship.
pearson_corr, pearson_p_value = pearsonr(speaker_mean_embedding_value, speaker_cos_sim_xvector)
print(f'Linear correlation: {pearson_corr}')

# Spearman's rank correlation: strength of a monotonic (possibly non-linear)
# relationship.
spearman_corr, spearman_p_value = spearmanr(speaker_mean_embedding_value, speaker_cos_sim_xvector)
print(f"Non-linear correlation: {spearman_corr}")