# Import Dependencies

In [1]:
import nltk
import random

# !pip install sentence_transformers
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

import time

from elasticsearch import Elasticsearch

# Create the Elasticsearch client 'es' used by the later cells; it targets a
# node at http://localhost:9200.
es = Elasticsearch("http://localhost:9200")

# Load the sentence-transformer model used to embed transcript sentences and the ad question.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('pinecone/distiluse-podcast-nq')

# Fetch the NLTK resources (quiet=True keeps the download log out of the cell output).
# The `True` shown as this cell's output is the value of the last call below.
# NOTE(review): none of these resources are used in the cells visible here — confirm they are still needed.
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

# pinecone/distiluse-podcast-nq Test on Episodes

In [2]:
# Fixed evaluation set: the pre-identified episode URIs that every chatbot is
# tested on. One URI per line so additions/removals diff cleanly.
indicator_episodes = [
    "spotify:episode:5fG4VlWnWwzAt6mSs0H7lY",
    "spotify:episode:7JG3lLnRoDdOxuqjf14ZkM",
    "spotify:episode:3kkhUQJ9DXYs6aSdDmPp2V",
    "spotify:episode:4fJ6Y6IpljKy8FT8DZHx1L",
    "spotify:episode:5xBPWxqVCocdBgybmHjr5V",
    "spotify:episode:0X663c1I6j1cehJvy10WMm",
    "spotify:episode:61a1JjZO27lGCvCwBaCkpC",
    "spotify:episode:0goWRy1gwB23rQVy8ci7Wa",
    "spotify:episode:0BSD8QYmd2mQ1V43uIrU4I",
    "spotify:episode:5xH3cdpkxnJhQjPV22sxKC",
    "spotify:episode:0YPvJfSEw0jacPB3IeT37d",
    "spotify:episode:1gnpv26FFvIxpnwVbbRXv1",
    "spotify:episode:6rh4J52THn252yi7t11Yqf",
    "spotify:episode:3IfmcM2rcWb82601pkPvCh",
    "spotify:episode:5LJ33LdXWhqOu1KNad6D5q",
    "spotify:episode:1AxFBio6NwwG0MAjdCK5gK",
    "spotify:episode:3U33mRnDJcXywmBm1hahlL",
    "spotify:episode:57Nzb9H2VRZgHcNFChwbBG",
    "spotify:episode:6vRLNVEQ4xqtMxnms3RZh3",
    "spotify:episode:1tN044BhlPjjiluZ7Wo7UL",
    "spotify:episode:1fs86N6FLUKW2e5NdX1dF1",
    "spotify:episode:1A4cHtP3wIVQutpCgS7kd7",
    "spotify:episode:1Mi90UjG91rm73qvHzcG0t",
    "spotify:episode:3TuC8HZp9VdXtxYMQkJI0m",
    "spotify:episode:67v8V9SOXxivYQfAHSsc5f",
    "spotify:episode:3QE8qxHtJg3zMrq03R4GOj",
    "spotify:episode:3Vr6AUCTQgVWoE137b4IdB",
    "spotify:episode:4oQZAcd6xnxVA1e2GWaIFb",
    "spotify:episode:4sbSs1xtuf8dF3xtb7btUi",
    "spotify:episode:2RoLspr2PQTki51sfMhd20",
]

In [34]:
# Sanity check: number of pre-identified episode URIs (the recorded output is 30).
len(indicator_episodes)

30

In [33]:
# Number of best-matching transcript sentences kept per episode. The previous
# slice `[-25:-1]` kept 24 rows (25 minus the question itself), so 24 is kept
# here to leave the per-episode frames at their recorded (24, 4) shape. Raise it
# to 25 if a true "top 25" is wanted.
TOP_K = 24

# Ad question every transcript sentence is compared against (loop-invariant).
ad_question = "who are the advertisers, sponsors, advertisement, or ads and/or businesses, people, teams thanked?"

# One DataFrame per episode: the TOP_K sentences most similar to `ad_question`.
of_interest = []

# Loop through each episode and extract the relevant information
for episode in indicator_episodes:
    episode_ = {
        "query": {
            "match_phrase": {
                "_id": episode,
            }
        }
    }

    # Search for the episode in the index
    results = es.search(index="spotify_podcast_transcripts", body=episode_)
    hits = results["hits"]["hits"]
    if not hits:
        # Fail with the offending URI instead of a bare IndexError on hits[0].
        raise LookupError(
            f"No document found in 'spotify_podcast_transcripts' for {episode}"
        )

    # Extract the episode URI and its sentence-tokenised transcript
    episode_uri = hits[0]['_id']
    sent_tokens = hits[0]['_source']['sentence_tokens']

    # Begin timing model runtime
    start = time.time()

    # Embed the transcript and the question in one batch; the question is the
    # LAST row. A new list is built so the transcript itself is not mutated.
    sentence_encodings = model.encode(list(sent_tokens) + [ad_question], convert_to_tensor=True)
    sentence_encodings = sentence_encodings.cpu()

    # Cosine similarity of the question against every row, then drop the final
    # entry (the question compared with itself). Removing it by position rather
    # than assuming it is the single highest score keeps the result correct even
    # when a transcript sentence ties with (or duplicates) the question.
    vals = cosine_similarity(sentence_encodings[-1].reshape(1, -1), sentence_encodings)[0][:-1]

    # Indices of the TOP_K most similar sentences, in ascending similarity order
    # (same ordering as before). One argsort serves both the sentences and the
    # scores, so the two columns cannot fall out of alignment.
    index_top = vals.argsort()[max(len(vals) - TOP_K, 0):]
    top_scores = vals[index_top]
    top_responses = [sent_tokens[index] for index in index_top]

    # end timing runtime
    end = time.time()
    total_time = round((end-start)/60, 2)
    print(f"Total time for {episode_uri}: {total_time} minutes")

    # Create a DataFrame with the episode information; the scalar columns
    # (episode_uri, total_time) are broadcast to every row.
    episode_info = pd.DataFrame({
            "episode_uri": episode_uri,
            "top_responses": top_responses,
            "top_scores": top_scores,
            "total_time": total_time
        })
    print(episode_info.shape)
    of_interest.append(episode_info)

# Number of per-episode DataFrames collected
print(len(of_interest))

Total time for spotify:episode:5fG4VlWnWwzAt6mSs0H7lY: 0.52 minutes
(24, 4)
Total time for spotify:episode:7JG3lLnRoDdOxuqjf14ZkM: 0.38 minutes
(24, 4)
Total time for spotify:episode:3kkhUQJ9DXYs6aSdDmPp2V: 0.41 minutes
(24, 4)
Total time for spotify:episode:4fJ6Y6IpljKy8FT8DZHx1L: 0.43 minutes
(24, 4)
Total time for spotify:episode:5xBPWxqVCocdBgybmHjr5V: 0.5 minutes
(24, 4)
Total time for spotify:episode:0X663c1I6j1cehJvy10WMm: 0.26 minutes
(24, 4)
Total time for spotify:episode:61a1JjZO27lGCvCwBaCkpC: 0.46 minutes
(24, 4)
Total time for spotify:episode:0goWRy1gwB23rQVy8ci7Wa: 0.53 minutes
(24, 4)
Total time for spotify:episode:0BSD8QYmd2mQ1V43uIrU4I: 0.32 minutes
(24, 4)
Total time for spotify:episode:5xH3cdpkxnJhQjPV22sxKC: 0.19 minutes
(24, 4)
Total time for spotify:episode:0YPvJfSEw0jacPB3IeT37d: 0.38 minutes
(24, 4)
Total time for spotify:episode:1gnpv26FFvIxpnwVbbRXv1: 0.1 minutes
(24, 4)
Total time for spotify:episode:6rh4J52THn252yi7t11Yqf: 0.32 minutes
(24, 4)
Total time for

In [43]:
# Preview the file-name-safe form of an episode URI: take the episode_uri value
# in the first row/first column of the first result frame and swap ":" for "_".
of_interest[0].iat[0, 0].replace(":", "_")

'spotify_episode_5fG4VlWnWwzAt6mSs0H7lY'

In [44]:
from pathlib import Path

# Save each per-episode DataFrame to its own Excel file.
# NOTE(review): the evaluation cell later reads these files from
# 'output/Indicator_episode_results/first_chatbot_results/' — presumably they are
# moved there after a y_true column is added; confirm.
output_dir = Path('output/first_chatbot_results')
# to_excel does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

for df in of_interest:
    if df.empty:
        # No rows means no episode_uri to name the file after; nothing to save.
        continue
    # ":" is not valid in file names on some platforms, so use "_" instead.
    episode_uri = df.iloc[0,0].replace(":", "_")
    df.to_excel(output_dir / f'pinecone_{episode_uri}.xlsx', index=False)

# Examine classification results for pinecone/distiluse-podcast-nq

In [1]:
import ast

import pandas as pd


def _first_prediction(cell):
    """Return ``cell[0][0]`` for one ``sentence_predictions`` cell, or None.

    Excel cells only hold scalars, so a nested Python list written to the sheet
    is read back as its string repr. Indexing that string directly yields single
    characters, so the string is parsed back into Python objects first.
    Cells that are empty (NaN), cannot be parsed as a Python literal, or are not
    nested deeply enough produce None instead of raising.
    """
    if isinstance(cell, str):
        try:
            cell = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return None
    try:
        return cell[0][0]
    except (TypeError, IndexError, KeyError):
        return None


# episodes whose sentences didn't get classified. spotify:episode:0X663c1I6j1cehJvy10WMm, spotify:episode:1Mi90UjG91rm73qvHzcG0t
data = pd.read_excel('output/classification_indicator_episodes.xlsx', sheet_name='raw_output')

# get the sentences that were classified as advertisement
# NOTE(review): assumes each cell is a stringified nested list whose [0][0]
# entry holds the advertisement sentence(s) — confirm against the sheet.
data['advert_sentences'] = data['sentence_predictions'].apply(_first_prediction)

In [77]:
# Read the scored Excel file of every indicator episode (file name = prefix +
# episode URI with ":" replaced by "_").
path = 'output/Indicator_episode_results/first_chatbot_results/pinecone_'

# One scored DataFrame per episode, in the order of `indicator_episodes`.
scored_dfs = []

# iterate through all episodes
for episode in indicator_episodes:

    # read in episode excel file
    scored_df = pd.read_excel(f'{path}{episode.replace(":", "_")}.xlsx')

    # If a y_true cell is empty, impute 0 (treated as "not relevant").
    # Assign the result back: `scored_df['y_true'].fillna(0, inplace=True)` acts
    # on a temporary Series and leaves the frame unchanged under pandas
    # Copy-on-Write (and warns in pandas 2.x).
    scored_df['y_true'] = scored_df['y_true'].fillna(0)

    # sort by top scores, highest first (returns a new frame instead of mutating)
    scored_df = scored_df.sort_values(by=['top_scores'], ascending=False)
    scored_dfs.append(scored_df)

# test_episode = 'spotify:episode:7JG3lLnRoDdOxuqjf14ZkM'.replace(':', '_')
# test_scored_df = pd.read_excel(f'{path}{test_episode}.xlsx')

In [75]:
# test_scored_df.sort_values(by=['top_scores'], ascending=False)

Unnamed: 0,episode_uri,top_responses,top_scores,total_time,y_true
23,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,I'm sorry product right going right dude.,0.220177,0.38,0
22,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,It doesn't matter if you're 42 or 40 inches wh...,0.207098,0.38,0
21,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,"No, like wide receiver.",0.195982,0.38,0
20,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,"All right all yeah, that's the great unknown a...",0.191011,0.38,0
19,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,Well every one of those guys are capable of do...,0.190526,0.38,0
18,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,And if you're a premium user you can download ...,0.181691,0.38,0
17,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,"From your first no, but why me for sports purp...",0.179988,0.38,0
16,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,Why don't we alternate throwing names out?,0.178179,0.38,0
15,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,Take a relief following practices that are ope...,0.177365,0.38,0
14,spotify:episode:7JG3lLnRoDdOxuqjf14ZkM,They want to help but now they have to get to ...,0.17305,0.38,0


In [69]:
# Sanity check: one scored DataFrame was appended per episode (the recorded output is 30).
len(scored_dfs)

30

In [58]:
import numpy as np

# Ground-truth relevance for the episode currently held in `scored_df`, shaped
# (1, n_sentences): a single "query" row with one label per ranked sentence.
true_relevance = np.array([scored_df['y_true'].to_numpy()])
display(true_relevance)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]], dtype=int64)

In [60]:
# Model scores (cosine similarities) for the same episode, shaped
# (1, n_sentences) so they line up element-wise with `true_relevance`.
predicted_scores = np.array([scored_df['top_scores'].to_numpy()])
display(predicted_scores)

array([[0.13248363, 0.13354959, 0.13725534, 0.1400743 , 0.14035347,
        0.14086919, 0.14144759, 0.14327906, 0.14947793, 0.15029049,
        0.15133779, 0.15557562, 0.15909256, 0.16203743, 0.16310035,
        0.1678865 , 0.17047015, 0.18375085, 0.18465284, 0.1927107 ,
        0.203134  , 0.21267931, 0.23191629, 0.24184948]])

In [61]:
import numpy as np
from sklearn.metrics import ndcg_score

# Compute the NDCG score for the single episode prepared above. It measures how
# well the model's similarity ranking places the sentences marked relevant in
# y_true (advertisement sentences) near the top of the list.
ndcg = ndcg_score(y_true=true_relevance, y_score=predicted_scores)

print("NDCG score: ", ndcg)

NDCG score:  0.0


In [80]:
from sklearn.metrics import ndcg_score
import numpy as np

# Evaluate the ranking quality of the model on every scored episode.
model_name = 'pinecone/distiluse-podcast-nq'
ndcg_scores = []
# Parallel to ndcg_scores: True when the episode has at least one sentence with
# a positive y_true label.
has_relevant = []

for df in scored_dfs:
    # ndcg_score expects 2-D input: one row per "query" (here: one episode).
    true_relevance = np.asarray([df['y_true']])
    predicted_scores = np.asarray([df['top_scores']])

    ndcg = ndcg_score(true_relevance, predicted_scores)
    ndcg_scores.append(ndcg)
    has_relevant.append(bool((true_relevance > 0).any()))
    print("NDCG score: ", ndcg)

# Mean over ALL episodes (unchanged definition). Guarded so an empty list gives
# NaN without numpy's "mean of empty slice" warning.
avg_ndcg_score = np.mean(ndcg_scores) if ndcg_scores else float("nan")
print('Average NDCG score: ', avg_ndcg_score, "for model: ", model_name)

# When an episode has no positively labelled sentence, the ideal DCG is 0 and
# ndcg_score reports 0.0. Those zeros say nothing about ranking quality but
# still pull the overall mean down, so report them separately.
labelled_scores = [score for score, ok in zip(ndcg_scores, has_relevant) if ok]
print('Episodes without any relevant sentence: ', len(ndcg_scores) - len(labelled_scores), "of", len(ndcg_scores))
if labelled_scores:
    print('Average NDCG score over episodes with a relevant sentence: ', np.mean(labelled_scores))

NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.7134154243608393
NDCG score:  0.6334896111932239
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.6749855592156191
NDCG score:  0.2242438242175755
NDCG score:  0.6436409928154081
NDCG score:  0.7280404434255173
NDCG score:  0.6309297535714573
NDCG score:  0.21810429198553116
NDCG score:  0.7093946221322694
NDCG score:  0.22767024869695263
NDCG score:  0.3154648767857289
NDCG score:  0.3089957244886703
NDCG score:  0.4694080473965761
NDCG score:  0.27894294565112965
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.8005641284036986
NDCG score:  0.28906482631788766
NDCG score:  0.0
NDCG score:  0.6350189957328872
NDCG score:  0.0
Average NDCG score:  0.2833791438796991 for model:  pinecone/distiluse-podcast-nq


In [79]:
# ordered least to greatest
# scored_episodes = {}
# for df in scored_dfs:
#     true_relevance = np.asarray([df['y_true']])
#     predicted_scores = np.asarray([df['top_scores']])

#     ndcg = ndcg_score(true_relevance, predicted_scores)

#     print("NDCG score: ", ndcg)