# Import Dependencies

In [1]:
# !pip install sentence_transformers

import nltk
from elasticsearch import Elasticsearch
es = Elasticsearch("http://localhost:9200")
import pandas as pd
import torch
import torch.nn.functional as F
import time

from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the sequence-classification model and its matching tokenizer from the
# same Hugging Face checkpoint.
model_class = AutoModelForSequenceClassification.from_pretrained('morenolq/spotify-podcast-advertising-classification')
tokenizer = AutoTokenizer.from_pretrained('morenolq/spotify-podcast-advertising-classification')

# Fetch the NLTK resources; quiet=True suppresses the download log.
# 'punkt' backs nltk.sent_tokenize, which is used further down in this notebook.
# NOTE(review): 'stopwords', 'wordnet' and 'omw-1.4' are not referenced in the
# visible cells - confirm they are still needed.
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

# morenolq/spotify-podcast-advertising-classification test on Episodes

In [2]:
# Previously indicated episodes to test all chatbots against.
# Only the bare episode ids are listed; the shared "spotify:episode:" prefix is
# added when the list is built.
indicator_episodes = [
    f"spotify:episode:{episode_id}"
    for episode_id in (
        "5fG4VlWnWwzAt6mSs0H7lY",
        "7JG3lLnRoDdOxuqjf14ZkM",
        "3kkhUQJ9DXYs6aSdDmPp2V",
        "4fJ6Y6IpljKy8FT8DZHx1L",
        "5xBPWxqVCocdBgybmHjr5V",
        "0X663c1I6j1cehJvy10WMm",
        "61a1JjZO27lGCvCwBaCkpC",
        "0goWRy1gwB23rQVy8ci7Wa",
        "0BSD8QYmd2mQ1V43uIrU4I",
        "5xH3cdpkxnJhQjPV22sxKC",
        "0YPvJfSEw0jacPB3IeT37d",
        "1gnpv26FFvIxpnwVbbRXv1",
        "6rh4J52THn252yi7t11Yqf",
        "3IfmcM2rcWb82601pkPvCh",
        "5LJ33LdXWhqOu1KNad6D5q",
        "1AxFBio6NwwG0MAjdCK5gK",
        "3U33mRnDJcXywmBm1hahlL",
        "57Nzb9H2VRZgHcNFChwbBG",
        "6vRLNVEQ4xqtMxnms3RZh3",
        "1tN044BhlPjjiluZ7Wo7UL",
        "1fs86N6FLUKW2e5NdX1dF1",
        "1A4cHtP3wIVQutpCgS7kd7",
        "1Mi90UjG91rm73qvHzcG0t",
        "3TuC8HZp9VdXtxYMQkJI0m",
        "67v8V9SOXxivYQfAHSsc5f",
        "3QE8qxHtJg3zMrq03R4GOj",
        "3Vr6AUCTQgVWoE137b4IdB",
        "4oQZAcd6xnxVA1e2GWaIFb",
        "4sbSs1xtuf8dF3xtb7btUi",
        "2RoLspr2PQTki51sfMhd20",
    )
]

In [6]:
# One DataFrame per indicator episode, holding the sentences the classifier
# labelled as advertisements.
of_interest = []

for episode in indicator_episodes:
    # Look the transcript up by its Elasticsearch document id (the episode URI).
    episode_ = {
        "query": {
            "match_phrase": {
                "_id": episode,
            }
        }
    }

    results = es.search(index="spotify_podcast_transcripts", body=episode_)
    hits = results["hits"]["hits"]

    if not hits:
        # Report missing transcripts instead of skipping them silently.
        print(f"No transcript found for {episode}; skipping")
        continue

    episode_uri = hits[0]['_id']
    sentences = hits[0]['_source']['sentence_tokens']

    # Sentences predicted as advertisements and the probability of that label.
    ad_sentences = []
    probabilities = []
    start = time.time()

    # Inference only: without no_grad() every forward pass records an autograd
    # graph that is never used, which costs memory and time.
    with torch.no_grad():
        for i, s in enumerate(sentences):
            # Each sentence is classified as a pair with the sentence before it;
            # the first sentence has no predecessor, so a start marker stands in.
            context = "__START__" if i == 0 else sentences[i-1]

            out = tokenizer(context,
                            s, padding = "max_length", max_length = 256,truncation=True,
                            return_attention_mask=True, return_tensors = 'pt')
            outputs = model_class(**out)
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Label 1 is treated as "advertisement": keep the sentence together
            # with the softmax probability of that label.
            if predictions.item() == 1:
                probabilities_ = F.softmax(outputs.logits, dim=1)
                ad_sentences.append(s)
                probabilities.append(probabilities_.tolist()[0][1])

    end = time.time()
    total_time = round((end-start)/60, 2)
    print(f"Total time for {episode_uri}: {total_time} minutes")

    # The scalar values are broadcast to one row per advertisement sentence; an
    # episode with no such sentences produces an empty frame.
    episode_info = pd.DataFrame({
        "episode_uri": episode_uri,
        "top_responses": ad_sentences,
        "top_scores": probabilities,
        "total_time": total_time
        })
    of_interest.append(episode_info)

len(of_interest)

Total time for spotify:episode:5fG4VlWnWwzAt6mSs0H7lY: 13.52 minutes
Total time for spotify:episode:7JG3lLnRoDdOxuqjf14ZkM: 9.71 minutes
Total time for spotify:episode:3kkhUQJ9DXYs6aSdDmPp2V: 8.1 minutes
Total time for spotify:episode:4fJ6Y6IpljKy8FT8DZHx1L: 8.4 minutes
Total time for spotify:episode:5xBPWxqVCocdBgybmHjr5V: 10.75 minutes
Total time for spotify:episode:0X663c1I6j1cehJvy10WMm: 3.76 minutes
Total time for spotify:episode:61a1JjZO27lGCvCwBaCkpC: 9.88 minutes
Total time for spotify:episode:0goWRy1gwB23rQVy8ci7Wa: 6.17 minutes
Total time for spotify:episode:0BSD8QYmd2mQ1V43uIrU4I: 4.01 minutes
Total time for spotify:episode:5xH3cdpkxnJhQjPV22sxKC: 1.93 minutes
Total time for spotify:episode:0YPvJfSEw0jacPB3IeT37d: 5.68 minutes
Total time for spotify:episode:1gnpv26FFvIxpnwVbbRXv1: 1.35 minutes
Total time for spotify:episode:6rh4J52THn252yi7t11Yqf: 7.4 minutes
Total time for spotify:episode:3IfmcM2rcWb82601pkPvCh: 1.45 minutes
Total time for spotify:episode:5LJ33LdXWhqOu1KNad

30

In [7]:
# preview example of identified sentences for one episode
of_interest[0]

Unnamed: 0,episode_uri,top_responses,top_scores,total_time
0,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,So download the free anchor app or go to Ancho...,0.523892,13.52
1,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,F m-- to get started.,0.872203,13.52
2,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,Okay.,0.560135,13.52
3,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,It's been a long.,0.525132,13.52
4,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,I don't think it's that important.,0.561293,13.52
5,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,Hi.,0.747715,13.52
6,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,I just kind of wish it so that's where we got ...,0.551299,13.52
7,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,It was weird.,0.512934,13.52
8,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,Yeah.,0.590432,13.52
9,spotify:episode:5fG4VlWnWwzAt6mSs0H7lY,Yeah the majority.,0.53159,13.52


In [10]:
import os

# Classification per episode: write each non-empty result frame to its own
# Excel file and remember which episodes produced no rows.
index_problem_df = []   # positions in of_interest whose frame is empty
returned_df = []        # episode URIs that were written to disk
path = 'output/Indicator_episode_results/classification_per_episode/'

# to_excel does not create missing parent directories, so make sure the
# target folder exists before the first write.
os.makedirs(path, exist_ok=True)

for count, df in enumerate(of_interest):

    # Nothing was classified as an advertisement for this episode.
    if len(df) == 0:
        index_problem_df.append(count)
        continue

    # Read the URI by column name rather than by position so the lookup does
    # not depend on the column order of the frame.
    episode_uri = df['episode_uri'].iloc[0]
    returned_df.append(episode_uri)

    # Replace ':' in the URI with '_' before using it in the file name.
    episode_uri = episode_uri.replace(":", "_")
    df.to_excel(f'{path}classify_{episode_uri}.xlsx', index=False)

In [11]:
index_problem_df

[5, 22]

# New pinecone/distiluse-podcast-nq Results

In [None]:
from sklearn.metrics import ndcg_score
import numpy as np

# episodes whose sentences didn't get classified. spotify:episode:0X663c1I6j1cehJvy10WMm, spotify:episode:1Mi90UjG91rm73qvHzcG0t

# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet name.
data = pd.read_excel('output/Indicator_episode_results/classification_indicator_episodes.xlsx', sheet_name=None)

# Score every sheet except the three removed below.
final_dfs = list(data.keys())
final_dfs.remove('episode_sentences')
final_dfs.remove('episode_sentences_T')
final_dfs.remove('raw_output')

path = 'output/Indicator_episode_results/'

# Prepare each sheet for scoring.
scored_dfs = []
for episode in final_dfs:
    scored_df = data[episode]

    # Rows without a label count as not relevant. The result is assigned back
    # because fillna(inplace=True) on a selected column is chained assignment:
    # it warns on recent pandas and does not update the frame under
    # Copy-on-Write.
    scored_df['y_true'] = scored_df['y_true'].fillna(0)

    # Every row receives the same predicted score of 1.
    scored_df['top_scores'] = 1

    scored_df.sort_values(by=['top_scores'], ascending=False, inplace=True)
    scored_dfs.append(scored_df)

ndcg_scores = []

for sheet_name, df in zip(final_dfs, scored_dfs):

    # Wrapping the columns in a list yields 2-D arrays with a single row,
    # which is the layout ndcg_score is called with here.
    true_relevance = np.asarray([df['y_true']])
    predicted_scores = np.asarray([df['top_scores']])

    # Report sheets that ndcg_score rejects and keep going, rather than
    # aborting the whole evaluation.
    try:
        ndcg = ndcg_score(true_relevance, predicted_scores)
    except ValueError as error:
        print(error)
        print("Sheet: ", sheet_name)
        continue

    ndcg_scores.append(ndcg)
    print("NDCG score: ", ndcg)

# Average over the sheets that could be scored.
avg_ndcg_score = np.mean(ndcg_scores)

# NOTE(review): the workbook read above is the classification output, while the
# label below names a different model - confirm which model these scores belong to.
model_name = 'pinecone/distiluse-podcast-nq'
print('Average NDCG score: ', avg_ndcg_score, "for model: ", model_name)

# Old: Examine classification results

In [2]:
# Two episodes produced no classified sentences:
# spotify:episode:0X663c1I6j1cehJvy10WMm and spotify:episode:1Mi90UjG91rm73qvHzcG0t
workbook_path = 'output/Indicator_episode_results/classification_indicator_episodes.xlsx'

# sheet_name=None returns a dict that maps each sheet name to its DataFrame.
data = pd.read_excel(workbook_path, sheet_name=None)
data.keys()

dict_keys(['Sheet30', 'Sheet29', 'Sheet28', 'Sheet27', 'Sheet26', 'Sheet25', 'Sheet24', 'Sheet23', 'Sheet22', 'raw_output', 'Sheet21', 'Sheet20', 'Sheet19', 'Sheet18', 'Sheet17', 'Sheet16', 'Sheet15', 'Sheet14', 'Sheet13', 'Sheet12', 'Sheet11', 'Sheet10', 'Sheet9', 'Sheet8', 'Sheet7', 'Sheet6', 'Sheet5', 'Sheet4', 'Sheet3', 'Sheet2', 'Sheet1', 'episode_sentences', 'episode_sentences_T'])

In [3]:
# look at the resulting data from episodes
final_dfs = list(data.keys())
final_dfs.remove('episode_sentences')
final_dfs.remove('episode_sentences_T')
final_dfs.remove('raw_output')

In [4]:
# Spot-check the contents of a single sheet from the loaded workbook.
data['Sheet27']

Unnamed: 0,episode_uri,human_identified,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,y_true
0,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,Oh that's my age.,0.0
1,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,Okay you got that.,0.0
2,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,I can it's so but it's Really a for it's afte...,0.0
3,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,Yeah.,0.0
4,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,I'm here.,0.0
5,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,Yeah.,0.0
6,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,,
7,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,,
8,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,,
9,spotify:episode:3Vr6AUCTQgVWoE137b4IdB,,,


In [5]:
path = 'output/Indicator_episode_results/'

# Per-sheet frames prepared for NDCG scoring.
scored_dfs = []

for episode in final_dfs:
    scored_df = data[episode]

    # Rows without a label count as not relevant. The result is assigned back
    # because fillna(inplace=True) on a selected column is chained assignment:
    # it warns on recent pandas and does not update the frame under
    # Copy-on-Write.
    scored_df['y_true'] = scored_df['y_true'].fillna(0)

    # Every row receives the same predicted score of 1.
    scored_df['top_scores'] = 1

    scored_df.sort_values(by=['top_scores'], ascending=False, inplace=True)
    scored_dfs.append(scored_df)

In [6]:
from sklearn.metrics import ndcg_score
import numpy as np

model = 'morenolq/spotify-podcast-advertising-classification'

# One NDCG value per frame that could be scored.
ndcg_scores = []

# Score each prepared frame.
for episode_frame in scored_dfs:

    # The first cell of the frame identifies the episode in error reports.
    episode_uri = episode_frame.iloc[0, 0]

    # Wrapping the column in a list yields a 2-D array with a single row.
    true_relevance = np.asarray([episode_frame['y_true']])

    try:
        predicted_scores = np.asarray([episode_frame['top_scores']])
        ndcg = ndcg_score(true_relevance, predicted_scores)
    except ValueError as error:
        # Report the failing episode and move on to the next frame.
        print(error)
        print("Episode URI: ", episode_uri)
    else:
        ndcg_scores.append(ndcg)
        print("NDCG score: ", ndcg)

avg_ndcg_score = np.mean(ndcg_scores)
print('Average NDCG score: ', avg_ndcg_score, "for model: ", model)

NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.4169134448340216
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.44836183544011915
NDCG score:  0.44836183544011915
NDCG score:  0.0
NDCG score:  0.44836183544011915
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.3656233288925999
NDCG score:  0.3656233288925999
NDCG score:  0.0
NDCG score:  0.3656233288925999
NDCG score:  0.3656233288925999
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.0
NDCG score:  0.4169134448340216
NDCG score:  0.0
NDCG score:  0.3656233288925999
NDCG score:  0.44836183544011915
NDCG score:  0.0
NDCG score:  0.44836183544011915
NDCG score:  0.4169134448340216
Average NDCG score:  0.1773555385388553 for model:  morenolq/spotify-podcast-advertising-classification


# Previous work

In [3]:
# Load the saved test transcripts and take the transcript stored at index label 1.
results = pd.read_csv("output/test_transcripts.csv")
test_1_transcript = results['transcript'].loc[1]

# Split the transcript text into a list of sentences.
sent_tokens = nltk.sent_tokenize(test_1_transcript)
len(sent_tokens)

676

In [4]:
# (sentence, predicted label) pairs for the sentences predicted as label 1.
sentence_outputs_tuples = []
start = time.time()

# Inference only: without no_grad() every forward pass records an autograd
# graph that is never used, which costs memory and time.
with torch.no_grad():
    # Only the first 200 sentences are classified.
    for i, s in enumerate(sent_tokens[:200]):
        # Each sentence is classified as a pair with the sentence before it;
        # the first sentence has no predecessor, so a start marker stands in.
        context = "__START__" if i == 0 else sent_tokens[i-1]

        out = tokenizer(context,
                        s,
                        padding = "max_length",
                        max_length = 256,
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors = 'pt')
        outputs = model_class(**out)
        predictions = torch.argmax(outputs.logits, dim=-1)

        if predictions.item() == 1:
            sentence_outputs_tuples.append((s, predictions.item()))

end = time.time()
print("Time taken:", (end-start)/60, "minutes")


KeyboardInterrupt: 

In [5]:
# Print the text of every collected sentence whose predicted label is 1.
for sentence_text, predicted_label in sentence_outputs_tuples:
    if predicted_label == 1:
        print(sentence_text)

[("What's up, everybody?", 0),
 ('Welcome to the in the dome podcast podcast body are you doing hey, how you doing?',
  0),
 ("I'm pretty good myself.", 0),
 ('All right.', 0),
 ('What do you want to talk about today?', 0),
 ('We got a breakdown.', 0),
 ("I know there's not much stock boat.", 0),
 ("We still haven't broken down the Columbus game, but cardiac / comeback kids come back.",
  0),
 ('You know what that game room.', 0),
 ('Okay, I believe off the where we thought we were talking about a couple of episodes ago were talking about house like a toxic relationship and I was starting to get sucked back in.',
  0),
 ('Yep.', 0),
 ("And then I think you were to it's like when fully back on board is fully back with the Calgary Flames right now to start that podcast.",
  0),
 ("I was like, I'm not I'm not falling for it.", 0),
 ("I've been hurt too many times by these guys, but by the end of that podcast we switched but I was still whatever.",
  0),
 ("Yeah, I'm fine.", 0),
 ("I'm tol

In [38]:
# Build a DataFrame from the (sentence, predicted label) tuples; no column names
# are given, so the columns are labelled 0 (sentence) and 1 (label).
test_transcript_class = pd.DataFrame.from_records(sentence_outputs_tuples)

In [42]:
# Sentences (column 0) of the ten rows with the highest value in column 1.
(
    test_transcript_class
    .sort_values(by=1, ascending=False)
    .head(10)[0]
    .values
)

array(['No, I believe I do believe I will say that.', 'I like the wild.',
       'But for it was fun.', "Okay, I don't have it in front of me.",
       "You can't follow that up with a loss you continue that momentum with another win.",
       "What's up, everybody?",
       "And again like we mentioned this yesterday to it's like why Riddick had to Riddick was the one who played the majority of those games.",
       'The guy that is questioned about how much of a workload can handle and the past.',
       "He like you're just playing night after night after night.",
       "And then even when Talbot starts to like play Lights Out you still start going back to Rick like the I don't like  I feel like more than anything that's fucked with his game."],
      dtype=object)

In [43]:
# Write the frame to CSV; index=False leaves the row index out of the file.
test_transcript_class.to_csv("output/test_transcript_class.csv", index=False)