In [None]:
import yaml
from CARP_classification import *
from theme_extractor import *

# Load the podcast sentences: every entry of the YAML document is indexed by its 'text' key.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's locale default.
with open('output.yaml', 'r', encoding='utf-8') as file:
    data = yaml.safe_load(file)
# Built after the file is closed; `data` is already fully parsed at this point.
input_texts = [entry['text'] for entry in data]


# Test mode: when True, only the first 30 sentences are processed.
test_mode = False
if test_mode:
    input_texts = input_texts[:30]

podcast_theme = extract_podcast_theme(input_texts)
print(f"Podcast theme {podcast_theme}")

Podcast theme Podcast theme Space exploration and engineering challenges


In [2]:
import nest_asyncio
# Applied before the later cell that calls asyncio.run(...).
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and asyncio.run would otherwise refuse to start — confirm.
nest_asyncio.apply()

In [3]:
import asyncio  # imported explicitly: the loop below calls asyncio.run and must not rely on a wildcard import
import time
import traceback


# Number of sentences sent to the classifier in one batch.
CLASSIFY_BATCH_SIZE = 50
# Seconds to wait before a failed batch is submitted again.
RETRY_DELAY_SECONDS = 60

# Classify all sentences batch by batch, collecting the per-sentence results in order.
results = []
for batch_start in range(0, len(input_texts), CLASSIFY_BATCH_SIZE):
    batch = input_texts[batch_start:batch_start + CLASSIFY_BATCH_SIZE]
    # NOTE(review): a failing batch is retried without any upper bound, so a
    # persistent error keeps this cell looping until the kernel is interrupted.
    while True:
        try:
            batch_results = asyncio.run(classify_texts_async(batch, podcast_theme, model="gpt-3.5-turbo"))
            print(batch_results)
            results.extend(batch_results)
            break
        except Exception as e:
            # Show the full traceback, then the exception type and message separately.
            traceback.print_exc()
            print(f"Error Type: {type(e).__name__}")
            print(f"Error Message: {e}")
            print(f'Batch starting from {batch_start} being processed again, after {RETRY_DELAY_SECONDS} sec')
            time.sleep(RETRY_DELAY_SECONDS)
    print(f'Batch {batch_start} was processed')

[{'input_text': 'The following is a conversation with Elon Musk.', 'classification': 1, 'clues': '1. **Keywords**: \n   - "Elon Musk" (known for SpaceX and space exploration)\n   - "Space" (directly related to the podcast theme)\n   - "exploration" (key aspect of the theme)\n   - "engineering" (related to the challenges in space exploration)\n   - "challenges" (referring to obstacles in engineering and space missions)\n\n2. **Phrases**: \n   - "space missions" (contextually relevant)', 'reasoning': 'To determine whether the input sentence relates to the podcast theme of space exploration and engineering challenges, we can follow a diagnostic reasoning process based on the provided clues.\n\n1. **Identify Key Components**:\n   - The input mentions "a conversation with Elon Musk." Given that Elon Musk is the CEO of SpaceX, his work is fundamentally tied to space exploration.\n   - Look for keywords in the clues: "Space," "exploration," "engineering," and "challenges."\n\n2. **Assess Rele

In [5]:
import pickle

# Serialize `results` to results.pkl in the working directory.
with open("results.pkl", 'wb') as pickle_out:
    pickle.dump(results, pickle_out)

In [7]:
# Read `results` back from results.pkl and preview the first two entries.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open("results.pkl", 'rb') as pickle_in:
    results = pickle.load(pickle_in)
results[:2]

[{'input_text': 'The following is a conversation with Elon Musk.',
  'classification': 1,
  'clues': '1. **Keywords**: \n   - "Elon Musk" (known for SpaceX and space exploration)\n   - "Space" (directly related to the podcast theme)\n   - "exploration" (key aspect of the theme)\n   - "engineering" (related to the challenges in space exploration)\n   - "challenges" (referring to obstacles in engineering and space missions)\n\n2. **Phrases**: \n   - "space missions" (contextually relevant)',
  'reasoning': 'To determine whether the input sentence relates to the podcast theme of space exploration and engineering challenges, we can follow a diagnostic reasoning process based on the provided clues.\n\n1. **Identify Key Components**:\n   - The input mentions "a conversation with Elon Musk." Given that Elon Musk is the CEO of SpaceX, his work is fundamentally tied to space exploration.\n   - Look for keywords in the clues: "Space," "exploration," "engineering," and "challenges."\n\n2. **Asses

In [26]:
def remove_casual_segments(results,n=6,m=3):
    """Drop long runs of casual sentences and return the remaining texts.

    A sentence is treated as casual when its ``'classification'`` value
    equals 0. Every maximal run of consecutive casual sentences that is
    longer than ``m`` is removed, including a run that reaches the end of
    ``results``. Runs of ``m`` or fewer casual sentences are kept.

    Args:
        results: Sequence of dicts, each providing the keys
            ``'classification'`` and ``'input_text'``.
        n: Not used by this function; kept so existing calls that pass it
            keep working.
        m: Longest run of consecutive casual sentences that is still kept.

    Returns:
        List with the ``'input_text'`` of every sentence that was not
        removed, in the original order.
    """
    # A set keeps the final membership test O(1) per sentence.
    indices_to_remove = set()
    current_run = []

    def flush_run():
        # Mark the finished run for removal only if it is longer than m.
        if len(current_run) > m:
            indices_to_remove.update(current_run)
        current_run.clear()

    for i, sentence_params in enumerate(results):
        if sentence_params['classification'] == 0:
            current_run.append(i)
        else:
            flush_run()
    # A run that extends to the last sentence has no closing non-casual
    # sentence, so it has to be flushed explicitly.
    flush_run()

    return [
        sentence_params['input_text']
        for i, sentence_params in enumerate(results)
        if i not in indices_to_remove
    ]



# Filter the classified sentences using the function's default thresholds (n=6, m=3).
non_casual_results = remove_casual_segments(results)
# Bare last expression: the notebook displays the entire filtered list as the cell output.
non_casual_results

['The following is a conversation with Elon Musk.',
 'I need to get to this thing.',
 "Come on, you're Russian.",
 'You can be serious.',
 "Everyone's serious all the time in Russia.",
 'Yeah.',
 "Yeah, we'll get there.",
 "We'll get there.",
 "It's gotten soft.",
 'Allow me to say that the SpaceX launch',
 'of human beings to orbit on May 30th, 2020',
 'was seen by many as the first step',
 'in a new era of human space exploration.',
 'These human spaceflight missions were a beacon of hope',
 'to be excited about the future.',
 'Let me ask about Crew Dragon demo two.',
 'So that first flight with humans on board,',
 'how did you feel leading up to that launch?',
 'I was confident that at the time that we launched,',
 'that no one could think of anything at all to do',
 'that would improve the probability of success',
 'and we racked our brains to think of any possible way',
 'to improve the probability of success.',
 'We could not think of anything more and nor could NASA',
 "and so t