In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
import openai
import os
import feedparser

# Enter your RSS feed URL here
rss_url = 'https://api.substack.com/feed/podcast/1084089.rss'
# Convert .rss to XML to csv or dataframe, some manual work happened here already would need to productize to get to the v4.csv in a more automated way

# Parse the RSS feed
feed = feedparser.parse(rss_url)

# Initialize a list to hold all episode data
episodes = []

# Iterate over each entry in the feed
for entry in feed.entries:
    episode_data = {}
    # Dynamically extract all available fields and their values
    for key in entry.keys():
        episode_data[key] = entry[key]
    episodes.append(episode_data)

# Convert to DataFrame
df = pd.DataFrame(episodes)


# Save to CSV
df.to_csv('podcast_episodes_v2.csv', index=False)

# Some manual work happened here to get to v4 as a quick hack

# Replace 'your_path_here' with the actual path to your CSV file
file_path = 'podcast_episodes_v4.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

In [2]:

# Set your OpenAI API key from the environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:

# Function to extract the mp3 URL from the links column
def extract_mp3_url(links_str):
    links_list = literal_eval(links_str)  # Convert string representation of list to list
    for link in links_list:
        if link.get('type') == 'audio/mpeg':
            return link.get('href')  # Return the href value if type is audio/mpeg
    return None  # Return None if no audio/mpeg type is found

# Apply the function to the 'links' column and store the result in a new 'Audio' column
df['Audio'] = df['links'].apply(extract_mp3_url)

# Display the DataFrame to verify the new column
#print(df)

In [5]:
#parse summary text that contains intro stuff that would further need to be parsed, chapters, and a transcript
from bs4 import BeautifulSoup

# Assuming df is your dataframe and 'Summary' is the column with HTML content

def parse_summary(text):
    # Remove HTML tags
    soup = BeautifulSoup(text, "lxml")
    plain_text = soup.get_text(separator=' ')

    # Find sections
    timestamps_index = plain_text.find('Timestamps')
    transcript_index = plain_text.find('Transcript')

    # Extract sections
    intro = plain_text[:timestamps_index].strip()
    timestamps = plain_text[timestamps_index:transcript_index].replace('Timestamps', '', 1).strip()
    transcript = plain_text[transcript_index:].replace('Transcript', '', 1).strip()

    # Remove the last line from the transcript
    #transcript = '\n'.join(transcript.split('\n')[:-1])

    return intro, timestamps, transcript

# Apply the parsing function to each row in the dataframe
df[['intro', 'timestamps', 'transcript_withtime']] = df.apply(lambda row: pd.Series(parse_summary(row['summary'])), axis=1)

# Replace the specified text with an empty string i
df['transcript_withtime'] = df['transcript_withtime'].apply(lambda x: x.replace("www.latent.space/subscribe", "").strip())
df['transcript_withtime'] = df['transcript_withtime'].apply(lambda x: x.replace("Get full access to Latent Space at", "").strip())


In [6]:

# Define a function to remove timestamps and ensure no whitespaces before colons
def remove_timestamps_and_clean(text):
    # Regex to find timestamps in the format [00:00:00], possibly preceded by whitespaces
    timestamp_pattern = r'\s*\[\d{2}:\d{2}:\d{2}\]'
    text = re.sub(timestamp_pattern, '', text)
    
    # Additional step to remove whitespaces before colons
    whitespace_before_colon_pattern = r'\s+(?=:)'
    cleaned_text = re.sub(whitespace_before_colon_pattern, '', text)
    
    return cleaned_text

# Apply the function to each cell in the 'transcripts' column
df['transcript'] = df['transcript_withtime'].apply(remove_timestamps_and_clean)



In [7]:
#summarize the podcast transcript
def summarize_transcript(transcript):
    try:
        response = openai.chat.completions.create(
            model="gpt-4-turbo-preview",  # Adjust model name as needed
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Summarize this podcast transcript:\n\n{transcript}"}
            ]
        )
        summary = response.choices[0].message.content.strip()  # Adjusted access to the response
        return summary
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Apply summarization to the entire DataFrame
df['ai_summary'] = df['transcript'].apply(summarize_transcript)

# Summarize only the first row's transcript
#first_row_summary = summarize_transcript(df['transcript'].iloc[0])

# If you want to store it back in the DataFrame, you can do so like this:
#df.loc[0, 'summary'] = first_row_summary

# Or if you just want to print it out or use it elsewhere
#print(first_row_summary)


In [8]:
#key insights

def transcript_key_insights(transcript):
    try:
        response = openai.chat.completions.create(
            model="gpt-4-turbo-preview",  # Adjust model name as needed
            messages=[
                {"role": "system", "content": "You are a helpful assistant for a AI researcher that trying to stay up to date on AI research, by running this query in his pandas data frame. Exclude any explaination, just an ordered list 1. <top insight> 2. <second insight> 3. 4. 5... where <top insight> matches the most interesting insight to an AI researcher, <second insight> matches the second most interesting insight to an AI researcher and so on and so forth."},
                {"role": "user", "content": f"Respond with the key AI research insights that were discussed in this podcast transcript, exclude any preamble, and try to be specific in terms of AI insights so that someone would not need to listen to the podcast but can use this list of key insights instead as it will be combined with multiple episodes worth of insights. Don't include any preamble or post analysis, but just the list of insights. Here is the transcript:\n\n{transcript}"}
            ]
        )
        summary = response.choices[0].message.content.strip()  # Adjusted access to the response
        return summary
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Apply summarization to the entire DataFrame
df['key_insights'] = df['transcript'].apply(transcript_key_insights)

# Summarize only the first row's transcript
#first_row_summary = transcript_key_insights(df['transcript'].iloc[0])

# If you want to store it back in the DataFrame, you can do so like this:
#df.loc[0, 'summary'] = first_row_summary

# Or if you just want to print it out or use it elsewhere
#print(first_row_summary)

In [9]:

def transcript_highlight(transcript):
    try:
        response = openai.chat.completions.create(
            model="gpt-4-turbo-preview",  # Adjust model name as needed
            messages=[
                {"role": "system", "content": "You are a helpful assistant for AI engineers, and an expert at selecting the most insightful, impactful technical quotes from transcripts of podcasts."},
                {"role": "user", "content": f"Respond with the actionable quotes verbatum that might impact the thinking or behavior of an AI Engineer. Please include the most recent timecode before responding with the quote selected. Below is the transcript for your analysis:\n\n{transcript}"}
            ]
        )
        summary = response.choices[0].message.content.strip()  # Adjusted access to the response
        return summary
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Apply summarization to the entire DataFrame
df['quotes'] = df['transcript_withtime'].apply(transcript_highlight)

# Summarize only the first row's transcript
#first_row_summary = transcript_highlight(df['transcript_withtime'][0])

# If you want to store it back in the DataFrame, you can do so like this:
#df.loc[0, 'summary'] = first_row_summary

# Or if you just want to print it out or use it elsewhere
#print(first_row_summary)


In [10]:
def categories(transcript):
    try:
        response = openai.chat.completions.create(
            model="gpt-4-turbo-preview",  # Adjust model name as needed
            messages=[
                {"role": "system", "content": "You are a helpful assistant for a data analyst that is adding information to his pandas data frame. Exclude any explaination, just an ordered list 1. <top category> 2. <second category> 3. 4. 5. 6. 7. 8. 9. 10. where <top category> matches the most relevant category, <second category> matches the second most relevant category and so on and so forth"},
                {"role": "user", "content": f"Reply with an ordered list of 10 categories this podcast transcript covers in order of relevance. The categories can include 'Benchmarks 101' 'Datasets 101' 'RHLF 201' 'FlashAttention 'Transformers Math' 'Models' 'AI Engineer Career' 'UX' 'Multimodal' 'Agents' 'Coding Tools' 'LLM Tooling 'Hardware' 'Open Source' 'Finetuning' 'Startups' 'News' 'Event Recaps' 'Monthy Recaps' or other categories you think would be helpful to describe the topics covered in the below transcript. Here is the transcript:  \n\n{transcript}"}
            ]
        )
        summary = response.choices[0].message.content.strip()  # Adjusted access to the response
        return summary
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Apply summarization to the entire DataFrame
df['categories'] = df['transcript'].apply(categories)

# Summarize only the first row's transcript
#first_row_summary = categories(df['transcript_withtime'][0])

# If you want to store it back in the DataFrame, you can do so like this:
#df.loc[0, 'summary'] = first_row_summary

# Or if you just want to print it out or use it elsewhere
#print(first_row_summary)

In [None]:
# Display summary statistics for each column
# summary_stats = df.iloc[0]
summary_stats = df.head(1)
#summary_stats = df['summary'][22]
#summary_stats = df['Intro'][22]
#summary_stats = df['transcript'][1]
#summary_stats = df['timestamps'][1]
#summary_stats = df['transcript_withtime'][0]

print(summary_stats)

In [12]:
df.to_csv('Latent_space_all_data.csv', index=False)

#trimming unused columns
df.drop(['links', 'image_ref'], axis='columns', inplace=True)

df.to_csv('Latent_space_reduced.csv', index=False)



In [14]:
#trimming unused columns
df.drop(['transcript', 'transcript_withtime'], axis='columns', inplace=True)

df.to_csv('Latent_space_reduced.csv', index=False)

In [None]:
def extract_categories(categories_str):
    # Split the string into a list based on the newline character
    categories_list = categories_str.split('\n')
    # Extract the category text from each item in the list
    # We split on '. ' and take the second element to get the category name
    categories_cleaned = [item.split('. ', 1)[1] if '. ' in item else item for item in categories_list]
    return categories_cleaned

# Apply the function to the 'categories' column and expand the result into new columns
categories_df = pd.DataFrame(df['categories'].apply(extract_categories).tolist(), columns=['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10'])

# Concatenate the new columns with the original DataFrame
df = pd.concat([df, categories_df], axis=1)

# Display the updated DataFrame to verify the results
#print(df.head())

In [None]:
# Count of each unique value in 'category1' column
category1_counts = df['category1'].value_counts()

# Display the counts
print(category1_counts)

In [24]:
# Concatenate the category columns into a single Series
all_categories_series = pd.concat([df[f'category{i}'] for i in range(1, 6)], ignore_index=True)

# Get the count of each unique value in the combined series
all_categories_counts = all_categories_series.value_counts()

# Display the combined counts
print(all_categories_counts)

Models                             44
AI Engineer Career                 33
Open Source                        31
Coding Tools                       25
Hardware                           24
Finetuning                         22
                                   15
Startups                           13
LLM Tooling                        11
Event Recaps                        9
News                                9
Monthy Recaps                       8
Multimodal                          7
Datasets 101                        6
Agents                              6
Transformers Math                   5
UX                                  3
Benchmarks 101                      2
FlashAttention                      2
Monthly Recaps                      2
Model Deployment and Production     1
Data Science Techniques             1
DataSets 101                        1
RLHF 201                            1
Machine Learning Systems            1
ML Development Lifecycle            1
Data Managem

In [52]:
# Replace 'your_path_here' with the actual path to your CSV file
file_path2 = 'Latent_space_reduced_v2.csv'

# Read the CSV file into a DataFrame
df2 = pd.read_csv(file_path2)

def extract_categories(categories_str):
    # Split the string into a list based on the newline character
    categories_list = categories_str.split('\n')
    # Extract the category text from each item in the list
    # We split on '. ' and take the second element to get the category name
    categories_cleaned = [item.split('. ', 1)[1] if '. ' in item else item for item in categories_list]
    return categories_cleaned

# Apply the function to the 'categories' column and expand the result into new columns
categories_df = pd.DataFrame(df2['categories'].apply(extract_categories).tolist(), columns=['category1', 'category2', 'category3', 'category4', 'category5', 'category6', 'category7', 'category8', 'category9', 'category10'])

# Concatenate the new columns with the original DataFrame
df2 = pd.concat([df2, categories_df], axis=1)

# Display the updated DataFrame to verify the results
#print(df.head())

df2.to_csv('Latent_space_reduced_v3.csv', index=False)

# Replace 'your_path_here' with the actual path to your CSV file
file_path3 = 'Isosahedronv3.csv'

# Read the CSV file into a DataFrame
df3 = pd.read_csv(file_path3)

In [53]:
# Merging and updating df2
for i in range(1, 11):  # For each category column in df2
    # Merge df2 with df3 to map x, y, z coordinates
    df2 = pd.merge(df2, df3, how='left', left_on=f'category{i}', right_on='Category', suffixes=('', f'_{i}'))
    
    # Drop the redundant 'category' column added from df3 after each merge
    df2.drop(columns=['Category','Connection','Connection2'], inplace=True)

    # Fill NaN values with 0 for unmatched categories
    df2.fillna({f'x_{i}': 0, f'y_{i}': 0, f'z_{i}': 0}, inplace=True)

df2 = df2.rename(columns={
    'x': 'x_1',
    'y': 'y_1',
    'z': 'z_1'
})
df2.fillna({f'x_1': 0, f'y_1': 0, f'z_1': 0}, inplace=True)
# Calculate average of all x, y, z values
x_columns = [f'x_{i}' for i in range(1, 11)]
y_columns = [f'y_{i}' for i in range(1, 11)]
z_columns = [f'z_{i}' for i in range(1, 11)]

df2['x_avg'] = df2[x_columns].mean(axis=1)
df2['y_avg'] = df2[y_columns].mean(axis=1)
df2['z_avg'] = df2[z_columns].mean(axis=1)

# Display the updated df2
#print(df2)

                                                                                                                                                                                            title  \
0                                                                                                                       Truly Serverless Infra for AI Engineers - with Erik Bernhardsson of Modal   
1                                                                                              Cloud Intelligence at the speed of 5000 tok/s - with Ce Zhang and Vipul Ved Prakash of Together AI   
2                                                                                                                                  Why StackOverflow usage is down 50% — with David Hsu of Retool   
3                                                                                                                                            The Four Wars of the AI Stack (Dec 2023 Audio Recap)   
4              

In [54]:
# Generate the column names to drop
cols_to_drop = [f'{dim}{i}' for i in range(1, 11) for dim in ['x_', 'y_', 'z_']]

# Drop the columns from df2
df2 = df2.drop(columns=cols_to_drop)

# Saving back to visualize
df2.to_csv('Latent_space_reduced_v4.csv', index=False)