In [1]:
import ast
import os
import re

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Read settings from the local .env file; override=True asks python-dotenv to
# replace variables that are already set in the environment.
load_dotenv(dotenv_path='.env', override=True)
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Build the OpenAI client that the helper functions below share
client = OpenAI(
    api_key=openai_api_key,
)

In [4]:
# Function to get topics from transcript using OpenAI
def get_topics(transcript, num_topics=5):
    """Ask the chat model for the key glossary terms found in a transcript.

    Args:
        transcript: Full text of one lesson transcript.
        num_topics: How many terms the prompt asks the model to return
            (default 5, the value the prompt originally hard-coded).

    Returns:
        A list of terms with surrounding whitespace stripped. Blank entries
        (for example from a trailing comma or an empty reply) are dropped.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", 
             "content": """
                 You are a master teacher of beginner computer programmers with deep experience 
                 in identifying the concepts that novice learners may struggle with.
                """},
            {"role": "user", 
             "content": f"""
                 You are an AI assistant tasked with identifying important terms for a glossary 
                 in a beginner Python programming course. The course is designed for learners 
                 with no prior programming experience. Your job is to analyze the given course 
                 transcript and identify terms that should be included in the glossary.

                 When analyzing the transcript, focus on the following types of terms:
                 1. Basic programming concepts (e.g., variable, function, loop)
                 2. Python-specific terms and keywords (e.g., def, if, else, list comprehension)
                 3. Syntax elements (e.g., colon, indentation, brackets)
                 4. Common built-in functions (e.g., print(), input(), len())
                 5. Error messages (e.g., SyntaxError, IndentationError)
                 6. Development environment terms (e.g., IDE, console)
                 7. File operation concepts (e.g., read, write, file path)
                 8. Simple data structures (e.g., list, tuple, dictionary)
                 9. Basic object-oriented programming terms (e.g., class, object)
                 10. Module and library basics (e.g., import, package)
                 11. Version control basics (if mentioned)
                 12. Debugging terms (e.g., breakpoint, step through)
                 
                 Guidelines for selection:
                 - Prioritize terms that appear frequently or are crucial for understanding the content.
                 - Focus on terms directly related to Python programming.
                 - Include terms that might be challenging for absolute beginners.
                 - If a common word has a specific meaning in programming, include it.
                 - Avoid overly advanced concepts not central to the beginner course.

                Identify the {num_topics} most important topics. Provide your topics as a comma-separated list. 
                Don't include any other characters in your response.

                Analyze the following course transcript and list the terms that should be included in the glossary based on these guidelines:
                 
                {transcript}
            """}
        ]
    )
    # The reply content can be missing; treat that as "no topics" instead of crashing
    raw_topics = response.choices[0].message.content or ""
    # Skip empty fragments so blank glossary rows are never created
    return [topic.strip() for topic in raw_topics.split(',') if topic.strip()]

In [5]:
# Function to remove any duplicate topics that may slip through
def remove_duplicates(df):
    """Drop glossary rows that the chat model flags as duplicates of other rows.

    Args:
        df: DataFrame with a 'glossary_item' column of topic strings.

    Returns:
        A DataFrame holding only the rows whose 'glossary_item' was not listed
        for removal. Original index labels are preserved. A term the model
        names as one to keep is never removed, even if it also appears in a
        removal list.

    Raises:
        ValueError: If the model reply does not contain a dictionary literal
            mapping each term to keep to the terms to remove.
    """
    topics = df['glossary_item'].tolist()
    # Fewer than two topics cannot contain duplicates, so skip the API call
    if len(topics) < 2:
        return df.copy()
    topics_str = ", ".join(topics)
    
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that identifies duplicate or very similar programming topics from a list."},
            {"role": "user", "content": f"Here is a list of programming topics: {topics_str}. Please identify any duplicates or very similar topics. For each set of duplicates, choose the most appropriate term to keep and list the terms to remove. Return your answer as a Python dictionary where keys are terms to keep and values are lists of terms to remove. Only include terms that have duplicates."}
        ]
    )
    
    reply = response.choices[0].message.content or ""

    # The answer may be wrapped in prose or a ```python fence, so isolate the
    # outermost {...} span before parsing.
    match = re.search(r'\{.*\}', reply, flags=re.DOTALL)
    if match is None:
        raise ValueError(f"No dictionary found in model reply: {reply!r}")

    # SECURITY: the reply is untrusted text. ast.literal_eval only accepts Python
    # literals, whereas eval() would execute any code the model returned.
    try:
        duplicates_dict = ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError) as exc:
        raise ValueError(f"Could not parse model reply as a dictionary: {reply!r}") from exc
    if not isinstance(duplicates_dict, dict):
        raise ValueError(f"Model reply is not a dictionary: {reply!r}")
    
    # Flatten the terms to remove; tolerate a bare string where a list was expected
    terms_to_remove = set()
    for removals in duplicates_dict.values():
        if isinstance(removals, str):
            terms_to_remove.add(removals)
        elif isinstance(removals, (list, tuple, set)):
            terms_to_remove.update(term for term in removals if isinstance(term, str))

    # A term chosen as the one to keep must survive even if another entry lists it
    terms_to_remove.difference_update(duplicates_dict.keys())
    
    # Remove rows with duplicate terms
    df_cleaned = df[~df['glossary_item'].isin(list(terms_to_remove))]
    
    return df_cleaned


In [6]:
def get_description(topic, video_name, transcript_dir, style_prompt):
    """Generate a short glossary description for one topic using its lesson transcript.

    Args:
        topic: Glossary term to describe.
        video_name: File name of the transcript in which the topic was found.
        transcript_dir: Directory that contains the transcript file.
        style_prompt: Extra tone/style instructions appended to the system message.

    Returns:
        The model's description with surrounding whitespace stripped
        (an empty string if the model returns no content).
    """
    # Read the transcript file; the explicit encoding keeps the result independent
    # of the platform's default locale
    with open(os.path.join(transcript_dir, video_name), 'r', encoding='utf-8') as file:
        transcript = file.read()

    # Derive a readable lesson name from the file name (e.g. 'lesson_1.txt' -> 'Lesson 1').
    # splitext drops only the final extension, so dots inside the name survive
    # ('lesson_1.2_intro.txt' -> 'Lesson 1.2 Intro').
    lesson_identifier = os.path.splitext(video_name)[0].replace('_', ' ').title()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a helpful assistant that provides concise descriptions of programming topics. {style_prompt}"},
            {"role": "user", "content": f"Provide a short description (50 words or less) of the programming topic: '{topic}'. This topic was introduced in the lesson '{lesson_identifier}'. Use the following transcript as context, and if it makes the description easier to understand, refer to specific examples or explanations from this lesson in your description. If you refer to the lesson, use its name or number ('{lesson_identifier}') instead of saying 'in the video':\n\n{transcript}"}
        ]
    )
    # The reply content can be missing; fall back to an empty description
    return (response.choices[0].message.content or "").strip()


In [7]:
# Start with an empty glossary table: one row per term, the transcript file it
# came from, and its description
df = pd.DataFrame(
    columns=['glossary_item', 'video_name', 'item_description'],
)

In [8]:
# Preview the first rows of the glossary table
df.head(n=5)

Unnamed: 0,glossary_item,video_name,item_description


In [9]:
# Directory containing transcript files.
# Set the TRANSCRIPT_DIR environment variable to point at your own copy of the
# transcripts; when it is unset or empty, the original hard-coded path is used.
transcript_dir = (
    os.environ.get('TRANSCRIPT_DIR')
    or '/Users/tommynelson/Documents/Ap4E_2024_videos/C1/Descript_versions/transcripts'
)

In [10]:
# Process each transcript file.
# os.listdir() returns names in arbitrary order, but 'video_name' records the first
# transcript in which a topic shows up, so visit the files in a stable, natural order
# (numeric parts compare as numbers: 'lesson_2.txt' comes before 'lesson_10.txt').
# re.split with a capturing group alternates text / digits, so odd positions are numbers.
transcript_files = sorted(
    (name for name in os.listdir(transcript_dir) if name.endswith('.txt')),
    key=lambda name: [
        int(part) if position % 2 else part.lower()
        for position, part in enumerate(re.split(r'(\d+)', name))
    ],
)

# Topics already in the table (e.g. from an earlier run of this cell) are not added again
known_topics = set(df['glossary_item'])
new_rows = []

for filename in tqdm(transcript_files, desc="Processing transcripts"):
    with open(os.path.join(transcript_dir, filename), 'r', encoding='utf-8') as file:
        transcript = file.read()

    topics = get_topics(transcript)

    for topic in topics:
        if topic not in known_topics:
            known_topics.add(topic)
            new_rows.append({
                'glossary_item': topic,
                'video_name': filename,
                'item_description': ''
            })

# Add all new rows in one step instead of growing the DataFrame row by row
# (DataFrame.append was removed in pandas 2.0 and _append is a private method).
if new_rows:
    new_topics_df = pd.DataFrame(new_rows, columns=df.columns)
    df = new_topics_df if df.empty else pd.concat([df, new_topics_df], ignore_index=True)

Processing transcripts: 100%|███████████████████| 11/11 [00:06<00:00,  1.74it/s]


In [11]:
# Show every collected term before de-duplication
df.loc[:, 'glossary_item']

0                      variable
1                      function
2                          loop
3                           def
4                       print()
5                         len()
6                      argument
7               print statement
8                       comment
9                 error message
10                          bug
11                       string
12                    variables
13                    functions
14                      strings
15                       type()
16                        print
17                     f-string
18                variable name
19             formatted string
20                 curly braces
21    floating-point formatting
22                       syntax
23                       prompt
24             Jupyter Notebook
25           coding environment
26                  Shift Enter
27                    Command V
28                    Control V
Name: glossary_item, dtype: object

In [12]:
# Drop the topics that remove_duplicates() flags as redundant
print("Removing duplicate topics...")
df = remove_duplicates(df=df)

Removing duplicate topics...


In [13]:
# Show the terms that remain after de-duplication
df.loc[:, 'glossary_item']

0                      variable
1                      function
2                          loop
3                           def
4                       print()
5                         len()
6                      argument
8                       comment
9                 error message
10                          bug
11                       string
15                       type()
16                        print
17                     f-string
18                variable name
20                 curly braces
21    floating-point formatting
22                       syntax
23                       prompt
24             Jupyter Notebook
25           coding environment
26                  Shift Enter
27                    Command V
28                    Control V
Name: glossary_item, dtype: object

In [14]:
# Tone and style guidance; it is passed to get_description(), which appends it
# to the system message of each description request.
style_prompt = """
    Your writing should have a friendly tone, and avoid complicated language 
    that could be difficult for a learner taking their first python course to understand.
    
    If it helps to make the information in the description easier to understand for the learner, 
    include specific examples or explanations from the video context.
    
    Do not start your descriptions with "refer to" or "refers to"!
"""

In [15]:
# Get descriptions for each topic
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Generating descriptions"):
    description = get_description(row['glossary_item'], row['video_name'], transcript_dir, style_prompt)
    df.at[index, 'item_description'] = description

# Sort DataFrame alphabetically by glossary_item. The key lower-cases the terms so that
# capitalised entries (e.g. 'Jupyter Notebook') are not all listed before lowercase ones.
df = df.sort_values('glossary_item', key=lambda terms: terms.str.lower())

# Create markdown file (UTF-8 so non-ASCII characters in the generated descriptions are
# written the same way on every platform)
with open('glossary.md', 'w', encoding='utf-8') as f:
    f.write("# Python Course Glossary\n\n")
    for index, row in df.iterrows():
        item = row['glossary_item']
        description = row['item_description']
        video = row['video_name']
        
        # Create an anchor link; strip the hyphens left behind by leading/trailing
        # punctuation so 'print()' becomes 'print' rather than 'print-'
        anchor = re.sub(r'\W+', '-', item.lower()).strip('-')
        
        f.write(f"## [{item}](#{anchor})\n\n")
        f.write(f"{description}\n\n")
        # NOTE(review): 'path/to/video/' looks like a placeholder prefix; confirm the real location
        f.write(f"*Introduced in: [{video}](path/to/video/{video})*\n\n")

print("Glossary created successfully!")

Generating descriptions: 100%|██████████████████| 24/24 [00:31<00:00,  1.32s/it]

Glossary created successfully!



