In [1]:
import os
import time
import google.generativeai as genai
from dotenv import load_dotenv
from google.generativeai import caching
import datetime
import pandas as pd
import PyPDF2
import os
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from io import BytesIO

  from .autonotebook import tqdm as notebook_tqdm


## Load the Google API key from the local .env file and configure the Gemini client

In [2]:
# Load environment variables from the local .env file; GOOGLE_API_KEY must be
# present in the environment afterwards, otherwise the lookup raises KeyError.
dotenv_path = '/Users/netraranga/Desktop/Projects/.env'
load_dotenv(dotenv_path)
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

In [3]:
# Read the syllabus, give the two unlabeled leading columns real names, and
# drop the 'Deliverables' and 'Additional Document Name' columns.
syllabus_df = pd.read_csv('syllabus_vf.csv')
syllabus_df_1 = (
    syllabus_df
    .rename(columns={'Unnamed: 0': 'Week', 'Unnamed: 1': 'Lecture'})
    .drop(columns=['Deliverables', 'Additional Document Name'])
)

In [4]:
# Read the playlist export and add a 'Lecture' column equal to the row index
# plus one (the default index from read_csv starts at 0).
youtube_df = (
    pd.read_csv('youtube_playlist_contents.csv')
    .assign(Lecture=lambda playlist: playlist.index + 1)
)

In [58]:
def write_transcripts_to_files(
    df,
    output_dir='/Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts',
):
    """Write each lecture transcript in ``df`` to its own UTF-8 text file.

    Files are named ``lecture_<Lecture>_transcript.txt``; an existing file with
    the same name is overwritten.

    Args:
        df: DataFrame with a 'Lecture' column (used in the file name) and a
            'Video Contents' column (the transcript text).
        output_dir: Directory that receives the files. It is created if it
            does not exist. Defaults to the project's transcripts folder.

    Returns:
        None. Progress is reported with ``print``.

    Rows whose 'Video Contents' is not a string (for example NaN coming from an
    empty CSV cell) are skipped with a message. Previously such a row raised a
    TypeError from ``file.write`` after an empty file had already been created.
    """
    os.makedirs(output_dir, exist_ok=True)

    for _, row in df.iterrows():
        lecture = row['Lecture']
        transcript = row['Video Contents']

        # Check before opening the file so a missing transcript never leaves
        # an empty placeholder file on disk.
        if not isinstance(transcript, str):
            print(f"Skipping Lecture {lecture}: no transcript text available")
            continue

        file_path = os.path.join(output_dir, f"lecture_{lecture}_transcript.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(transcript)

        print(f"Transcript for Lecture {lecture} written to {file_path}")

# Write one transcript text file per row of the YouTube playlist dataframe
write_transcripts_to_files(youtube_df)


Transcript for Lecture 1 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_1_transcript.txt
Transcript for Lecture 2 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_2_transcript.txt
Transcript for Lecture 3 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_3_transcript.txt
Transcript for Lecture 4 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_4_transcript.txt
Transcript for Lecture 5 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_5_transcript.txt
Transcript for Lecture 6 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_6_transcript.txt
Transcript for Lecture 7 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_7_transcript.txt
Transcript for Lecture 8 written to /Users/netraranga/Desktop/Projects/google_gemini/docs/transcripts/lecture_8

In [33]:
def create_title_page(title):
    """Build an in-memory PDF containing ``title`` and return a reader for it.

    The title is drawn in 24pt Helvetica at position (100, 400) on a
    letter-size canvas. The result is a ``PyPDF2.PdfReader`` over the
    in-memory buffer, so nothing is written to disk.
    """
    buffer = BytesIO()

    page = canvas.Canvas(buffer, pagesize=letter)
    page.setFont("Helvetica", 24)
    page.drawString(100, 400, title)
    page.save()

    # Rewind so the reader starts at the beginning of the generated PDF.
    buffer.seek(0)
    return PyPDF2.PdfReader(buffer)

def merge_annotated_slides(
    combined_annotated_slides,
    docs_folder='/Users/netraranga/Desktop/Projects/google_gemini/docs',
):
    """Combine each deck's original and annotated slides into a single PDF.

    For every base name the output contains, in order: a title page,
    ``<base_name>.pdf``, a second title page, and ``<base_name>_annotated.pdf``.
    It is written to
    ``<docs_folder>/consolidated/combined_<base_name>_slides.pdf``.

    Args:
        combined_annotated_slides: Iterable of base names (file names without
            the ``.pdf`` / ``_annotated.pdf`` suffix).
        docs_folder: Folder holding the input PDFs. Defaults to the project's
            docs folder.

    Returns:
        List of output paths that were written successfully. Decks that fail
        are reported with ``print`` and left out of the list.
    """
    consolidated_folder = os.path.join(docs_folder, 'consolidated')
    output_files = []

    for base_name in combined_annotated_slides:
        original_file = os.path.join(docs_folder, f"{base_name}.pdf")
        annotated_file = os.path.join(docs_folder, f"{base_name}_annotated.pdf")
        output_pdf = os.path.join(consolidated_folder, f'combined_{base_name}_slides.pdf')

        merger = PyPDF2.PdfMerger()
        try:
            # Reading the inputs happens inside the try so that one missing or
            # unreadable PDF is reported and skipped instead of aborting the
            # whole batch with the merger left open.
            merger.append(create_title_page(f"Original {base_name} Slides"))
            merger.append(original_file)
            merger.append(create_title_page(f"Annotated {base_name} Slides"))
            merger.append(annotated_file)

            # The output folder may not exist yet; writing into a missing
            # folder would fail for every deck.
            os.makedirs(consolidated_folder, exist_ok=True)
            merger.write(output_pdf)
            print(f"PDFs merged successfully for {base_name}. Output file: {output_pdf}")
            output_files.append(output_pdf)
        except Exception as e:
            print(f"Error merging PDFs for {base_name}: {str(e)}")
        finally:
            merger.close()

    return output_files

# Collect the base names to consolidate: for every entry in the docs folder
# whose name contains 'annotated', keep the part before the first underscore.
docs_entries = os.listdir('/Users/netraranga/Desktop/Projects/google_gemini/docs')
combined_annotated_slides = [
    entry.split('_')[0] for entry in docs_entries if 'annotated' in entry
]

# Build the consolidated PDFs
output_files = merge_annotated_slides(combined_annotated_slides)

PDFs merged successfully for em. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_em_slides.pdf
PDFs merged successfully for decisiontrees. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_decisiontrees_slides.pdf
PDFs merged successfully for pca. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_pca_slides.pdf
PDFs merged successfully for ridge. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_ridge_slides.pdf
PDFs merged successfully for fairness. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_fairness_slides.pdf
PDFs merged successfully for privacy. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_privacy_slides.pdf
PDFs merged successfully for kmeans. Output file: /Users/netraranga/Desktop/Projects/google_gemini/docs/consolidated/combined_kmeans

In [18]:
### Files are uploaded in course order. Per the original note, only lecture
### transcripts and annotated slides are meant to be used; the commented-out
### uploads below are skipped, which is why some file_N numbers are missing.
docs_dir = '/Users/netraranga/Desktop/Projects/google_gemini/docs'
transcripts_dir = f'{docs_dir}/transcripts'

lecture_1 = f'{transcripts_dir}/lecture_1_transcript.txt'
file_1 = genai.upload_file(path=lecture_1)

lecture_2 = f'{transcripts_dir}/lecture_2_transcript.txt'
file_2 = genai.upload_file(path=lecture_2)

# lin_alg_notes = f'{docs_dir}/linalg_notes.pdf'
# file_3 = genai.upload_file(path=lin_alg_notes)

# lin_alg_slides = f'{docs_dir}/linalg_slides.pdf'
# file_3_1 = genai.upload_file(path=lin_alg_slides)

lecture_3 = f'{transcripts_dir}/lecture_3_transcript.txt'
file_4 = genai.upload_file(path=lecture_3)

lecture_4 = f'{transcripts_dir}/lecture_4_transcript.txt'
file_5 = genai.upload_file(path=lecture_4)

# probs_notes = f'{docs_dir}/prob_notes.pdf'
# file_6 = genai.upload_file(path=probs_notes)

# probs_slides = f'{docs_dir}/prob_slides.pdf'
# file_6_1 = genai.upload_file(path=probs_slides)

lecture_5 = f'{transcripts_dir}/lecture_5_transcript.txt'
file_7 = genai.upload_file(path=lecture_5)

lecture_6 = f'{transcripts_dir}/lecture_6_transcript.txt'
file_8 = genai.upload_file(path=lecture_6)

# numpy_slides = f'{docs_dir}/numpy_slides.pdf'
# file_9 = genai.upload_file(path=numpy_slides)

lecture_7 = f'{transcripts_dir}/lecture_7_transcript.txt'
file_10 = genai.upload_file(path=lecture_7)

lecture_8 = f'{transcripts_dir}/lecture_8_transcript.txt'
file_11 = genai.upload_file(path=lecture_8)

eval_slides = f'{docs_dir}/eval_slides.pdf'
file_12 = genai.upload_file(path=eval_slides)

lecture_9 = f'{transcripts_dir}/lecture_9_transcript.txt'
file_13 = genai.upload_file(path=lecture_9)

lecture_10 = f'{transcripts_dir}/lecture_10_transcript.txt'
file_14 = genai.upload_file(path=lecture_10)

bias_slides = f'{docs_dir}/bias_annotated.pdf'
file_15 = genai.upload_file(path=bias_slides)

ridge_slides = f'{docs_dir}/ridge_annotated.pdf'
file_16 = genai.upload_file(path=ridge_slides)

lasso_slides = f'{docs_dir}/lasso_annotated.pdf'
file_17 = genai.upload_file(path=lasso_slides)

midterm_review = f'{docs_dir}/midterm_review.pdf'
file_18 = genai.upload_file(path=midterm_review)

lecture_11 = f'{transcripts_dir}/lecture_11_transcript.txt'
file_19 = genai.upload_file(path=lecture_11)

boosting_slides = f'{docs_dir}/boosting.pdf'
file_20 = genai.upload_file(path=boosting_slides)

decision_trees_slides = f'{docs_dir}/decisiontrees_annotated.pdf'
file_21 = genai.upload_file(path=decision_trees_slides)

# decision_trees_overfitting = f'{docs_dir}/decisiontrees_overfitting.pdf'
# file_22 = genai.upload_file(path=decision_trees_overfitting)

lecture_12 = f'{transcripts_dir}/lecture_12_transcript.txt'
file_23 = genai.upload_file(path=lecture_12)

lecture_13 = f'{transcripts_dir}/lecture_13_transcript.txt'
file_24 = genai.upload_file(path=lecture_13)

kmeans_slides = f'{docs_dir}/kmeans_annotated.pdf'
file_25 = genai.upload_file(path=kmeans_slides)

em_slides = f'{docs_dir}/em_annotated.pdf'
file_26 = genai.upload_file(path=em_slides)

pca_slides = f'{docs_dir}/pca_annotated.pdf'
file_27 = genai.upload_file(path=pca_slides)

lecture_14 = f'{transcripts_dir}/lecture_14_transcript.txt'
file_28 = genai.upload_file(path=lecture_14)

lecture_15 = f'{transcripts_dir}/lecture_15_transcript.txt'
file_29 = genai.upload_file(path=lecture_15)

# ml_advice = f'{docs_dir}/ml_advice.pdf'
# file_30 = genai.upload_file(path=ml_advice)

lecture_16 = f'{transcripts_dir}/lecture_16_transcript.txt'
file_31 = genai.upload_file(path=lecture_16)

learning_slides = f'{docs_dir}/learning.pdf'
file_32 = genai.upload_file(path=learning_slides)

lecture_17 = f'{transcripts_dir}/lecture_17_transcript.txt'
file_33 = genai.upload_file(path=lecture_17)

lecture_18 = f'{transcripts_dir}/lecture_18_transcript.txt'
file_34 = genai.upload_file(path=lecture_18)

lecture_19 = f'{transcripts_dir}/lecture_19_transcript.txt'
file_35 = genai.upload_file(path=lecture_19)

fairness_slides = f'{docs_dir}/fairness_annotated.pdf'
file_36 = genai.upload_file(path=fairness_slides)

privacy_slides = f'{docs_dir}/privacy_annotated.pdf'
file_37 = genai.upload_file(path=privacy_slides)

explanation_slides = f'{docs_dir}/explainability_annotated.pdf'
file_38 = genai.upload_file(path=explanation_slides)

textbook = f'{docs_dir}/textbook.pdf'
file_39 = genai.upload_file(path=textbook)

In [None]:
# Every uploaded file handle, in upload order. The missing numbers correspond
# to the uploads that are commented out.
contents = [
    file_1, file_2, file_4, file_5, file_7, file_8,
    file_10, file_11, file_12, file_13, file_14, file_15,
    file_16, file_17, file_18, file_19, file_20, file_21,
    file_23, file_24, file_25, file_26, file_27, file_28,
    file_29, file_31, file_32, file_33, file_34, file_35,
    file_36, file_37, file_38, file_39,
]

In [75]:
### System prompt for the CS229 tutor persona.
# It lists the materials the model is told it has (lecture notes, lecture
# transcripts, textbook) and the four tutoring tasks it should perform.
# It is passed as `system_instruction` when the 'textbook' cache is created.
system_prompt = """You are an expert tutor specializing in machine learning, with comprehensive knowledge of the Stanford CS229 "Introduction to Machine Learning" course. You have access to all relevant materials, including:
- Annotated and regular lecture notes for each session.
- Transcripts of all recorded lectures.
- The complete course textbook.
Your role is to guide the user through the CS229 course material by:
1. **Providing clear, detailed explanations** of key machine learning concepts and algorithms, from foundational topics like linear regression and classification to advanced areas such as support vector machines and unsupervised learning.
2. **Connecting course concepts**, explaining how different topics (e.g., gradient descent, regularization) relate and build upon each other across lectures.
3. **Summarizing lectures and sections**, highlighting major takeaways, essential equations, and conceptual insights.
4. **Supporting exam preparation**, identifying high-impact topics, common pitfalls, and suggesting areas for further review."""


In [76]:
# Create a context cache holding only the textbook (file_39), with the tutor
# system prompt attached. The cache is requested with a 10-minute TTL.
textbook_cache_ttl = datetime.timedelta(minutes=10)

cache_content = caching.CachedContent.create(
    model='models/gemini-1.5-flash-001',
    display_name='textbook',
    system_instruction=system_prompt,
    contents=[file_39],
    ttl=textbook_cache_ttl,
)

In [72]:
# Create a context cache over the lecture transcripts, slides and textbook.
# NOTE: this rebinds `cache_content`, so when the notebook is run top to
# bottom this cache replaces the textbook-only one created earlier.
# NOTE(review): file_37 and file_38 are uploaded but not listed here -
# confirm whether leaving them out is intentional.
tutor_instruction = 'You are an expert at machine learning concepts and linear algebra. Your job is to serve as a personal tutor for the user. You will be given a files with notes on machine learning concepts and transcripts of live lectures. You will be asked a variety of questions, some about specific concepts and others about how concepts are related to each other based on the notes provided.'

cache_content = caching.CachedContent.create(
    model='models/gemini-1.5-flash-001',
    display_name='lecture transcripts and notes',
    system_instruction=tutor_instruction,
    contents=[
        file_1, file_2, file_4, file_5, file_7, file_8,
        file_10, file_11, file_12, file_13, file_14, file_15,
        file_16, file_17, file_18, file_19, file_20, file_21,
        file_23, file_24, file_25, file_26, file_27, file_28,
        file_29, file_31, file_32, file_33, file_34, file_35,
        file_36, file_39,
    ],
    ttl=datetime.timedelta(minutes=10),
)

In [77]:
# Bind a GenerativeModel to whichever cache `cache_content` currently refers to
model = genai.GenerativeModel.from_cached_content(cached_content=cache_content)

# Ask for a course overview and a timeline grounded in the cached material
overview_prompt = 'Give me an overview on this course - what makes it stand out from other Machine Learning courses? Create a timeline of key concepts that are covered over the duration of the course. Ensure your responses are grounded in the course content and do not generate a fake timeline.'
response = model.generate_content([overview_prompt])

# Token usage first (shows how many prompt tokens came from the cache), then the answer
print(response.usage_metadata)
print(response.text)

prompt_token_count: 181816
candidates_token_count: 543
total_token_count: 182359
cached_content_token_count: 181763

CS229 is a comprehensive introductory course to machine learning that delves deep into the mathematical foundations and practical applications of the field. While there are other machine learning courses available, CS229 stands out for several reasons:

**1. Mathematical Rigor:** CS229 emphasizes a strong mathematical understanding of machine learning concepts. This provides a deep foundation for understanding how algorithms work and how to develop new ones.

**2. Broad Coverage:** The course covers a wide range of machine learning topics, from supervised learning methods like linear regression and classification, to unsupervised learning techniques like clustering and dimensionality reduction. It also touches upon advanced areas such as kernel methods, support vector machines, and deep learning.

**3. Practical Applications:** The course doesn't just focus on theory; it

In [49]:
# Ask what the KMeans lecture covers that the notes do not
kmeans_prompt = 'What are some key concepts covered in the KMeans lecture that are not covered in the notes? Be very specific in the points you generate.'
response = model.generate_content([kmeans_prompt])

# Token usage first, then the answer
print(response.usage_metadata)
print(response.text)

prompt_token_count: 500823
candidates_token_count: 265
total_token_count: 501088
cached_content_token_count: 500794

Here are some key concepts covered in the KMeans lecture that are not covered in the notes:

* **The discussion on the convergence properties of the algorithm:**  The lecture emphasizes the algorithm's convergence to a local minimum, pointing out that the algorithm will always terminate due to the monotonically decreasing nature of the distance function.
* **KMeans++ initialization:** The lecture describes the algorithm as a way to improve the approximation ratio by computing a density estimation and placing the centers with respect to the density to spread them out in a nice way. 
* **Practical considerations of KMeans:** The lecture acknowledges that there is no one right answer for selecting the number of clusters (K) and that it often requires a modeling decision and domain knowledge. 
* **Visual intuition for the algorithm:** The lecture uses a lot of visual example

In [73]:
# Print every cached-content entry returned by the API
for cached in caching.CachedContent.list():
    print(cached)

CachedContent(
    name='cachedContents/96m75bq632ty',
    model='models/gemini-1.5-flash-001',
    display_name='lecture transcripts and notes',
    usage_metadata={
        'total_token_count': 650903,
    },
    create_time=2024-10-26 21:20:08.961391+00:00,
    update_time=2024-10-26 21:20:08.961391+00:00,
    expire_time=2024-10-26 21:30:07.246272+00:00
)


In [71]:
# Token counts noted from earlier cache listings:
#   Slide 1 to 15 are 166273 tokens
#   Slide 1 to 38 are 500798 tokens

# Delete every cached-content entry returned by the API
for cached in caching.CachedContent.list():
    cached.delete()

In [None]:
### TODO
# - Combine all of the slide contents into one file so it passes the cache minimum size limit
# - Determine with ChatGPT what good questions are - study guides on certain lectures and concepts
# - Identify the difference between annotated notes and regular notes
# - Create a study guide that is grounded in the lecture and pulls additional key concepts from the notes
# - Generate some Python questions for certain lectures for the application piece
# - Watch a certain video and see if the LLM can retrieve the specific fact or instance referenced in the video