## Prerequisites

In [1]:
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that sets `white-space: pre-wrap` on <pre> blocks."""
  wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_rule))


# Run set_css on IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
!pip install openai docx2txt transformers bert-score --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone


## Import the modules

In [3]:
import getpass
import json
import os
import shutil
import time
import zipfile

import docx2txt
import pandas as pd
from bert_score import BERTScorer
from openai import OpenAI
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForMaskedLM, BertModel

In [5]:
# Take the key from the environment rather than assigning it in the notebook:
# an unconditional assignment here would overwrite a key that is already
# exported. When the variable is missing or blank, prompt for it with getpass
# so the key is never stored in the notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

## Loading the dataset

In [7]:
# Archives to unpack and the directory that receives their contents.
dataset_paths = ['/content/data/reading.zip', '/content/data/transcript.zip']
directory_to_extract_to = '/content/data'

# Extract every archive into the same target directory.
for path_to_zip_file in dataset_paths:
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(directory_to_extract_to)

In [6]:
# Directories passed to load_to_dataframe() and listed in the file-count check.
path_to_transcript = '/content/data/transcript'
path_to_reading_material = '/content/data/reading'

In [None]:
# # Remove VTT files
# files = os.listdir('/content/data/transcript')

# for f in files:
#     if '.vtt' in f:
#         os.remove('/content/data/transcript/'+f)

In [None]:
# Remove the Jupyter checkpoint directory so it is not picked up as a transcript.
# ignore_errors=True keeps this cell re-runnable when the directory does not exist.
shutil.rmtree('/content/data/transcript/.ipynb_checkpoints', ignore_errors=True)

In [11]:
# Number of entries in the transcript directory and in the reading-material directory.
print(len(os.listdir(path_to_transcript)), len(os.listdir(path_to_reading_material)))

7 22


In [12]:
def load_to_dataframe(path_to_transcript, path_to_reading_material, suffix_length=16):
    """Pair every transcript file with its reading-material file in one DataFrame.

    The topic of a transcript is its file name with the last ``suffix_length``
    characters removed. The matching reading material is read from
    ``<topic>_rm.txt`` (spaces replaced by underscores) inside
    ``path_to_reading_material``.

    Args:
        path_to_transcript: Directory containing the transcript text files.
        path_to_reading_material: Directory containing the ``*_rm.txt`` files.
        suffix_length: Number of trailing characters stripped from a transcript
            file name to obtain the topic. Defaults to 16, the value this
            function previously hard-coded.

    Returns:
        pandas.DataFrame with the columns ``transcript``, ``reading_material``
        and ``topic`` (in that order) and one row per transcript file.

    Raises:
        FileNotFoundError: If the reading-material file for a transcript is missing.
    """
    topics = []
    transcripts = []
    readings = []

    for file_name in os.listdir(path_to_transcript):
        transcript_path = os.path.join(path_to_transcript, file_name)

        # Directories (for example `.ipynb_checkpoints`) cannot be opened as
        # text files, so only regular files are treated as transcripts.
        if not os.path.isfile(transcript_path):
            continue

        # Equivalent to file_name[:-suffix_length], but also correct for 0.
        topic = file_name[:max(len(file_name) - suffix_length, 0)]
        reading_name = (topic + '_rm.txt').replace(' ', '_')
        reading_path = os.path.join(path_to_reading_material, reading_name)

        # Explicit encoding so the result does not depend on the platform default.
        with open(transcript_path, 'r', encoding='utf-8') as fp:
            transcript_text = fp.read()

        with open(reading_path, 'r', encoding='utf-8') as fp:
            reading_text = fp.read()

        topics.append(topic)
        transcripts.append(transcript_text)
        readings.append(reading_text)

    # Build the frame in one step; column order matches the previous output.
    return pd.DataFrame(
        {
            'transcript': transcripts,
            'reading_material': readings,
            'topic': topics,
        },
        columns=['transcript', 'reading_material', 'topic'],
    )

In [14]:
# Load transcripts and their reading material into one DataFrame (one row per topic).
data = load_to_dataframe(path_to_transcript, path_to_reading_material)

In [15]:
# Display the loaded frame.
data

Unnamed: 0,transcript,reading_material,topic
0,"Hi, I’m Carrie Anne, and welcome to CrashCours...","Cryptography, or cryptology (from Ancient Gree...",Cryptography
1,Dictionaries are mappings from key objects to ...,"Python Dictionaries\nthisdict = {\n ""brand"": ...",Dictionaries
2,Strings are immutable sequences of characters....,Strings\nStrings in python are surrounded by e...,Strings
3,"Hi, I’m Carrie Anne, and welcome to CrashCours...",The World Wide Web (WWW or simply the Web) is ...,World_Wide_Web
4,Sets are unordered collections of distinct has...,"Python Sets\nmyset = {""apple"", ""banana"", ""cher...",Sets
5,we previously defined the cost function J in t...,"In vector calculus, the gradient of a scalar-v...",Gradient
6,Welcome to the WebAudio API lesson! I personna...,HTML5 Audio is a subject of the HTML5 specific...,Web_Audio_API


## Summary generation

In [16]:
def get_prompt(row):
    """Build the two-message chat prompt (system + user) for one data row.

    Args:
        row: Object exposing ``transcript`` and ``reading_material`` attributes;
            both are interpolated into the user message.

    Returns:
        A list with the system message followed by the user message, each a
        dict with the keys ``role`` and ``content``.
    """
    system_message = {"role": "system", "content": "Zero shot model."}
    user_message = {
        "role": "user",
        "content": f"""
                          Please write a 500 word summary on the following Transcript based on the Reading Material.
                          Pick relevant information from the Reading Material and add it to the summary of the Transcript.
                          The Summary should be only of the Transcript with a few information from the Reading Material.
                          If you are not sure of the summary, say 'I don't know'.

                          Transcript: {row.transcript}\n\n
                          Reading Material: {row.reading_material}\n\n
                          Answer:\n""",
    }
    return [system_message, user_message]

In [17]:
def api_call(messages, model):
    """Send a chat-completion request through the module-level ``client``.

    Args:
        messages: Chat messages forwarded to ``client.chat.completions.create``.
        model: Model identifier forwarded to the same call.

    Returns:
        The response object on success. If the request raises an ``Exception``,
        the string 'Text too long for the model' is returned instead, so callers
        can detect failure with ``isinstance(result, str)``.
    """
    try:
        return client.chat.completions.create(
            model=model,
            messages=messages
        )
    except Exception as exc:
        # `except Exception` instead of a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be interrupted.
        # The sentinel text is kept for backward compatibility although the
        # failure may have a different cause; print the real error so it is
        # not hidden.
        print(f"API call failed ({type(exc).__name__}): {exc}")
        return 'Text too long for the model'


# Main function to generate a summary for one row
def generate_summary(row, prompt_func=get_prompt, model="gpt-3.5-turbo"):
    """Build the prompt for ``row``, query ``model`` and return the summary text.

    Args:
        row: Data row handed to ``prompt_func``.
        prompt_func: Callable that turns ``row`` into the chat messages.
        model: Model identifier forwarded to ``api_call``.

    Returns:
        The string returned by ``api_call`` when it returns a string, otherwise
        the message content of the first choice in the response.
    """
    result = api_call(prompt_func(row), model)
    return result if isinstance(result, str) else result.choices[0].message.content

In [19]:
# One summary per row of `data`, generated with gpt-3.5-turbo-1106.
zero_shot = [
    generate_summary(data.iloc[row_idx], model="gpt-3.5-turbo-1106")
    for row_idx in tqdm(range(data.shape[0]))
]

  0%|          | 0/7 [00:00<?, ?it/s]

In [20]:
# Store the summaries in a new 'zero_shot' column (list order matches row order).
data['zero_shot'] = zero_shot

In [32]:
# Display the frame including the new 'zero_shot' column.
data

Unnamed: 0,transcript,reading_material,topic,zero_shot
0,"Hi, I’m Carrie Anne, and welcome to CrashCours...","Cryptography, or cryptology (from Ancient Gree...",Cryptography,The transcript introduces computer security an...
1,Dictionaries are mappings from key objects to ...,"Python Dictionaries\nthisdict = {\n ""brand"": ...",Dictionaries,The transcript discusses the use and manipulat...
2,Strings are immutable sequences of characters....,Strings\nStrings in python are surrounded by e...,Strings,The transcript highlights various operations a...
3,"Hi, I’m Carrie Anne, and welcome to CrashCours...",The World Wide Web (WWW or simply the Web) is ...,World_Wide_Web,The Transcript introduces the World Wide Web a...
4,Sets are unordered collections of distinct has...,"Python Sets\nmyset = {""apple"", ""banana"", ""cher...",Sets,"The transcript discusses sets, which are unord..."
5,we previously defined the cost function J in t...,"In vector calculus, the gradient of a scalar-v...",Gradient,The transcript discusses the algorithm called ...
6,Welcome to the WebAudio API lesson! I personna...,HTML5 Audio is a subject of the HTML5 specific...,Web_Audio_API,The WebAudio API lesson focused on the capabil...


In [33]:
# One summary per row of `data`, generated with the 'ft:' model named below.
few_shot = [
    generate_summary(data.iloc[row_idx], model='ft:gpt-3.5-turbo-1106:ucsc:11fewshot20240304:8zG6iQyx')
    for row_idx in tqdm(range(data.shape[0]))
]

  0%|          | 0/7 [00:00<?, ?it/s]

In [34]:
# Store the summaries in a new 'few_shot' column (list order matches row order).
data['few_shot'] = few_shot

In [35]:
# Display the frame including both summary columns.
data

Unnamed: 0,transcript,reading_material,topic,zero_shot,few_shot
0,"Hi, I’m Carrie Anne, and welcome to CrashCours...","Cryptography, or cryptology (from Ancient Gree...",Cryptography,The transcript introduces computer security an...,﻿Computer security always has to assume that t...
1,Dictionaries are mappings from key objects to ...,"Python Dictionaries\nthisdict = {\n ""brand"": ...",Dictionaries,The transcript discusses the use and manipulat...,Dictionaries are mappings from key objects to ...
2,Strings are immutable sequences of characters....,Strings\nStrings in python are surrounded by e...,Strings,The transcript highlights various operations a...,Strings in Python can be defined using either ...
3,"Hi, I’m Carrie Anne, and welcome to CrashCours...",The World Wide Web (WWW or simply the Web) is ...,World_Wide_Web,The Transcript introduces the World Wide Web a...,"The World Wide Web, often confused with the in..."
4,Sets are unordered collections of distinct has...,"Python Sets\nmyset = {""apple"", ""banana"", ""cher...",Sets,"The transcript discusses sets, which are unord...",Sets in Python are unordered collections of di...
5,we previously defined the cost function J in t...,"In vector calculus, the gradient of a scalar-v...",Gradient,The transcript discusses the algorithm called ...,"﻿In this video, the instructor introduces the ..."
6,Welcome to the WebAudio API lesson! I personna...,HTML5 Audio is a subject of the HTML5 specific...,Web_Audio_API,The WebAudio API lesson focused on the capabil...,The WebAudio API is a powerful tool that allow...


In [37]:
# Write every summary to its own text file: <topic>__zero.txt and <topic>__few.txt.
zero_shot_dir = '/content/data/zero_shot/'
few_shot_dir = '/content/data/few_shot/'

# open(..., 'w') does not create missing directories, so create them up front;
# exist_ok=True keeps the cell re-runnable.
os.makedirs(zero_shot_dir, exist_ok=True)
os.makedirs(few_shot_dir, exist_ok=True)

for i in tqdm(range(data.shape[0])):
    zero_shot_file_name = zero_shot_dir+data.iloc[i]['topic']+'__zero.txt'

    # Explicit UTF-8: the generated text can contain non-ASCII characters.
    with open(zero_shot_file_name, 'w', encoding='utf-8') as f:
        f.write(data.iloc[i]['zero_shot'])

    few_shot_file_name = few_shot_dir+data.iloc[i]['topic']+'__few.txt'

    with open(few_shot_file_name, 'w', encoding='utf-8') as f:
        f.write(data.iloc[i]['few_shot'])

  0%|          | 0/7 [00:00<?, ?it/s]

In [39]:
# Zip each summary directory; the archive base names carry no directory part.
# The cell displays the return value of the last call.
shutil.make_archive('zero_shot_new', 'zip', '/content/data/zero_shot/')
shutil.make_archive('few_shot_new', 'zip', '/content/data/few_shot/')

'/content/few_shot_new.zip'