In [1]:
import json
import os
import openai
import whisper
from whisper.utils import format_timestamp
from sentence_transformers import SentenceTransformer, util

In [2]:
# SECURITY: never commit API keys to a notebook. The key that was previously
# hardcoded on this line must be treated as compromised and revoked.
# The key is read from the environment instead; os.getenv returns None when the
# variable is unset, so the cells that do not call OpenAI still run.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def create_transcript_str(whisper_out):
    """Render the segments of a transcription result as timestamped lines.

    Every entry of ``whisper_out['segments']`` becomes one line made of the
    formatted ``start`` value in square brackets, a colon, a tab and the
    segment ``text``. The lines are joined with newline characters.

    Returns:
        A single string; empty when there are no segments.
    """
    return "\n".join(
        f"[{format_timestamp(piece['start'])}]:\t{piece['text']}"
        for piece in whisper_out['segments']
    )

def postprocess_points(raw_output):
    """Turn a hyphen-bulleted block of text into a list of bullet strings.

    The text is cut at every newline-followed-by-hyphen marker, each piece is
    stripped of surrounding whitespace, and pieces that are empty after
    stripping are discarded.
    """
    cleaned = []
    for fragment in raw_output.split('\n-'):
        fragment = fragment.strip()
        if fragment != '':
            cleaned.append(fragment)
    return cleaned

def find_closest_time(time, all_times):
    """Return the entry of ``all_times`` nearest to ``time``.

    Nearness is the absolute difference. When several entries are equally
    near, the one that occurs first in ``all_times`` is returned.
    """
    def distance(candidate):
        return abs(candidate - time)

    return min(all_times, key=distance)

def create_transcript_chunks(all_start_times, all_end_times, whisper_out, stride=45, length=60):
    '''Create larger chunks of the segments using a sliding window.

    A window position ("seek") starts at 0 and advances by ``stride`` while it
    is below ``int(all_end_times[-1])``. For each position the chunk begins at
    the segment whose start time is nearest to ``seek`` and finishes at the
    segment whose end time is nearest to ``seek + length``. The texts of all
    segments in that range are concatenated and stripped.

    Args:
        all_start_times: start time of every segment, in segment order.
        all_end_times: end time of every segment, in segment order.
        whisper_out: mapping whose ``'segments'`` list holds dicts with a
            ``'text'`` entry, aligned index-for-index with the time lists.
        stride: distance between consecutive window positions, in the same
            unit as the segment times.
        length: target window size, in the same unit as the segment times.

    Returns:
        A list of dicts with keys ``'start'``, ``'end'`` and ``'text'``; empty
        when there are no segments.
    '''
    # Nothing to chunk: avoids an IndexError on all_end_times[-1].
    if len(all_start_times) == 0 or len(all_end_times) == 0:
        return []

    def closest_index(times, target):
        # Index of the entry nearest to target; ties resolve to the earliest
        # index, and no second pass is needed to turn a value into an index.
        return min(range(len(times)), key=lambda i: abs(times[i] - target))

    segments = whisper_out['segments']
    transcript_chunks = []
    for seek in range(0, int(all_end_times[-1]), stride):
        start_index = closest_index(all_start_times, seek)
        # The nearest end time can belong to a segment *before* the chosen
        # start segment (e.g. long segments with a tie on the end side).
        # Clamp so a chunk always contains at least its starting segment
        # instead of having empty text and end < start-segment end.
        end_index = max(start_index, closest_index(all_end_times, seek + length))

        text = "".join(
            segment['text'] for segment in segments[start_index:end_index + 1]
        ).strip()

        transcript_chunks.append({
            'start': all_start_times[start_index],
            'end': all_end_times[end_index],
            'text': text,
        })

    return transcript_chunks

In [4]:
class LISAPipeline():
    """Meeting pipeline: transcribe audio, summarise it with GPT-3 and index it for search.

    Calling an instance with an audio path runs transcription, minutes
    generation, action-item extraction and search-index creation in order.
    The individual stages are also available as methods.
    """

    def __init__(self, whisper_model, search_model):
        """Load the speech-to-text model and the sentence-embedding model.

        Args:
            whisper_model: value passed to ``whisper.load_model``.
            search_model: value passed to ``SentenceTransformer.load``.
        """
        self.whisper_model = whisper.load_model(whisper_model)
        self.search_model = SentenceTransformer.load(search_model)

    def run_gpt3(self, prompt, max_tokens=256, temperature=0.5, top_p=1, frequency_penalty=0.0, presence_penalty=0.0):
        """Send ``prompt`` to the ``text-davinci-002`` completion engine.

        The sampling arguments are forwarded unchanged to
        ``openai.Completion.create``.

        Returns:
            The text of the first returned choice.
        """
        response = openai.Completion.create(
            engine="text-davinci-002",
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty
        )
        return response.choices[0].text

    def transcribe(self, audio_path):
        """Transcribe the audio file at ``audio_path`` and return the raw model output."""
        return self.whisper_model.transcribe(audio_path, verbose=False)

    def minutes_of_meeting(self, transcript_str):
        """Ask GPT-3 for meeting minutes and return them as a list of bullet strings."""

        mom_prompt = f"""Generate the minutes of the meeting for the following transcript:
        Meeting Transcription:
        {transcript_str}

        Meeting Minutes:
        -"""

        # The prompt already ends with the first bullet marker, so the
        # completion starts mid-bullet; re-attach the marker before parsing.
        raw_minutes = '\n-' + self.run_gpt3(mom_prompt, temperature=0.5)
        return postprocess_points(raw_minutes)

    def action_items(self, transcript_str):
        """Ask GPT-3 for action items and return them as a list of bullet strings.

        The model is only queried when the transcript contains the phrase
        "hey lisa" (case-insensitive); otherwise an empty list is returned
        without any API call.
        """

        if 'hey lisa' not in transcript_str.lower():
            return []

        action_prompt = f"""Extract the Action Items / To-Do List from the Transcript.
        Meeting Transcription:
        {transcript_str}

        Action Items:
        -"""

        # Same handling as minutes_of_meeting: the prompt ends with the first
        # bullet marker, so re-attach it before parsing. The parsed result is
        # unchanged because empty leading pieces are discarded.
        raw_action_items = '\n-' + self.run_gpt3(action_prompt, temperature=0.4)
        return postprocess_points(raw_action_items)

    def create_index(self, whisper_out):
        '''Create search index by embedding the transcript segments.

        Segments are merged into overlapping chunks (stride 45, length 60, in
        the unit of the segment timestamps) and each chunk text is embedded
        with the search model.

        Returns:
            ``(doc_emb, transcript_chunks)``: the chunk embeddings and the
            chunk dicts they were computed from, in the same order.
        '''
        segments = whisper_out['segments']
        all_start_times = [segment['start'] for segment in segments]
        all_end_times = [segment['end'] for segment in segments]

        transcript_chunks = create_transcript_chunks(all_start_times, all_end_times, whisper_out, stride=45, length=60)

        chunk_texts = [chunk['text'] for chunk in transcript_chunks]
        doc_emb = self.search_model.encode(chunk_texts)

        return doc_emb, transcript_chunks

    def search(self, doc_embeddings, transcript_chunks, query, top_k=3, threshold=16):
        """Return the transcript chunks that best match ``query``.

        Args:
            doc_embeddings: embeddings returned by ``create_index``.
            transcript_chunks: chunk dicts returned by ``create_index``.
            query: free-text search query.
            top_k: number of best-scoring chunks to consider.
            threshold: minimum dot-product score (exclusive) a chunk needs to
                be reported. NOTE(review): the default of 16 is presumably
                tuned for the embedding model used in this notebook - confirm
                before switching models.

        Returns:
            A list of ``(start, end, text)`` tuples, best match first. Only
            chunks among the ``top_k`` highest scores that also exceed
            ``threshold`` are included, so the list may be shorter than
            ``top_k`` or empty.
        """
        # An empty index has nothing to score; skip the encoder entirely.
        if len(transcript_chunks) == 0:
            return []

        # Compute dot score between query and all document embeddings
        query_embeddings = self.search_model.encode(query)
        scores = util.dot_score(query_embeddings, doc_embeddings)[0].cpu().tolist()

        # Pair every chunk with its score and rank by decreasing score.
        ranked = sorted(zip(transcript_chunks, scores), key=lambda pair: pair[1], reverse=True)

        return [
            (chunk['start'], chunk['end'], chunk['text'])
            for chunk, score in ranked[:top_k]
            if score > threshold
        ]

    def __call__(self, audio_path):
        '''Run the pipeline on an audio file.

        Returns:
            ``(minutes, action_items, doc_emb, transcript_chunks)``.
        '''
        whisper_out = self.transcribe(audio_path)
        transcript_str = create_transcript_str(whisper_out)
        minutes = self.minutes_of_meeting(transcript_str)
        action_items = self.action_items(transcript_str)
        doc_emb, transcript_chunks = self.create_index(whisper_out)

        return minutes, action_items, doc_emb, transcript_chunks

In [5]:
# Build the pipeline once: this loads the Whisper checkpoint from the given
# local path and the sentence-embedding model used for transcript search.
lisa = LISAPipeline(whisper_model="whisper_models/medium.pt", search_model="multi-qa-mpnet-base-dot-v1")

In [None]:
# End-to-end run: transcription, minutes, action items and search index in one call.
# The step-by-step cells that follow perform the same stages individually, so
# running both repeats the transcription and the GPT-3 requests.
minutes, action_items, doc_emb, transcript_chunks = lisa('Recording.m4a')

In [6]:
# Step 1: speech-to-text. The result's 'segments' list is what the later steps consume.
whisper_out = lisa.transcribe('Recording.m4a')

Detected language: English


100%|██████████| 20870/20870 [00:36<00:00, 574.00frames/s]


In [11]:
# Step 2: render the timestamped transcript and summarise it with GPT-3.
transcript_str = create_transcript_str(whisper_out)
minutes = lisa.minutes_of_meeting(transcript_str)
# Returns [] without calling the API unless the transcript contains "hey lisa".
action_items = lisa.action_items(transcript_str)

In [12]:
# Step 3: merge segments into overlapping chunks and embed them for search.
doc_emb, transcript_chunks = lisa.create_index(whisper_out)

In [13]:
minutes

['The final module of the course on Neuro-Symbolic AI is discussed.',
 'The contents of the session are reviewed.',
 'The problem is introduced.',
 'A review of symbolic AI is conducted.',
 'The differences between neural networks and symbolic AI are explored.',
 'The architecture of neuro-symbolic AI is explained.']

In [14]:
action_items

[]

In [16]:
# Query the index with the defaults (top_k=3, threshold=16); each hit is a
# (start, end, text) tuple, best match first.
lisa.search(doc_emb, transcript_chunks, 'explain the issue with deep learning with an example')

[(133.2,
  192.39999999999998,
  "So maybe like water and person. So it might say there's water, okay. So the deep learning model might just say water and there are people in it, so it will either say people or person. But apart from that, if we humans are given a definition of a flood, then we will actually be able to reason about an image and get the flood answer, right? So as a human, what we do, we also see both water and people there, right? But we also see that there's water person, but also we see that it's not really a beach or anything, right? It's not like because even in a beach, people are there as well as water is there. But here it doesn't really look like a beach, there are trees and everyone is just submerged in and so on, right? So based on that reasoning that it's not a beach or a pool, we can like conclude like that it's a flood, right? So this is how we humans reason about it. But deep learning"),
 (181.4,
  218.92000000000002,
  "right? So based on that reasoning t