In [4]:
import json
import os
import openai
import whisper
import re
from whisper.utils import format_timestamp
from sentence_transformers import SentenceTransformer, util

In [5]:
# Never commit API keys to a notebook: read the key from the environment.
# Any key that was previously pasted here should be treated as leaked and revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [6]:
def create_transcript(whisper_out):
    '''Build a printable transcript and a list of segment records.

    Args:
        whisper_out: Mapping with a 'segments' list; each segment must provide
            'start' and 'text'.

    Returns:
        A tuple ``(transcript_str, transcript)``: the newline-joined
        "[timestamp]:<tab>text" lines, and one dict per segment with the keys
        'time', 'timestamp' and 'text'.
    '''
    records = [
        {
            'time': seg['start'],
            'timestamp': format_timestamp(seg['start']),
            'text': seg['text'],
        }
        for seg in whisper_out['segments']
    ]

    # Each printable line reuses the timestamp already formatted for the record.
    lines = [f"[{rec['timestamp']}]:\t{rec['text']}" for rec in records]

    return "\n".join(lines), records

def postprocess_points(raw_output):
    '''Split a dash-bulleted block of text into a list of clean points.

    Whitespace between a newline and the following dash is dropped first, so
    indented bullets split the same way as unindented ones. Each piece is
    stripped and empty pieces are discarded.
    '''
    normalized = re.sub(r'\n\s*-', '\n-', raw_output)

    cleaned = []
    for piece in normalized.split('\n-'):
        piece = piece.strip()
        if piece != '':
            cleaned.append(piece)
    return cleaned

def find_closest_time(time, all_times):
    '''Return the entry of ``all_times`` with the smallest absolute distance to ``time``.

    On a tie the entry that appears first in ``all_times`` wins.
    '''
    def distance(candidate):
        return abs(candidate - time)

    return min(all_times, key=distance)

def create_transcript_chunks(transcript, stride=3, length=3):
    '''Merge transcript segments into larger chunks using a sliding window.

    Args:
        transcript: Sequence of segment dicts with 'time' and 'text' keys.
        stride: Number of segments the window advances between chunks.
        length: Number of consecutive segments joined into one chunk.

    Returns:
        A list of dicts with 'time' (start time of the first segment in the
        window) and 'text' (the window's segment texts concatenated and stripped).
    '''
    start_times = [seg['time'] for seg in transcript]
    window_starts = range(0, len(start_times), stride)

    return [
        {
            'time': start_times[begin],
            # Segment texts are concatenated without a separator, then stripped.
            'text': "".join(seg['text'] for seg in transcript[begin:begin + length]).strip(),
        }
        for begin in window_starts
    ]

In [7]:
class LISAPipeline():
    '''Meeting pipeline: Whisper transcription, GPT-3 summaries and semantic search.

    Attributes:
        whisper_model: Model object returned by ``whisper.load_model``.
        search_model: Sentence-transformer used to embed transcript chunks and queries.
    '''

    def __init__(self, whisper_model, search_model):
        '''Load both models.

        Args:
            whisper_model: Value passed to ``whisper.load_model``.
            search_model: Value passed to ``SentenceTransformer.load``.
        '''
        print('Loading Whisper...')
        self.whisper_model = whisper.load_model(whisper_model)
        print('Loading Sentence Transformer...')
        self.search_model = SentenceTransformer.load(search_model)
        print('Models loaded!')
    
    def run_gpt3(self, prompt, max_tokens=256, temperature=0.5, top_p=1, frequency_penalty=0.0, presence_penalty=0.0):
        '''Send ``prompt`` to the completion endpoint and return the first choice's text.'''
        # NOTE(review): `openai.Completion` with `engine=` looks like the pre-1.0
        # openai-python interface -- confirm the installed version still provides it.
        response = openai.Completion.create(
            engine="text-davinci-002",
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty
        )
        return response.choices[0].text
    
    def transcribe(self, audio_path):
        '''Run Whisper on ``audio_path`` and return its raw output.'''
        return self.whisper_model.transcribe(audio_path, verbose=False)
    
    def minutes_of_meeting(self, transcript_str):
        '''Ask GPT-3 for bullet-point meeting minutes and return them as a list of strings.'''

        mom_prompt = f"""Generate a meeting summary/notes for the following transcript:
        Meeting Transcription:
        {transcript_str}

        Instructions:
        1. Do not use the same words as in the transcript.
        2. Use proper grammar and punctuation.
        3. Use bullets to list the points.
        4. Add as much detail as possible.

        Meeting Minutes:
        -"""

        raw_minutes = self.run_gpt3(mom_prompt, temperature=0.5)
        minutes = postprocess_points(raw_minutes)

        return minutes

    def action_items(self, transcript_str):
        '''Ask GPT-3 for the action items and return them as a list of strings.'''

        action_prompt = f"""Extract the Action Items / To-Do List from the Transcript.
        Meeting Transcription:
        {transcript_str}

        Action Items:
        -"""
        raw_action_items = self.run_gpt3(action_prompt, temperature=0.4)
        action_items = postprocess_points(raw_action_items)

        return action_items

    def create_index(self, whisper_out, transcript=None, stride=45, length=60):
        '''Create search index by embedding chunks of the transcript.

        Args:
            whisper_out: Raw Whisper output. Only used to rebuild the segment
                records when ``transcript`` is not given.
            transcript: Segment records as returned by ``create_transcript``
                (second tuple element). Optional.
            stride: Number of segments the chunking window advances per chunk.
            length: Number of segments joined into each chunk.

        Returns:
            A tuple ``(doc_emb, transcript_chunks)``: the chunk embeddings from
            the search model and the chunk dicts ('time', 'text') they belong to.
        '''
        if transcript is None:
            _, transcript = create_transcript(whisper_out)

        # create_transcript_chunks takes the segment records plus window settings;
        # the start/end time lists previously passed here did not match its signature.
        transcript_chunks = create_transcript_chunks(transcript, stride=stride, length=length)

        # Encode the chunk texts; queries are encoded separately in `search`.
        chunk_texts = [chunk['text'] for chunk in transcript_chunks]
        doc_emb = self.search_model.encode(chunk_texts)

        return doc_emb, transcript_chunks
    
    def search(self, doc_embeddings, transcript_chunks, query, top_k=3, threshold=14):
        '''Return the ``top_k`` chunks that score highest against ``query``.

        Args:
            doc_embeddings: Chunk embeddings produced by ``create_index``.
            transcript_chunks: Chunk dicts ('time', 'text') aligned with ``doc_embeddings``.
            query: Search string.
            top_k: Maximum number of results to return.
            threshold: Kept for interface compatibility; no score cut-off is applied.

        Returns:
            A list of dicts with 'time', 'timestamp', 'text' and 'confidence'
            (the dot-product score), ordered by decreasing score.
        '''
        # Compute dot score between query and all document embeddings
        query_embeddings = self.search_model.encode(query)
        scores = util.dot_score(query_embeddings, doc_embeddings)[0].cpu().tolist()

        # Pair every chunk with its score and sort by decreasing score.
        ranked = sorted(
            zip(transcript_chunks, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Chunks carry their start time under 'time' (see create_transcript_chunks).
        return [
            {
                'time': chunk['time'],
                'timestamp': format_timestamp(chunk['time']),
                'text': chunk['text'],
                'confidence': score,
            }
            for chunk, score in ranked[:top_k]
        ]

    def __call__(self, audio_path):
        '''Run the pipeline on an audio file'''
        whisper_out = self.transcribe(audio_path)
        transcript_str, transcript = create_transcript(whisper_out)
        minutes = self.minutes_of_meeting(transcript_str)
        action_items = self.action_items(transcript_str)
        doc_emb, transcript_chunks = self.create_index(whisper_out, transcript)

        return minutes, action_items, doc_emb, transcript_chunks, transcript_str, transcript

In [9]:
# Local model locations, named so they are easy to spot and change.
WHISPER_MODEL_PATH = "../whisper_models/medium.pt"
SEARCH_MODEL_PATH = "../multi-qa-mpnet-base-dot-v1"

lisa = LISAPipeline(whisper_model=WHISPER_MODEL_PATH, search_model=SEARCH_MODEL_PATH)

Loading Whisper...
Loading Sentence Transformer...
Models loaded!


In [11]:
# Run the full pipeline on the sample recording, then unpack its six outputs.
pipeline_outputs = lisa('Sample Meeting.mp3')
minutes, action_items, doc_emb, transcript_chunks, transcript_str, transcript = pipeline_outputs

Detected language: English


100%|██████████| 19355/19355 [00:45<00:00, 427.42frames/s]


In [25]:
# NOTE(review): this cell redefines create_transcript_chunks, which is already
# defined in the helper cell near the top of the notebook; keep only one copy.
def create_transcript_chunks(transcript, stride=3, length=3):
    '''Create larger chunks of the segments using a sliding window.

    The window covers ``length`` consecutive segments and advances by
    ``stride`` segments. Each chunk records the start time of its first
    segment under 'time' and the joined, stripped segment texts under 'text'.
    '''
    times = [entry['time'] for entry in transcript]

    chunks = []
    for head in range(0, len(times), stride):
        pieces = [entry['text'] for entry in transcript[head:head + length]]
        chunks.append({'time': times[head], 'text': "".join(pieces).strip()})

    return chunks

In [27]:
# Display the segment records ('time', 'timestamp', 'text') unpacked from the pipeline call.
transcript

[{'time': 0.0,
  'timestamp': '00:00.000',
  'text': " so yeah so let's get started so let me just share my screen okay so I did the"},
 {'time': 24.22,
  'timestamp': '00:24.220',
  'text': ' toy neurosymbolic thing so I created a sample neurosymbolic program synthesis'},
 {'time': 31.439999999999998,
  'timestamp': '00:31.440',
  'text': ' engine so I used GPT-3 for that so there were a few observations that I made but'},
 {'time': 37.08,
  'timestamp': '00:37.080',
  'text': ' before that let me tell you what I did so I basically created a few prompt'},
 {'time': 43.879999999999995,
  'timestamp': '00:43.880',
  'text': ' example where I put in all the models so for each model I had a description I had'},
 {'time': 50.72,
  'timestamp': '00:50.720',
  'text': ' the arguments and the data types and the return what it returns and I also had a'},
 {'time': 59.08,
  'timestamp': '00:59.080',
  'text': ' few problem statements couple problem statement as examples with their'},
 {'time': 

In [30]:
# Try the chunker with two-segment windows that advance two segments at a time,
# so consecutive chunks do not share segments.
create_transcript_chunks(transcript, stride=2, length=2)

[{'time': 0.0,
  'text': "so yeah so let's get started so let me just share my screen okay so I did the toy neurosymbolic thing so I created a sample neurosymbolic program synthesis"},
 {'time': 31.439999999999998,
  'text': 'engine so I used GPT-3 for that so there were a few observations that I made but before that let me tell you what I did so I basically created a few prompt'},
 {'time': 43.879999999999995,
  'text': 'example where I put in all the models so for each model I had a description I had the arguments and the data types and the return what it returns and I also had a'},
 {'time': 59.08,
  'text': 'few problem statements couple problem statement as examples with their solution workflows so that was my prompting so a few observations that I'},
 {'time': 67.08,
  'text': 'made were prompting with examples of feedback and corrections made the final up would be more correct in one shot without requiring further changes so I'},
 {'time': 75.12,
  'text': 'also did a feedback l

: 

In [30]:
import numpy as np

In [None]:
# Check that the embeddings survive a round trip through plain Python lists.
# array_equal yields a single bool instead of displaying a full elementwise mask.
np.array_equal(np.array(doc_emb.tolist()), doc_emb)

: 

In [None]:
# Transcribe a second recording without running the rest of the pipeline.
recording_path = 'Recording.m4a'
whisper_out = lisa.transcribe(recording_path)

In [None]:
# The helper defined in this notebook is create_transcript, and it returns
# (transcript_str, transcript); keep both so the segment records are available
# when the search index is built.
transcript_str, transcript = create_transcript(whisper_out)
minutes = lisa.minutes_of_meeting(transcript_str)
action_items = lisa.action_items(transcript_str)

In [None]:
# create_index needs the segment records as well as the raw Whisper output.
# Rebuild them from `whisper_out` here so this cell never indexes a stale
# `transcript` left over from a different recording.
transcript = create_transcript(whisper_out)[1]
doc_emb, transcript_chunks = lisa.create_index(whisper_out, transcript)

In [24]:
# Display the meeting-minute bullet points returned by minutes_of_meeting.
minutes

['Bhavish introduced himself.']

In [25]:
# Display the action items returned by lisa.action_items.
action_items

[]

In [26]:
# Semantic search over the indexed chunks with a free-text question.
query = 'explain the issue with deep learning with an example'
lisa.search(doc_emb, transcript_chunks, query)

[]

In [None]:
import moviepy.editor as mp

In [None]:
# Open the video recording whose audio track is used below.
video_path = 'Recording.mp4'
clip = mp.VideoFileClip(video_path)

In [None]:
# Write the clip's audio track to an MP3 file.
audio_track = clip.audio
audio_track.write_audiofile("Recording2.mp3")