<a href="https://colab.research.google.com/github/marioschlosser/meeting-gpt/blob/main/Meeting_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Give access to your Google Drive
# Mount the user's Google Drive at /content/gdrive inside the Colab runtime.
# A later cell asks for the Google Drive path of the input audio file.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#@title Initialize huggingface.co
# Hugging Face access token entered through the Colab form. It is passed as
# `use_auth_token` when the pyannote speaker-diarization pipeline is loaded.
huggingface_token = "" #@param {type:"string"}

In [None]:
#@title Load models and libraries

# NOTE(review): IPython documents cell magics (%%name) as having to be the
# first line of a cell. Confirm that %%capture still takes effect when it
# follows the #@title line above; otherwise the pip output is not suppressed.
%%capture
# Install the audio/ML dependencies into the notebook runtime.
!pip install torch torchvision torchaudio
!pip install pyannote.audio
!pip install openai-whisper

from pyannote.audio import Pipeline
import whisper
import ssl
import pandas as pd

# SECURITY NOTE(review): this swaps the default HTTPS context factory for one
# that does NOT verify certificates, and it applies to every later HTTPS
# request made through the standard library in this process. Presumably a
# workaround for certificate errors while downloading models -- confirm it is
# still needed, since it exposes those downloads to man-in-the-middle attacks.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the pretrained pyannote speaker-diarization pipeline, authenticating
# with the Hugging Face token entered in the previous cell.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=huggingface_token)

# Load the Whisper "small.en" model used for transcription.
model = whisper.load_model("small.en")

In [None]:
#@title Provide the Google Drive path to your input audio (.wav) file
# Path of the audio file to process. The same value is passed unchanged to the
# diarization pipeline and to Whisper's transcribe() in the next cell.
file_name = "" #@param {type:"string"}

In [None]:
#@title Run transcription and diarization and save to files
# Run speaker diarization over the whole audio file.
diarization = pipeline(file_name)

# Flatten the diarization tracks into mutable [start, end, speaker] rows.
diarization_array = [
    [turn.start, turn.end, speaker]
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]

# Merge successive rows with the same speaker into one row that spans from the
# first row's start to the last row's end.
#
# Every row is copied before it is stored. The merge step overwrites the end
# time in place, so storing the original row objects would silently rewrite
# the end times inside diarization_array as well.
diarization_array_compressed = []
for row in diarization_array:
    if diarization_array_compressed and diarization_array_compressed[-1][2] == row[2]:
        # Same speaker as the current run: extend the run to this row's end.
        diarization_array_compressed[-1][1] = row[1]
    else:
        # Speaker changed (or this is the very first row): start a new run.
        diarization_array_compressed.append(list(row))

# print the compressed array
for row in diarization_array_compressed:
    print(row)
  
# Transcribe the same audio file with Whisper and keep its segment list.
result = model.transcribe(file_name)
segments = result["segments"]

# data has all the transcript data: one dict per segment, reduced to the
# fields used by the steps below
data = [
    {field: segment[field] for field in ("start", "end", "tokens", "text")}
    for segment in segments
]

# For each transcript segment, pick the speaker whose (compressed) diarization
# turn overlaps the segment the most.
for segment in data:
    start = segment["start"]
    end = segment["end"]
    # Always create the "speaker" key. A segment that overlaps no turn keeps
    # None, which pandas treats as missing. Without this default the key would
    # be absent, and if no segment matched at all the resulting DataFrame
    # would have no "speaker" column for the later steps to read.
    segment["speaker"] = None
    # Only a strictly positive overlap counts; ties keep the earlier turn.
    best_overlap = 0
    for row in diarization_array_compressed:
        # Length of the intersection of [start, end] and [row[0], row[1]];
        # zero or negative when the two intervals do not overlap.
        overlap = min(end, row[1]) - max(start, row[0])
        if overlap > best_overlap:
            segment["speaker"] = row[2]
            best_overlap = overlap

# Build a DataFrame with one row per transcript segment.
df = pd.DataFrame(data)

# Number the runs of consecutive segments that share a speaker: the id goes up
# by one every time a row's speaker differs from the previous row's.
speaker_run = (df['speaker'] != df['speaker'].shift()).cumsum()

# Collapse each run into a single row holding the joined text and the speaker.
turns = df.groupby(speaker_run).agg(
    text=('text', lambda x: " ".join(x)),
    speaking=('speaker', 'first'),
)

# Drop runs that have no speaker, then render each as "<speaker>: <text>".
turns = turns.reset_index().dropna(subset=['speaking'])
transcript_lines = turns[['speaking', 'text']].apply(lambda x: ': '.join(x), axis=1)

# Save the full transcript with speakers, one turn per line.
transcript_lines.to_csv(path_or_buf="transcript.csv", header=False, index=False)

# Per-segment measures: the number of tokens in the segment's token list and
# the segment's duration (end - start).
df["num_tokens"] = df["tokens"].apply(len)
df["speaking_time"] = df["end"] - df["start"]

# Per-speaker totals of both measures.
by_speaker = df.groupby("speaker")
tokens_per_speaker = by_speaker["num_tokens"].sum()
speakingtime_per_speaker = by_speaker["speaking_time"].sum()

# Combine the two totals into one table indexed by speaker.
speakers = tokens_per_speaker.to_frame().join(speakingtime_per_speaker)

# Speaking rate; the factor 60 converts a per-second rate to a per-minute one
# (assumes the segment times are in seconds -- TODO confirm).
speakers["tokens_per_minute"] = speakers["num_tokens"] / speakers["speaking_time"] * 60

# For each speaker, also record the earliest start time and the text of the
# first segment attributed to them.
speakers.index = speakers.index.astype(str)
speakers["start"] = by_speaker["start"].min()
speakers["text"] = by_speaker["text"].first()

# Show one summary line per speaker.
for speaker in speakers.index:
    print(
        speaker,
        "Start time: ", speakers.at[speaker, "start"],
        "Tokens per minute: ", speakers.at[speaker, "tokens_per_minute"],
        "Speaking time: ", speakers.at[speaker, "speaking_time"],
        speakers.at[speaker, "text"],
    )

# Save the per-speaker table.
speakers.to_csv("speakers.csv")