In [30]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from copy import deepcopy
import re
import pandas as pd

In [31]:
filename = "meet2_fin"  # Example filename without extension

In [11]:
def find_speaker(frame,overlay=True):
    frame = cv2.resize(frame, (1000, 750))

    # Convert to HSV color space
    hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    lower_green = np.array([20, 40, 60])
    upper_green = np.array([80, 255, 255])

    # Create a mask for green color
    mask = cv2.inRange(hsv, lower_green, upper_green)

    # Find contours in the mask
    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    max_rect = None
    max_rect_area = 0
    if contours:
        for contour in contours:
            # Approximate the contour to a polygon
            approx = cv2.approxPolyDP(contour, 0.02 * cv2.arcLength(contour, True), True)

            # If the polygon has four sides, it could be a rectangle
            if len(approx) == 4:
                # Get the bounding box coordinates
                x, y, w, h = cv2.boundingRect(approx)
                if w * h > max_rect_area:
                    max_rect = {"x": x, "y": y, "w": w, "h": h}
                    max_rect_area = max_rect["w"] * max_rect["h"]
    cutout=None
    if max_rect:
        cutout=frame[max_rect["y"]:max_rect["y"]+max_rect["h"], max_rect["x"]:max_rect["x"]+max_rect["w"]]
        if overlay==True:
            cv2.rectangle(frame, (max_rect["x"], max_rect["y"]),
                          (max_rect["x"] + max_rect["w"], max_rect["y"] + max_rect["h"]), (0, 0, 255), 3)
    return frame,max_rect,cutout

In [12]:
from PIL import Image , ImageOps
import pytesseract
def get_speaker_name(cutout):
#     plt.imshow(cutout)
#     plt.show()
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Change this to your Tesseract path
    hsv = cv2.cvtColor(cutout, cv2.COLOR_BGR2HSV)
    lower_white = np.array([0, 0, 200])
    upper_white = np.array([180, 55, 255])
    mask = cv2.inRange(hsv, lower_white, upper_white)
    result = cv2.bitwise_and(cutout, cutout, mask=mask)
    text = pytesseract.image_to_string(result)
    print("text:",text)
    speaker_name=text.replace('\n','')
    speaker_name = re.sub(r'[^A-Za-z0-9 ]+', '', speaker_name)
    if speaker_name=="":
        speaker_name="No Speaker"
    print(speaker_name)
    return text 

In [13]:
import Levenshtein as lev

def find_similar(name, name_list):
    similarity_threshold = 0.5
    for existing_name in name_list:
        similarity = lev.ratio(existing_name, name)
        if similarity >= similarity_threshold:
            return True, existing_name
    return False, None

In [16]:

cap = cv2.VideoCapture(f'dataset/{filename}.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)

output_video_path=f"dataset/pre_proc/{filename}_overlay.mp4"
if os.path.exists(output_video_path):
    os.remove(output_video_path)
# Define the codec and create VideoWriter object to save the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # or 'XVID' for .avi format
out = cv2.VideoWriter(output_video_path, fourcc, fps, (1000, 750))

frame_count = 0

# Calculate the number of frames to skip (process every 1 second)
skip_frames = int(fps)

panel_center_x=-100
panel_center_y=-100
cur_speaker_name="No Speaker"
speaker_names=[]
df = pd.DataFrame(columns=["time","speaker"])
time_sec=0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Process every 5th frame
    if frame_count % skip_frames == 0:
        frame,rect,cutout=find_speaker(frame)
        
        if not (cur_speaker_name!="No Speaker" and (panel_center_x-50<rect["x"]<panel_center_x+50 and \
                    panel_center_y-50<rect["y"]<panel_center_y+50)):
            cur_speaker_name=get_speaker_name(cutout)
        
        print("cur_speaker_name",cur_speaker_name)  
        print("speaker_names",speaker_names)
        is_similar,similar_speaker_name= find_similar(cur_speaker_name, speaker_names)
        if is_similar==False:
            speaker_names.append(cur_speaker_name)
        else:
            cur_speaker_name=similar_speaker_name
            
        text=cur_speaker_name    
#         print(text)
        # Write the processed frame to the output video

        position = (50, 50)  # (x, y) coordinates

        # Define the font and text properties
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 1
        font_color = (255, 255, 255)  # White color in BGR
        font_thickness = 2

        # Overlay the text on the image
        cv2.putText(frame, text, position, font, font_scale, font_color, font_thickness)
        time_sec=time_sec+1
        df.loc[len(df)] = {"time":time_sec,"speaker":cur_speaker_name}

    out.write(frame)
    frame_count += 1
df.to_csv(f"dataset/pre_proc/{filename}_speakers.csv")
# Release everything when the job is finished
cap.release()
out.release()
cv2.destroyAllWindows()

text: Kristie KT Thomas,

Kristie KT Thomas
cur_speaker_name Kristie KT Thomas,

speaker_names []
text: Kristie KT' Thomas.

Kristie KT Thomas
cur_speaker_name Kristie KT' Thomas.

speaker_names ['Kristie KT Thomas,\n']
text: Kristie KT' Thomas.

Kristie KT Thomas
cur_speaker_name Kristie KT' Thomas.

speaker_names ['Kristie KT Thomas,\n']
text: Eric vohnson - CTO.

Eric vohnson  CTO
cur_speaker_name Eric vohnson - CTO.

speaker_names ['Kristie KT Thomas,\n']
text: Eric vohnson - CTO.

Eric vohnson  CTO
cur_speaker_name Eric vohnson - CTO.

speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']
text: Eric vohnson - CTO.

Eric vohnson  CTO
cur_speaker_name Eric vohnson - CTO.

speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']
text: Eric vohnson - CTO.

Eric vohnson  CTO
cur_speaker_name Eric vohnson - CTO.

speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']
text: Eric vohnson - CTO.

Eric vohnson  CTO
cur_speaker_name Eric vohnson - CTO.

speaker_name

text: Fric sonnson - CTO.

Fric sonnson  CTO
cur_speaker_name Fric sonnson - CTO.

speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']
text: Eri¢ onnson- CTO” |,


Eri onnson CTO 
cur_speaker_name Eri¢ onnson- CTO” |,


speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']
text: Fic sonnson-CTO

Fic sonnsonCTO
cur_speaker_name Fic sonnson-CTO

speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']
text: Bric sonnson - CTO.

Bric sonnson  CTO
cur_speaker_name Bric sonnson - CTO.

speaker_names ['Kristie KT Thomas,\n', 'Eric vohnson - CTO.\n']


In [15]:
cap.release()
out.release()
cv2.destroyAllWindows()

In [32]:
from moviepy.editor import VideoFileClip
import os
from IPython.display import Audio
from pydub import AudioSegment
import deepspeech
import numpy as np
import wave

In [33]:

input_video_path = os.path.join("dataset", f"{filename}.mp4")
output_audio_path = os.path.join("dataset", "pre_proc", f"{filename}.mp3")

In [34]:
video = VideoFileClip(input_video_path)
audio = video.audio
audio.write_audiofile(output_audio_path)

MoviePy - Writing audio in dataset\pre_proc\meet2_fin.mp3


                                                                                                                       

MoviePy - Done.




In [35]:
audio = Audio(output_audio_path, autoplay=True)
audio

In [36]:
audio_path_wav = f"{output_audio_path[:-4]}.wav"
audio = AudioSegment.from_mp3(output_audio_path)
audio = audio.set_channels(1)  # Convert to mono
audio = audio.set_frame_rate(16000)  # Set frame rate to 16kHz
audio.export(audio_path_wav, format="wav")

<_io.BufferedRandom name='dataset\\pre_proc\\meet2_fin.wav'>

In [37]:

def split_audio(df, audio_path_wav, output_dir):
    segments=[]
    start_time=0
    duration=1
    cur_speaker=df["speaker"][0]
    speaker_map={}
    speaker_id=1
    for i in range(1,len(df)):
        if df["speaker"][i] not in speaker_map.keys():
            speaker_map[str(df["speaker"][i])]=str(speaker_id)
            speaker_id=speaker_id+1
            
        if cur_speaker!=df["speaker"][i] or i==len(df)-1:
            segments.append({"start":start_time,"end":start_time+duration,"speaker_id":speaker_map[str(df["speaker"][i-1])]})
            cur_speaker=df["speaker"][i]
            start_time=df["time"][i]
        else:
            duration=duration+1
    count=0
    for segment in segments:
        count=count+1
        output = f"{output_dir}\\a{count}_{str(segment['speaker_id'])}.wav"

        sound = AudioSegment.from_wav(audio_path_wav) # for mp3: AudioSegment.from_mp3()

        StrtTime = float(segment["start"]) * 1000
        EndTime  = float(segment["end"]) * 1000
        extract = sound[StrtTime:EndTime]

        # save
        extract.export(output, format="wav")

    return speaker_map


In [38]:
import shutil
output_dir = f"dataset/pre_proc/{filename}_audio_splits"

# Delete the folder if it exists
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

# Create a fresh new folder
os.makedirs(output_dir, exist_ok=True)

df=pd.read_csv(f"dataset/pre_proc/{filename}_speakers.csv")
speaker_map=split_audio(df, audio_path_wav, output_dir)

In [39]:
import whisper
import glob
model_whisper_t = whisper.load_model("tiny")

In [41]:
reversed_speaker_map = {v: k for k, v in speaker_map.items()}
audio_splits=glob.glob(f"{output_dir}/*")
for audio in audio_splits: 
    key = f'{audio.rsplit("_", 1)[-1][:-4]}'  # Extract the key from the audio file name
    if key in reversed_speaker_map:
        speaker = reversed_speaker_map[key]
        predicted_text = model_whisper_t.transcribe(audio)["text"]
        print(f"Speaker {speaker}:")
        print(predicted_text)
    else:
        # Handle the case where the key is not found in the dictionary
        pass

Speaker Eric Jonnson - CTO.
:
 And the reason we measure data is that like one of the most likely failure modes is that we lose the community. Yeah. So we're just, or we're just goofy as the, um,
Speaker Sid Siprand- CEO. py
:
 like one of the most likely failure modes is that we lose the community. Yeah. So we're just, or we're just goofy as the,
Speaker 4

:
 most likely failure modes is that we lose the community. Yeah. So we're just, or we're gets goofy as is the,
Speaker Sid Siprand- CEO. py
:
 your modes is that we lose the community. Yeah. So we're just, or we're gets goofy as is the
Speaker nan:
 that we lose the community. Yeah. So we're just, or we're gets goofy as is the, um,
Speaker Sid Siprand- CEO. py
:
 Yeah. So we're just, or we're gets goofy as is the, um,
Speaker Eric Jonnson - CTO.
:
 Eric, where gets goofy is, is that,
Speaker Eric Jonnson - CTO.
:
 Well, I mean, development is larger, maybe they give more frequently or something. I'll see how it goes. All right. An

In [None]:
# Summarizer pipeline