# The transcript notebook analysis

Importing the main libraries used inside the code and defining the global variables

In [45]:
from docx import Document
import re
import os
import openpyxl
from datetime import datetime
import pprint



# Folder, under the current working directory, that is scanned for transcripts.
transcripts_folder_path = f"{os.getcwd()}/raw_transcripts/"
# Layout string for HH:MM:SS clock times.
time_format = "%H:%M:%S"
# Shared pretty-printer: compact layout, output wrapped at 40 columns.
pretty_printer = pprint.PrettyPrinter(compact=True, width=40)

In [50]:
def calculate_total_second(spk_dict_time):
    """Print each duration, a separator and the grand total; return the total.

    Args:
        spk_dict_time: mapping whose values are numeric durations. Only the
            values are read; the keys are ignored.

    Returns:
        The sum of all values (``0`` for an empty mapping). Returning the
        total lets callers reuse it instead of only seeing it on stdout.
    """
    t_value = 0
    for value in spk_dict_time.values():
        print(value)
        t_value += value

    print("--------")
    print(t_value)
    return t_value


def find_doc_files_in_current_folder() -> list:
    """Collect the names of the .docx files in the transcripts folder.

    The folder scanned is ``transcripts_folder_path``. Names starting with
    "~" are left out: those are the temporary files Word creates while a
    document is open.

    Returns:
        list: the matching file names as str, in the order ``os.listdir``
        reports them.
    """
    return [
        entry
        for entry in os.listdir(transcripts_folder_path)
        if entry.endswith(".docx") and not entry.startswith("~")
    ]

def extract_time_from_string(time_string):
    """Return the first ``HH:MM:SS`` token of *time_string* as a datetime.

    The string is split on whitespace and each token is tried in order.

    Args:
        time_string: text that may contain a clock time, e.g.
            ``"00:01:02 Speaker 1"``.

    Returns:
        The parsed ``datetime`` (``strptime`` fills in the date 1900-01-01
        because only a time is given), or ``None`` when no token parses.
    """
    for part in time_string.split():
        try:
            return datetime.strptime(part, "%H:%M:%S")
        except ValueError:
            # This token is not a clock time; keep scanning the rest.
            continue
    return None

def calc_duration_in_second(time1, time2):
    """Return how many seconds *time2* lies after *time1*.

    Both arguments must support subtraction yielding an object with
    ``total_seconds()`` (e.g. two ``datetime`` values). The result is
    negative when *time2* is earlier than *time1*.
    """
    return (time2 - time1).total_seconds()

# A header paragraph starts with "HH:MM:SS Speaker N". Groups: the clock time,
# the full speaker label ("Speaker N"), and the speaker number on its own.
_SPEAKER_HEADER_RE = re.compile(r'(\d{2}:\d{2}:\d{2}) (Speaker (\d+))')


def _iter_speaker_headers(paragraph_texts):
    """Yield (timestamp, speaker label, speaker number) per header paragraph."""
    for text in paragraph_texts:
        header = _SPEAKER_HEADER_RE.match(text)
        if header is None:
            continue
        timestamp = extract_time_from_string(header.group(1))
        if timestamp is None:
            # The digits do not form a valid clock time, so the paragraph
            # cannot be placed on the timeline; treat it as ordinary text.
            continue
        yield timestamp, header.group(2), int(header.group(3))


def _summarize_speaker_turns(headers):
    """Return (turn count, highest speaker number, seconds spoken per speaker)."""
    speaker_time_dict = {}
    discussion_number = 0
    highest_speaker = 0
    current_speaker = None
    turn_start = None
    last_seen = None

    for timestamp, speaker, speaker_id in headers:
        highest_speaker = max(highest_speaker, speaker_id)

        if speaker != current_speaker:
            if current_speaker is not None:
                # The previous turn lasts until the new speaker's header.
                speaker_time_dict[current_speaker] += calc_duration_in_second(
                    time1=turn_start, time2=timestamp
                )
            # Register the speaker when the turn starts so the final
            # accumulation below can never hit a missing key.
            speaker_time_dict.setdefault(speaker, 0)
            current_speaker = speaker
            turn_start = timestamp
            discussion_number += 1

        # Remember the newest header; it closes the final turn.
        last_seen = timestamp

    if current_speaker is not None:
        # Nothing follows the last turn, so it is measured up to the last
        # header seen inside it (zero when that turn has a single header).
        speaker_time_dict[current_speaker] += calc_duration_in_second(
            time1=turn_start, time2=last_seen
        )

    return discussion_number, highest_speaker, speaker_time_dict


def find_speaker_change(file_name: str) -> list:
    """Count speaker turns in one transcript and report per-speaker talk time.

    The document is read from ``transcripts_folder_path + file_name``. A
    paragraph starting with ``HH:MM:SS Speaker N`` is a header; consecutive
    headers by the same speaker belong to one turn ("discussion"). Each turn
    is timed from its first header to the next speaker's first header. The
    last turn is timed up to the last header it contains.

    The speaker is taken from the header only, so "Speaker N" mentioned in
    the spoken text does not create a turn. Transcripts with no header, a
    single speaker, or a last paragraph that opens a new turn are handled
    without raising.

    Side effects: pretty-prints the per-speaker seconds, prints each value
    and the total via ``calculate_total_second``, and prints a summary line.

    Args:
        file_name: name of the .docx file inside the transcripts folder.

    Returns:
        list: ``[file_name, number of turns, highest speaker number]``; both
        numbers are 0 when the document contains no header paragraph.
    """
    doc = Document(transcripts_folder_path + file_name)
    headers = _iter_speaker_headers(
        paragraph.text for paragraph in doc.paragraphs
    )
    discussion_number, highest_speaker, speaker_time_dict = (
        _summarize_speaker_turns(headers)
    )

    pretty_printer.pprint(speaker_time_dict)
    calculate_total_second(speaker_time_dict)

    print(f"file: {file_name} has {discussion_number} discussion.")

    return [file_name, discussion_number, highest_speaker]




# Discover the transcript file names in the transcripts folder.
list_of_docx_files_name = find_doc_files_in_current_folder()
# print(list_of_docx_files_name)
# print("=============")
# NOTE(review): the assignment below replaces the discovered list with one
# hard-coded file name, so only that transcript is processed. It looks like a
# debugging override -- confirm before running against the whole folder.
list_of_docx_files_name = ['June 2019 ACIP Meeting - Combination Vaccines; Public Comment.docx']

# Build a workbook on its active sheet: a header row, then one row per file.
workbook = openpyxl.Workbook()
sheet = workbook.active
excel_row = ['name', 'discussion number', 'highest speaker number']
sheet.append(excel_row)
for file_name in list_of_docx_files_name:
    # find_speaker_change returns [file name, discussion count, highest speaker number].
    excel_row = find_speaker_change(file_name)
    sheet.append(excel_row)


# Save the workbook under this relative file name.
workbook.save("ACIP meeting discussion analysis.xlsx")

{'Speaker 1': 290.0,
 'Speaker 10': 183.0,
 'Speaker 11': 200.0,
 'Speaker 12': 184.0,
 'Speaker 13': 185.0,
 'Speaker 14': 173.0,
 'Speaker 2': 51.0,
 'Speaker 3': 267.0,
 'Speaker 4': 740.0,
 'Speaker 5': 30.0,
 'Speaker 6': 12.0,
 'Speaker 7': 23.0,
 'Speaker 8': 192.0,
 'Speaker 9': 188.0}
290.0
51.0
267.0
740.0
30.0
12.0
23.0
192.0
188.0
183.0
200.0
184.0
185.0
173.0
--------
2718.0
file: June 2019 ACIP Meeting - Combination Vaccines; Public Comment.docx has 42 discussion.


In [None]:
# for value in speaker_time_dict.values():
#     print(value)