# The transcript notebook analysis

Import the main libraries used in the code and define the global variables.

In [35]:
from docx import Document
import re
import os
import openpyxl
from datetime import datetime
import pprint



# Folder, located under the current working directory, that holds the raw
# .docx transcripts. The trailing slash is kept because callers build file
# paths by plain string concatenation.
transcripts_folder_path = f"{os.getcwd()}/raw_transcripts/"
# strptime layout of the timestamps found in the transcripts (HH:MM:SS).
time_format = "%H:%M:%S"
# Shared printer used to dump result dictionaries in a narrow, compact form.
pretty_printer = pprint.PrettyPrinter(compact=True, width=40)

In [44]:
def find_doc_files_in_current_folder() -> list:
    """List the Word transcripts stored in the transcripts folder.

    Despite its name, the function scans ``transcripts_folder_path`` (not the
    current working directory itself) and does not descend into sub-folders.

    Returns:
        list: the matching file names as str, without the folder prefix, in
        the order reported by ``os.listdir``.
    """
    return [
        entry
        for entry in os.listdir(transcripts_folder_path)
        # Names starting with "~" are the temporary files Word creates while
        # a document is open; they are not real transcripts.
        if entry.endswith(".docx") and not entry.startswith("~")
    ]

def extract_time_from_string(time_string):
    """Return the first ``HH:MM:SS`` time found in a string.

    The string is split on whitespace and each token is tried in order.

    Args:
        time_string (str): text such as ``"00:01:10 Speaker 1"``.

    Returns:
        datetime | None: the parsed time (``strptime`` fills in the date
        1900-01-01), or ``None`` when no token is a valid ``HH:MM:SS`` time.
    """
    for part in time_string.split():
        try:
            return datetime.strptime(part, "%H:%M:%S")
        except ValueError:
            # Not a time token: move on to the next one. Nothing is printed,
            # because a non-matching token is expected, not an error.
            continue
    return None

def calc_duration_in_second(time1, time2):
    """Return the number of seconds elapsed from ``time1`` to ``time2``.

    Args:
        time1 (datetime): start of the interval.
        time2 (datetime): end of the interval.

    Returns:
        float: ``time2 - time1`` in seconds; negative when ``time2`` is
        earlier than ``time1``.
    """
    return (time2 - time1).total_seconds()

# A transcript header at the start of a paragraph, e.g. "00:01:10 Speaker 12".
# Groups: 1 = time text, 2 = speaker label ("Speaker 12"), 3 = speaker number.
_SPEAKER_HEADER_RE = re.compile(r'(\d{2}:\d{2}:\d{2}) (Speaker (\d+))\b')


def _iter_speaker_headers(paragraph_texts):
    """Yield (timestamp, speaker label, speaker number) for each header paragraph."""
    for text in paragraph_texts:
        header = _SPEAKER_HEADER_RE.match(text)
        if header is None:
            # Body text (or preamble before the first header): not needed for
            # the statistics computed here.
            continue
        timestamp = extract_time_from_string(header.group(1))
        yield timestamp, header.group(2), int(header.group(3))


def _elapsed_seconds(start, end):
    """Return the seconds between two timestamps, or 0 when either is unknown."""
    if start is None or end is None:
        return 0
    return calc_duration_in_second(time1=start, time2=end)


def _summarize_turns(headers):
    """Return (turn count, highest speaker number, seconds per speaker) for the headers."""
    speaker_time_dict = {}
    discussion_number = 0
    highest_speaker = 0
    current_speaker = None   # label of the speaker whose turn is in progress
    turn_start = None        # timestamp at which the current turn began
    last_seen = None         # most recent usable timestamp

    for timestamp, speaker, number in headers:
        if timestamp is None:
            # The digits matched the header layout but are not a valid time
            # (e.g. "25:00:00"): reuse the previous timestamp instead of
            # failing on arithmetic with None.
            timestamp = last_seen

        highest_speaker = max(highest_speaker, number)

        # Only the header's speaker decides whether the turn changed; other
        # "Speaker N" mentions inside the spoken text are ignored.
        if speaker != current_speaker:
            discussion_number += 1

            if current_speaker is not None:
                print(timestamp)
                print(current_speaker)
                print("============")
                # The previous turn ends when the new speaker starts.
                speaker_time_dict[current_speaker] += _elapsed_seconds(turn_start, timestamp)

            # Register the speaker as soon as the turn starts so that even a
            # speaker whose only turn is the last one gets an entry.
            speaker_time_dict.setdefault(speaker, 0)
            current_speaker = speaker
            turn_start = timestamp

        last_seen = timestamp

    # The transcript does not say when the final turn ends, so credit it up
    # to the last timestamp seen within that turn (0 seconds if it had only
    # one header).
    if current_speaker is not None:
        speaker_time_dict[current_speaker] += _elapsed_seconds(turn_start, last_seen)

    return discussion_number, highest_speaker, speaker_time_dict


def find_speaker_change(file_name : str) -> list:
    """Count the speaker turns of one transcript and time each speaker.

    The document is read from ``transcripts_folder_path + file_name``. A
    paragraph starting with ``HH:MM:SS Speaker N`` is a header; consecutive
    headers of the same speaker belong to the same turn ("discussion"). The
    per-speaker speaking time (in seconds) is pretty-printed and a summary
    line is printed; neither is part of the return value.

    Transcripts with no header, a single speaker, or a last speaker who
    appears only once are handled without raising.

    Args:
        file_name (str): name of the .docx file inside the transcripts folder.

    Returns:
        list: ``[file_name, discussion_number, highest_speaker]`` where
        ``discussion_number`` is the number of speaker turns and
        ``highest_speaker`` is the largest speaker number found in a header
        (0 when the document has no header).
    """
    doc = Document(transcripts_folder_path+file_name)

    discussion_number, highest_speaker, speaker_time_dict = _summarize_turns(
        _iter_speaker_headers(paragraph.text for paragraph in doc.paragraphs)
    )

    pretty_printer.pprint(speaker_time_dict)
    print(f"file: {file_name} has {discussion_number} discussion.")

    return [file_name, discussion_number, highest_speaker]




# Discover every .docx transcript available in the transcripts folder.
list_of_docx_files_name = find_doc_files_in_current_folder()
# NOTE(review): the discovered list is replaced right away by one hard-coded
# file, so only this transcript is analysed. Presumably a leftover from
# debugging a single meeting; confirm before running on the whole folder.
list_of_docx_files_name = ['June 2019 ACIP Meeting - Votes.docx']

# Build the result workbook: a header row, then one row per transcript.
workbook = openpyxl.Workbook()
sheet = workbook.active
excel_row = ['name', 'discussion number', 'highest speaker number']
sheet.append(excel_row)
for file_name in list_of_docx_files_name:
    # find_speaker_change returns [file_name, discussion_number, highest_speaker],
    # which lines up with the header columns above.
    excel_row = find_speaker_change(file_name)
    sheet.append(excel_row)

# The file name is a relative path, so the workbook is written relative to
# the current working directory.
workbook.save("ACIP meeting discussion analysis.xlsx")

1900-01-01 00:01:10
Speaker 1
1900-01-01 00:04:40
Speaker 2
1900-01-01 00:05:06
Speaker 3
1900-01-01 00:05:10
Speaker 5
1900-01-01 00:05:14
Speaker 7
1900-01-01 00:05:16
Speaker 4
1900-01-01 00:05:19
Speaker 6
1900-01-01 00:05:22
Speaker 8
1900-01-01 00:05:25
Speaker 9
1900-01-01 00:05:30
Speaker 10
1900-01-01 00:05:34
Speaker 11
1900-01-01 00:05:38
Speaker 12
1900-01-01 00:05:40
Speaker 13
1900-01-01 00:05:43
Speaker 14
1900-01-01 00:05:49
Speaker 15
1900-01-01 00:06:12
Speaker 3
1900-01-01 00:06:34
Speaker 1
1900-01-01 00:07:08
Speaker 2
1900-01-01 00:07:20
Speaker 3
1900-01-01 00:07:28
Speaker 12
1900-01-01 00:07:31
Speaker 11
1900-01-01 00:07:33
Speaker 10
1900-01-01 00:07:36
Speaker 9
1900-01-01 00:07:40
Speaker 16
1900-01-01 00:07:46
Speaker 7
1900-01-01 00:08:32
Speaker 3
1900-01-01 00:08:52
Speaker 4
1900-01-01 00:09:26
Speaker 6
1900-01-01 00:09:50
Speaker 10
1900-01-01 00:11:28
Speaker 11
1900-01-01 00:11:31
Speaker 13
1900-01-01 00:12:00
Speaker 12
1900-01-01 00:12:49
Speake