# The transcript notebook analysis

Import the main libraries used in the code and define the global variables.

**transcripts_folder_path** : Folder that contains the raw transcripts.

**time_format** : The time format used for the timestamps inside the Word files (transcripts).

**speaker_time_dict** : This dictionary holds two keys for each speaker; for example, for Speaker 1:
  - **'Speaker 1 total time'**: the total duration, in seconds, that the speaker has talked in the file.
  - **'Speaker 1 number of time'**: the number of times the speaker has talked.


In [1]:
from docx import Document
import re
import os
# import openpyxl
from datetime import datetime
import pprint
import pandas as pd



# Raw .docx transcripts are read from ./raw_transcripts/ under the working directory.
transcripts_folder_path = f"{os.getcwd()}/raw_transcripts/"
# strptime layout of the "HH:MM:SS" timestamps.
time_format = "%H:%M:%S"
# Narrow, compact printer so the large result dictionaries stay readable.
pretty_printer = pprint.PrettyPrinter(compact=True, width=40)
# Module-level accumulator for the per-speaker totals.
speaker_time_dict = {}

**speaker_pattern** : The full pattern, for example, "00:00:00 Speaker 1."

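**current_discussion** : Accumulates the text of one discussion: a speaker line together with the paragraphs that follow it, up to the next speaker line. Since the transcript usually has multiple occurrences of the same speaker in a row, consecutive discussions by the same speaker are then counted as a single turn.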

**highest_speaker** : The highest speaker number found in the file, which indicates the maximum number of different speakers in it.

**discussion_number** : The total number of discussions (speaker turns) in the raw transcript; consecutive lines by the same speaker count as one discussion.


In [3]:
def calculate_total_second(spk_dict_time):
    """Return the combined talk time, in seconds, of all speakers.

    Args:
        spk_dict_time: Mapping with string keys. Values stored under keys
            that contain ``'total time'`` are summed; every other key
            (e.g. the ``'... number of time'`` counters) is ignored.

    Returns:
        The sum of all ``'total time'`` values, or 0 when there are none.
    """
    # The total used to be computed and then dropped; return it to the caller.
    return sum(
        seconds
        for key, seconds in spk_dict_time.items()
        if 'total time' in key
    )


def find_doc_files_in_current_folder() -> list:
    """List the .docx files stored in the transcripts folder.

    Names starting with '~' are left out: they are the temporary files
    that Word creates while a document is open.

    Returns:
        list: the matching file names as str
    """
    return [
        entry
        for entry in os.listdir(transcripts_folder_path)
        if not entry.startswith("~") and entry.endswith(".docx")
    ]

def extract_time_from_string(time_string, fmt="%H:%M:%S"):
    """Return the first timestamp found in *time_string*.

    The string is split on whitespace and each token is tried in order,
    so ``"00:01:05 Speaker 1"`` yields the time parsed from ``"00:01:05"``.

    Args:
        time_string: Text that may contain a timestamp token.
        fmt: ``strptime`` format a token must match (defaults to
            ``"%H:%M:%S"``).

    Returns:
        A ``datetime`` for the first matching token (``strptime`` fills the
        date with 1900-01-01), or ``None`` when no token matches.
    """
    for token in time_string.split():
        try:
            return datetime.strptime(token, fmt)
        except ValueError:
            # A non-timestamp token (e.g. "Speaker") is expected here, so
            # move on quietly instead of printing the exception class.
            continue
    return None

def calc_duration_in_second(time1, time2):
    """Return the number of seconds elapsed from *time1* to *time2*.

    The result is ``time2 - time1`` expressed in seconds, so it is negative
    when *time2* precedes *time1*.
    """
    return (time2 - time1).total_seconds()

def find_speaker_change(file_name: str) -> dict:
    """Summarise speaker turns and talk time for one transcript.

    The document is split into discussions: a paragraph starting with
    "HH:MM:SS Speaker N" opens a discussion and the following paragraphs
    belong to it. Consecutive discussions by the same speaker form one
    turn. A turn lasts from its first timestamp to the timestamp of the
    next speaker's first line; the final turn, which has no successor, is
    measured up to the last speaker line in the file.

    Args:
        file_name: Name of a .docx file inside ``transcripts_folder_path``.

    Returns:
        A dict with ``'file_name'``, ``'discussion_number'`` (number of
        turns), ``'highest_speaker'`` (largest speaker number seen) and, for
        every speaker with at least one turn, ``'Speaker N total time'``
        (seconds) and ``'Speaker N number of time'`` (turn count).
    """
    global speaker_time_dict

    doc = Document(transcripts_folder_path + file_name)

    # A speaker line starts with a timestamp followed by the speaker label.
    speaker_pattern = re.compile(
        r'(?P<time>\d{2}:\d{2}:\d{2}) (?P<speaker>Speaker (?P<number>\d+))'
    )

    # Group the paragraphs into discussions, one per speaker line.
    discussions = []
    current_discussion = ""
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if speaker_pattern.match(text):
            # A new speaker line closes the discussion collected so far.
            if current_discussion:
                discussions.append(current_discussion)
            current_discussion = text + "\n"
        else:
            current_discussion += text + "\n"
    if current_discussion:
        discussions.append(current_discussion)

    # Counters are kept in a local dict so that leftovers from an earlier
    # (possibly failed) call can never leak into this file's result.
    totals = {}

    def close_turn(speaker, seconds):
        # Credit one finished turn of `seconds` to `speaker`.
        total_key = speaker + ' total time'
        count_key = speaker + ' number of time'
        totals[total_key] = totals.get(total_key, 0) + seconds
        totals[count_key] = totals.get(count_key, 0) + 1

    current_speaker = None  # speaker of the turn in progress
    turn_start = None       # timestamp at which that turn began
    last_stamp = None       # timestamp of the most recent speaker line
    highest_speaker = 0
    discussion_number = 0

    for discussion in discussions:
        header = speaker_pattern.match(discussion)
        if header is None:
            # Text that precedes the first speaker line.
            continue

        stamp = extract_time_from_string(header.group('time'))
        # Only the speaker named in the header identifies the turn; speaker
        # names mentioned inside the spoken text must not count as a change.
        speaker = header.group('speaker')

        if speaker != current_speaker:
            if current_speaker is not None:
                close_turn(
                    current_speaker,
                    calc_duration_in_second(time1=turn_start, time2=stamp),
                )
            discussion_number += 1
            current_speaker = speaker
            turn_start = stamp
        last_stamp = stamp

        highest_speaker = max(highest_speaker, int(header.group('number')))

    # The last turn has no following speaker, so it ends at the last
    # timestamp seen. Nothing is recorded for a file without speaker lines.
    if current_speaker is not None:
        close_turn(
            current_speaker,
            calc_duration_in_second(time1=turn_start, time2=last_stamp),
        )

    print(f"file: {file_name} has {discussion_number} discussion.")

    file_analysis = {
        'file_name': file_name,
        'discussion_number': discussion_number,
        'highest_speaker': highest_speaker,
        **totals,
    }

    # Kept for backward compatibility: the module-level dict is left empty
    # after every call, as before.
    speaker_time_dict = {}
    return file_analysis




# Analyse every transcript found in the transcripts folder. To analyse a
# single file instead, assign an explicit list, e.g.
# list_of_docx_files_name = ['June 2019 ACIP Meeting - Combination Vaccines; Public Comment.docx']
list_of_docx_files_name = find_doc_files_in_current_folder()

# One summary dictionary per transcript.
data = []
for file_name in list_of_docx_files_name:
    file_summary = find_speaker_change(file_name)
    data.append(file_summary)

pretty_printer.pprint(data)

# Each summary becomes one spreadsheet row.
df = pd.DataFrame(data)
df.to_excel('ACIP_meeting_discussion_analysis.xlsx', index=False)

file: June 2019 ACIP Meeting - Votes.docx has 229 discussion.
file: June 2019 ACIP Meeting - Combination Vaccines; Public Comment.docx has 42 discussion.
file: June 2019 ACIP Generated transcript from audio.docx has 134 discussion.
[{'Speaker 1 number of time': 12,
  'Speaker 1 total time': 201.0,
  'Speaker 10 number of time': 11,
  'Speaker 10 total time': 173.0,
  'Speaker 11 number of time': 10,
  'Speaker 11 total time': 205.0,
  'Speaker 12 number of time': 11,
  'Speaker 12 total time': 147.0,
  'Speaker 13 number of time': 18,
  'Speaker 13 total time': 223.0,
  'Speaker 14 number of time': 12,
  'Speaker 14 total time': 177.0,
  'Speaker 15 number of time': 10,
  'Speaker 15 total time': 64.0,
  'Speaker 16 number of time': 5,
  'Speaker 16 total time': 15.0,
  'Speaker 17 number of time': 14,
  'Speaker 17 total time': 141.0,
  'Speaker 18 number of time': 2,
  'Speaker 18 total time': 73.0,
  'Speaker 19 number of time': 2,
  'Speaker 19 total time': 6.0,
  'Speaker 2 number

In [19]:
import pandas as pd

# Hand-written sample rows used to try out the DataFrame -> Excel export.
# Each 'Speaker N' value is a two-element list; presumably
# [total seconds, number of turns] -- TODO confirm.
data = [
    {
        'Speaker 1': [290.0, 14],
        'Speaker 10': [183.0, 1],
        'Speaker 11': [200.0, 2],
        'Speaker 12': [184.0, 2],
        'Speaker 13': [185.0, 2],
        'Speaker 14': [173.0, 1],
        'Speaker 2': [51.0, 2],
        'Speaker 3': [267.0, 3],
        'Speaker 4': [740.0, 7],
        'Speaker 5': [30.0, 2],
        'Speaker 6': [12.0, 2],
        'Speaker 7': [23.0, 1],
        'Speaker 8': [192.0, 1],
        'Speaker 9': [188.0, 2],
        'discussion_number': 42,
        'file_name': "June 2019 ACIP Meeting - Combination Vaccines; Public Comment.docx",
        'highest_speaker': 14
    },
    # Same as the first row plus a 'Speaker 15' entry, so the two rows have
    # different key sets.
    # NOTE(review): 'highest_speaker' is still 14 although 'Speaker 15' is
    # present -- confirm whether that is intended.
    {
        'Speaker 1': [290.0, 14],
        'Speaker 10': [183.0, 1],
        'Speaker 11': [200.0, 2],
        'Speaker 12': [184.0, 2],
        'Speaker 13': [185.0, 2],
        'Speaker 14': [173.0, 1],
        'Speaker 15': [173.0, 1],
        'Speaker 2': [51.0, 2],
        'Speaker 3': [267.0, 3],
        'Speaker 4': [740.0, 7],
        'Speaker 5': [30.0, 2],
        'Speaker 6': [12.0, 2],
        'Speaker 7': [23.0, 1],
        'Speaker 8': [192.0, 1],
        'Speaker 9': [188.0, 2],
        'discussion_number': 42,
        'file_name': "June 2019 ACIP Meeting - Combination Vaccines; Public Comment.docx",
        'highest_speaker': 14
    }
]

# Build one DataFrame row per dictionary and write it to 'output.xlsx'
# without the index column.
df = pd.DataFrame(data)
df.to_excel('output.xlsx', index=False)
