In [15]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

import sys, os
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd()))
sys.path.insert(0, config.root_path)
import nltk

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [16]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

<IPython.core.display.Javascript object>

In [6]:
config.root_path

'/Users/amitmaraj/Library/CloudStorage/GoogleDrive-amit.maraj@gmail.com/My Drive/SCHOOL/PhD/Code/QMSum'

<IPython.core.display.Javascript object>

In [8]:
import json

# read the dataset
# please enter the path of your data
split = "train"
data_path = "./data/ALL/jsonl/" + split + ".jsonl"
data = []
with open(data_path) as f:
    for line in f:
        data.append(json.loads(line))
n_meetings = len(data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 162 meetings in the train set.


<IPython.core.display.Javascript object>

In [21]:
from nltk import word_tokenize

# tokneize a sent
def tokenize(sent):
    tokens = " ".join(word_tokenize(sent.lower()))
    return tokens

<IPython.core.display.Javascript object>

In [27]:
# filter some noises caused by speech recognition
def clean_data(text):
    text = text.replace("{vocalsound}", "")
    text = text.replace("{disfmarker}", "")
    text = text.replace("a_m_i_", "ami")
    text = text.replace("l_c_d_", "lcd")
    text = text.replace("p_m_s", "pms")
    text = text.replace("t_v_", "tv")
    text = text.replace("{pause}", "")
    text = text.replace("{nonvocalsound}", "")
    text = text.replace("{gap}", "")
    return text

<IPython.core.display.Javascript object>

In [29]:
clean_data("Okay , it's a {disfmarker}")

"Okay , it's a "

<IPython.core.display.Javascript object>

In [7]:
# data

<IPython.core.display.Javascript object>

### Exploration

To get all the topics based on the given spans, each text span is for the turn (i.e., the speaker's turn). So each text spen is the entirety of a speakers speech for that period. Let's explore with one sample.

In [8]:
first_sample = data[1]
topic_list = first_sample["topic_list"]
meeting_transcripts = first_sample["meeting_transcripts"]

<IPython.core.display.Javascript object>

In [9]:
topic_list, meeting_transcripts

([{'topic': 'Prototype presentation', 'relevant_text_span': [['0', '38']]},
  {'topic': 'Budget balancing', 'relevant_text_span': [['39', '254']]},
  {'topic': 'Product evaluation', 'relevant_text_span': [['255', '426']]},
  {'topic': 'A summary for the whole project',
   'relevant_text_span': [['427', '516']]}],
 [{'speaker': 'Project Manager',
   'content': "That should hopefully do the trick , um . 'Kay . Sorry about the small delay . Falling a little bit behind schedule . And that's uh fifteen twenty five . Okay . So just to try and roughly go over what we agreed in the last one , um we're gonna go for something uh uh how was it ? Uh The new black , I believe ."},
  {'speaker': 'Marketing', 'content': '{gap} .'},
  {'speaker': 'Project Manager',
   'content': "Um something that looks good 'cause that seems to be in preference to actual functionality in the end , though we should never avoid functionality , of course . Uh many of our components are gonna be standard , off the shelf 

<IPython.core.display.Javascript object>

In [30]:
import pprint

# shape: [["sentence 1", "sentence 2"], ["sentence3", "sentence4"]]
text_segments = []
for topic in topic_list:
    prev_end_span = -1
    for text_span in topic["relevant_text_span"]:
        start_of_span = int(text_span[0])
        end_of_span = int(text_span[-1])

        # MISSING TOPIC CASE
        # this means that there are some sentences without a topic
        # for example, [0, 19] and [24, 29], where 20 - 23 are missing
        if start_of_span != (prev_end_span + 1):
            missing_topic_meeting = meeting_transcripts[
                prev_end_span + 1 : start_of_span - 1
            ]
            missing_topic_full_segment = []
            if len(missing_topic_meeting) != 0:
                for segment in missing_topic_meeting:
                    # clean the text before insertion
                    cleaned_content = clean_data(segment["content"])
                    # get the content for the segment and throw it into the list of segments
                    missing_topic_full_segment.append(cleaned_content)
                text_segments.append(missing_topic_full_segment)

        # TOPIC CASE
        topic_meeting = meeting_transcripts[start_of_span:end_of_span]
        if "meeting" not in topic:
            topic["meeting"] = []
        topic["meeting"].append(topic_meeting)

        full_segment = []
        for segment in topic_meeting:
            # clean the text before insertion
            cleaned_content = clean_data(segment["content"])
            # get the content for the segment and throw it into the list of segments
            full_segment.append(cleaned_content)

        text_segments.append(full_segment)

        # make sure we have the previous end span for sentences that don't have topics
        prev_end_span = end_of_span

pprint.pprint(text_segments)

[["That should hopefully do the trick , um . 'Kay . Sorry about the small "
  "delay . Falling a little bit behind schedule . And that's uh fifteen twenty "
  'five . Okay . So just to try and roughly go over what we agreed in the last '
  "one , um we're gonna go for something uh uh how was it ? Uh The new black , "
  'I believe .',
  ' .',
  "Um something that looks good 'cause that seems to be in preference to "
  'actual functionality in the end , though we should never avoid '
  'functionality , of course . Uh many of our components are gonna be standard '
  ', off the shelf , but it seemed like we were gonna require at least an '
  'advanced chip and we were still very much for the idea of using an L_C_D_ '
  'display . Um other things were we were hoping to use rubber , most likely '
  'gonna be double curved , etcetera . Okay . So um due to your hard work , we '
  'might as well let the uh two designers go first , and uh show us the '
  'prototype .',
  "Okay , it's a ",
  "Qui

<IPython.core.display.Javascript object>

### Preprocess all the data for text segmentation

In [36]:
def preprocess_text_segmentation(data):
    # shape: [["sentence 1", "sentence 2"], ["sentence3", "sentence4"]]
    text_segments = []
    for sample in data:
        # get the topic list and meeting transcript from the current sample
        topic_list = sample["topic_list"]
        meeting_transcripts = sample["meeting_transcripts"]

        for topic in topic_list:
            prev_end_span = -1
            for text_span in topic["relevant_text_span"]:
                # for this specific topic, get the relevant start and end spans
                start_of_span = int(text_span[0])
                end_of_span = int(text_span[-1])

                # MISSING TOPIC CASE
                # this means that there are some sentences without a topic
                # for example, [0, 19] and [24, 29], where 20 - 23 are missing
                if start_of_span != (prev_end_span + 1):
                    missing_topic_meeting = meeting_transcripts[
                        prev_end_span + 1 : start_of_span - 1
                    ]
                    if len(missing_topic_meeting) != 0:
                        missing_topic_full_segment = []
                        for segment in missing_topic_meeting:
                            tokenized_content = segment["content"]
                            # clean the text before insertion
                            cleaned_content = clean_data(tokenized_content)
                            # get the content for the segment and throw it into the list of segments
                            missing_topic_full_segment.append(cleaned_content)
                        text_segments.append(missing_topic_full_segment)

                # REAL TOPIC CASE
                topic_meeting = meeting_transcripts[start_of_span:end_of_span]
                if "meeting" not in topic:
                    topic["meeting"] = []
                topic["meeting"].append(topic_meeting)

                full_segment = []
                for segment in topic_meeting:
                    # get the content for the segment and throw it into the list of segments
                    full_segment.append(segment["content"])

                text_segments.append(full_segment)
    
    return text_segments


<IPython.core.display.Javascript object>

In [32]:
len(text_segments)

1443

<IPython.core.display.Javascript object>

In [33]:
print([len(s) for s in text_segments])

[53, 53, 229, 283, 175, 459, 45, 38, 38, 215, 254, 171, 426, 89, 2, 16, 19, 10, 30, 22, 53, 16, 76, 22, 99, 13, 113, 4, 28, 571, 600, 112, 713, 144, 860, 123, 12, 130, 143, 219, 363, 64, 428, 520, 19, 44, 15, 60, 96, 164, 75, 253, 250, 23, 13, 44, 78, 130, 66, 197, 222, 23, 279, 304, 9, 315, 57, 380, 55, 18, 70, 89, 107, 197, 262, 462, 131, 594, 165, 27, 27, 94, 124, 93, 41, 42, 3, 46, 26, 73, 19, 93, 10, 104, 63, 168, 19, 188, 72, 130, 130, 263, 394, 267, 13, 13, 47, 171, 31, 61, 36, 98, 72, 206, 34, 2, 45, 48, 114, 163, 323, 487, 10, 24, 30, 306, 342, 173, 537, 70, 19, 13, 33, 52, 86, 11, 98, 60, 159, 202, 68, 68, 67, 136, 76, 213, 73, 287, 42, 330, 40, 27, 27, 5, 33, 22, 56, 16, 73, 7, 81, 20, 102, 17, 8, 177, 186, 198, 385, 261, 647, 148, 24, 37, 77, 33, 123, 180, 323, 72, 396, 41, 437, 208, 4, 215, 220, 249, 487, 242, 732, 241, 140, 285, 35, 149, 42, 192, 69, 262, 23, 321, 35, 330, 330, 122, 453, 486, 940, 383, 9, 10, 76, 89, 40, 132, 126, 264, 58, 289, 60, 56, 552, 92, 14, 15, 3,

<IPython.core.display.Javascript object>

In [34]:
# average number of strings in each segment
print(sum([len(s) for s in text_segments]) / len(text_segments))

155.63478863478863


<IPython.core.display.Javascript object>

### Save the data to a file

All meetings

In [18]:
import pickle

file_name = "./all_meetings_text_segments_train.pkl"

<IPython.core.display.Javascript object>

In [38]:
open_file = open(file_name, "wb")
pickle.dump(text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

In [46]:
# open_file = open(file_name, "rb")
# loaded_list = pickle.load(open_file)
# open_file.close()
# loaded_list

<IPython.core.display.Javascript object>

Academic meetings

In [39]:
# read the dataset
# please enter the path of your data
split = "train"
data_path = "./data/Academic/jsonl/" + split + ".jsonl"
academic_data = []
with open(data_path) as f:
    for line in f:
        academic_data.append(json.loads(line))
n_meetings = len(academic_data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 41 meetings in the train set.


<IPython.core.display.Javascript object>

In [40]:
academic_text_segments = preprocess_text_segmentation(academic_data)

<IPython.core.display.Javascript object>

In [44]:
file_name = "./academic_meetings_text_segments_train.pkl"
open_file = open(file_name, "wb")
pickle.dump(academic_text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

Committee meetings

In [42]:
# read the dataset
# please enter the path of your data
split = "train"
data_path = "./data/Committee/jsonl/" + split + ".jsonl"
committee_data = []
with open(data_path) as f:
    for line in f:
        committee_data.append(json.loads(line))
n_meetings = len(committee_data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 24 meetings in the train set.


<IPython.core.display.Javascript object>

In [43]:
committee_text_segments = preprocess_text_segmentation(committee_data)

<IPython.core.display.Javascript object>

In [47]:
file_name = "./committee_meetings_text_segments_train.pkl"
open_file = open(file_name, "wb")
pickle.dump(committee_text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

Product meetings

In [48]:
# read the dataset
# please enter the path of your data
split = "train"
data_path = "./data/Product/jsonl/" + split + ".jsonl"
product_data = []
with open(data_path) as f:
    for line in f:
        product_data.append(json.loads(line))
n_meetings = len(product_data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 97 meetings in the train set.


<IPython.core.display.Javascript object>

In [49]:
product_text_segments = preprocess_text_segmentation(product_data)

<IPython.core.display.Javascript object>

In [19]:
file_name = "./committee_meetings_text_segments_train.pkl"
open_file = open(file_name, "wb")
pickle.dump(product_text_segments, open_file)
open_file.close()

NameError: name 'product_text_segments' is not defined

<IPython.core.display.Javascript object>

In [None]:
product_text_segments