In [3]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

import sys, os
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd()))
sys.path.insert(0, config.root_path)
import nltk

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<IPython.core.display.Javascript object>

In [4]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

<IPython.core.display.Javascript object>

In [5]:
config.root_path

'/Users/amitmaraj/Documents/PhD/context-encoder-qmsum'

<IPython.core.display.Javascript object>

In [23]:
import json

# read the dataset
# please enter the path of your data
split = "train"
data_path = "./data/ALL/jsonl/" + split + ".jsonl"
data = []
with open(data_path) as f:
    for line in f:
        data.append(json.loads(line))
n_meetings = len(data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 162 meetings in the train set.


<IPython.core.display.Javascript object>

In [24]:
from nltk import word_tokenize

# tokneize a sent
def tokenize(sent):
    tokens = " ".join(word_tokenize(sent.lower()))
    return tokens

<IPython.core.display.Javascript object>

In [25]:
# filter some noises caused by speech recognition
def clean_data(text):
    text = text.replace("{vocalsound}", "")
    text = text.replace("{disfmarker}", "")
    text = text.replace("a_m_i_", "ami")
    text = text.replace("l_c_d_", "lcd")
    text = text.replace("p_m_s", "pms")
    text = text.replace("t_v_", "tv")
    text = text.replace("{pause}", "")
    text = text.replace("{nonvocalsound}", "")
    text = text.replace("{gap}", "")
    return text

<IPython.core.display.Javascript object>

In [26]:
clean_data("Okay , it's a {disfmarker}")

"Okay , it's a "

<IPython.core.display.Javascript object>

In [10]:
# data

<IPython.core.display.Javascript object>

### Exploration

To get all the topics based on the given spans, each text span is for the turn (i.e., the speaker's turn). So each text spen is the entirety of a speakers speech for that period. Let's explore with one sample.

In [27]:
first_sample = data[1]
topic_list = first_sample["topic_list"]
meeting_transcripts = first_sample["meeting_transcripts"]

<IPython.core.display.Javascript object>

In [28]:
topic_list, meeting_transcripts

([{'topic': 'Prototype presentation', 'relevant_text_span': [['0', '38']]},
  {'topic': 'Budget balancing', 'relevant_text_span': [['39', '254']]},
  {'topic': 'Product evaluation', 'relevant_text_span': [['255', '426']]},
  {'topic': 'A summary for the whole project',
   'relevant_text_span': [['427', '516']]}],
 [{'speaker': 'Project Manager',
   'content': "That should hopefully do the trick , um . 'Kay . Sorry about the small delay . Falling a little bit behind schedule . And that's uh fifteen twenty five . Okay . So just to try and roughly go over what we agreed in the last one , um we're gonna go for something uh uh how was it ? Uh The new black , I believe ."},
  {'speaker': 'Marketing', 'content': '{gap} .'},
  {'speaker': 'Project Manager',
   'content': "Um something that looks good 'cause that seems to be in preference to actual functionality in the end , though we should never avoid functionality , of course . Uh many of our components are gonna be standard , off the shelf 

<IPython.core.display.Javascript object>

In [15]:
import pprint

# shape: [["sentence 1", "sentence 2"], ["sentence3", "sentence4"]]
text_segments = []
for topic in topic_list:
    prev_end_span = -1
    for text_span in topic["relevant_text_span"]:
        start_of_span = int(text_span[0])
        end_of_span = int(text_span[-1])

        # MISSING TOPIC CASE
        # this means that there are some sentences without a topic
        # for example, [0, 19] and [24, 29], where 20 - 23 are missing
        if start_of_span != (prev_end_span + 1):
            missing_topic_meeting = meeting_transcripts[
                prev_end_span + 1 : start_of_span - 1
            ]
            missing_topic_full_segment = []
            if len(missing_topic_meeting) != 0:
                for segment in missing_topic_meeting:
                    # clean the text before insertion
                    cleaned_content = clean_data(segment["content"])
                    # get the content for the segment and throw it into the list of segments
                    missing_topic_full_segment.append(cleaned_content)
                text_segments.append(missing_topic_full_segment)

        # TOPIC CASE
        topic_meeting = meeting_transcripts[start_of_span:end_of_span]
        if "meeting" not in topic:
            topic["meeting"] = []
        topic["meeting"].append(topic_meeting)

        full_segment = []
        for segment in topic_meeting:
            # clean the text before insertion
            cleaned_content = clean_data(segment["content"])
            # get the content for the segment and throw it into the list of segments
            full_segment.append(cleaned_content)

        text_segments.append(full_segment)

        # make sure we have the previous end span for sentences that don't have topics
        prev_end_span = end_of_span

pprint.pprint(text_segments)

[["That should hopefully do the trick , um . 'Kay . Sorry about the small "
  "delay . Falling a little bit behind schedule . And that's uh fifteen twenty "
  'five . Okay . So just to try and roughly go over what we agreed in the last '
  "one , um we're gonna go for something uh uh how was it ? Uh The new black , "
  'I believe .',
  ' .',
  "Um something that looks good 'cause that seems to be in preference to "
  'actual functionality in the end , though we should never avoid '
  'functionality , of course . Uh many of our components are gonna be standard '
  ', off the shelf , but it seemed like we were gonna require at least an '
  'advanced chip and we were still very much for the idea of using an L_C_D_ '
  'display . Um other things were we were hoping to use rubber , most likely '
  'gonna be double curved , etcetera . Okay . So um due to your hard work , we '
  'might as well let the uh two designers go first , and uh show us the '
  'prototype .',
  "Okay , it's a ",
  "Qui

<IPython.core.display.Javascript object>

### Preprocess all the data for text segmentation

In [29]:
def preprocess_text_segmentation(data):
    # shape: [["sentence 1", "sentence 2"], ["sentence3", "sentence4"]]
    text_segments = []
    for sample in data:
        # get the topic list and meeting transcript from the current sample
        topic_list = sample["topic_list"]
        meeting_transcripts = sample["meeting_transcripts"]

        for topic in topic_list:
            prev_end_span = -1
            for text_span in topic["relevant_text_span"]:
                # for this specific topic, get the relevant start and end spans
                start_of_span = int(text_span[0])
                end_of_span = int(text_span[-1])

                # MISSING TOPIC CASE
                # this means that there are some sentences without a topic
                # for example, [0, 19] and [24, 29], where 20 - 23 are missing
                if start_of_span != (prev_end_span + 1):
                    missing_topic_meeting = meeting_transcripts[
                        prev_end_span + 1 : start_of_span - 1
                    ]
                    if len(missing_topic_meeting) != 0:
                        missing_topic_full_segment = []
                        for segment in missing_topic_meeting:
                            tokenized_content = segment["content"]
                            # clean the text before insertion
                            cleaned_content = clean_data(tokenized_content)
                            # get the content for the segment and throw it into the list of segments
                            missing_topic_full_segment.append(cleaned_content)
                        text_segments.append(missing_topic_full_segment)

                # REAL TOPIC CASE
                topic_meeting = meeting_transcripts[start_of_span:end_of_span]
                if "meeting" not in topic:
                    topic["meeting"] = []
                topic["meeting"].append(topic_meeting)

                full_segment = []
                for segment in topic_meeting:
                    # get the content for the segment and throw it into the list of segments
                    full_segment.append(segment["content"])

                text_segments.append(full_segment)
    
    return text_segments


<IPython.core.display.Javascript object>

In [30]:
len(text_segments)

7

<IPython.core.display.Javascript object>

In [31]:
print([len(s) for s in text_segments])

[38, 38, 215, 254, 171, 426, 89]


<IPython.core.display.Javascript object>

In [32]:
# average number of strings in each segment
print(sum([len(s) for s in text_segments]) / len(text_segments))

175.85714285714286


<IPython.core.display.Javascript object>

### Save the data to a file

All meetings

In [36]:
import pickle

file_name = "./all_meetings_text_segments_train.pkl"

<IPython.core.display.Javascript object>

In [38]:
open_file = open(file_name, "wb")
pickle.dump(text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

In [46]:
# open_file = open(file_name, "rb")
# loaded_list = pickle.load(open_file)
# open_file.close()
# loaded_list

<IPython.core.display.Javascript object>

Academic meetings

In [44]:
# read the dataset
# please enter the path of your data
split = "test"
data_path = "./data/Academic/jsonl/" + split + ".jsonl"
academic_data = []
with open(data_path) as f:
    for line in f:
        academic_data.append(json.loads(line))
n_meetings = len(academic_data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 9 meetings in the test set.


<IPython.core.display.Javascript object>

In [45]:
academic_text_segments = preprocess_text_segmentation(academic_data)

<IPython.core.display.Javascript object>

In [46]:
file_name = f"./academic_meetings_text_segments_{split}.pkl"
open_file = open(file_name, "wb")
pickle.dump(academic_text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

Committee meetings

In [47]:
# read the dataset
# please enter the path of your data
split = "test"
data_path = "./data/Committee/jsonl/" + split + ".jsonl"
committee_data = []
with open(data_path) as f:
    for line in f:
        committee_data.append(json.loads(line))
n_meetings = len(committee_data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 6 meetings in the test set.


<IPython.core.display.Javascript object>

In [48]:
committee_text_segments = preprocess_text_segmentation(committee_data)

<IPython.core.display.Javascript object>

In [49]:
file_name = f"./committee_meetings_text_segments_{split}.pkl"
open_file = open(file_name, "wb")
pickle.dump(committee_text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

Product meetings

In [50]:
# read the dataset
# please enter the path of your data
split = "test"
data_path = "./data/Product/jsonl/" + split + ".jsonl"
product_data = []
with open(data_path) as f:
    for line in f:
        product_data.append(json.loads(line))
n_meetings = len(product_data)
print("Total {} meetings in the {} set.".format(n_meetings, split))

Total 20 meetings in the test set.


<IPython.core.display.Javascript object>

In [51]:
product_text_segments = preprocess_text_segmentation(product_data)

<IPython.core.display.Javascript object>

In [52]:
file_name = f"./product_meetings_text_segments_{split}.pkl"
open_file = open(file_name, "wb")
pickle.dump(product_text_segments, open_file)
open_file.close()

<IPython.core.display.Javascript object>

In [53]:
product_text_segments

[["Here we go . Welcome everybody . Um , I'm Abigail Claflin . You can call me Abbie . 'S see . PowerPoint , that's not it . There we go . So this is our kick off meeting . Um and I guess we should all get acquainted  .  Let's  shall we all introduce ourselves ?",
  "Hi I'm Chiara , I'm the um Marketing Expert . Um , would you like me to talk about my aims at the moment , or would you like me to just say my name and then we can talk about business later ?",
  "I think we'll get around to that , yeah . So this is just introductions yeah .",
  "We'll get round to that later . My name is Chiara and I'm the Marketing Expert .",
  "Okay . I forgot to s say I'm the Project Manager but I figured you all knew that already ,",
  '',
  '',
  ' um so .',
  " I'm Stephanie and I am the User Interface Designer ."],
 ["Okay . Um so f here's our agenda for today . Um we're gonna do some tool training , project plan and discuss then close . {vocalsound} Um so . So our aim is to produce a remote contro

<IPython.core.display.Javascript object>