## Converting Emails into ConvoKit format
This conversion is needed before the politeness models can be applied.

In [1]:
try:
    import convokit
except ModuleNotFoundError:
    !pip install convokit

Collecting convokit
  Downloading convokit-3.0.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.2/183.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl (6.9 kB)
Collecting dill>=0.2.9 (from convokit)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m8.2 MB/s[0m eta [36

In [4]:
import pandas as pd
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict
import re

### 1. Load emails

In [55]:
# Directory holding the data, and the name of the email CSV inside it.
file_path = '/content/drive/MyDrive/COMP550/data'
file_name = 'normal_emails.csv'

# Read the emails; the 'Original Index' column becomes the DataFrame index.
df = pd.read_csv("/".join((file_path, file_name)), index_col='Original Index')

In [56]:
# One Speaker per distinct value of the 'Sender' column, keyed by that value.
corpus_speakers = dict(
    (sender, Speaker(id=sender)) for sender in df['Sender'].unique()
)

Sanity-checking the speaker-level data:

In [57]:
# Report how many entries `corpus_speakers` holds.
print("number of speakers in the data = {}".format(len(corpus_speakers)))

number of speakers in the data = 13


### __2. Creating utterance objects__

In [58]:
# Alias only: this binds a second name to the same DataFrame object as `df`
# (no copy is made).
utterance_data = df

In [60]:
# Build one Utterance per row of `utterance_data`, keyed by the row's index value.
utterance_corpus = {}

for idx, utterance in tqdm(utterance_data.iterrows()):
    # No try/except here on purpose: the previous bare `except` printed an
    # undefined name, which replaced the real error (e.g. a missing column)
    # with an unrelated NameError. Let the original exception surface instead.
    speaker, text = utterance['Sender'], utterance['Email']

    # Per-email metadata carried over from the CSV columns.
    meta = {
        'poi': utterance['POI'],
        'exec_200': utterance['Exec 200'],
        'exec_300': utterance['Exec 300'],
        'date': utterance['Date'].strip(),  # drop surrounding whitespace
    }

    # root & reply_to will be updated later, timestamp is not applicable
    utterance_corpus[idx] = Utterance(id=idx, speaker=corpus_speakers[speaker], text=text, meta=meta)

print("Total number of utterances = {}".format(len(utterance_corpus)))

33950it [00:02, 11343.94it/s]

Total number of utterances = 33950





In [62]:
# Materialise the utterances as an actual list: dict.values() alone is only a
# live view of `utterance_corpus`, not the list that the name promises.
utterance_list = list(utterance_corpus.values())

### Make the Corpus object

In [63]:
# Build the Corpus from the Utterance objects collected in `utterance_list`.
# Note that by default the version number is incremented
email_corpus = Corpus(utterances=utterance_list)

### Processing utterance texts

In [67]:
from convokit.text_processing import TextParser

In [68]:
# verbosity=10000: judging by the recorded output of the transform cell,
# progress is reported every 10000 utterances.
parser = TextParser(verbosity=10000)

In [69]:
# Run the parser over the corpus and rebind the result to the same name.
# The parse of an utterance is read back later via retrieve_meta('parsed').
email_corpus = parser.transform(email_corpus)

10000/33950 utterances processed
20000/33950 utterances processed
30000/33950 utterances processed
33950/33950 utterances processed


In [70]:
# `utt_id` is not defined by any earlier cell, so on a fresh kernel this cell
# raised NameError. Pick an utterance explicitly (the first id in the corpus)
# so the cell survives Restart & Run All.
utt_id = email_corpus.get_utterance_ids()[0]

# Show the parse that was stored under the 'parsed' metadata key.
email_corpus.get_utterance(utt_id).retrieve_meta('parsed')

[{'rt': 1,
  'toks': [{'tok': 'Attached',
    'tag': 'VBN',
    'dep': 'acomp',
    'up': 1,
    'dn': []},
   {'tok': 'is', 'tag': 'VBZ', 'dep': 'ROOT', 'dn': [0, 3, 7]},
   {'tok': 'the', 'tag': 'DT', 'dep': 'det', 'up': 3, 'dn': []},
   {'tok': 'information', 'tag': 'NN', 'dep': 'attr', 'up': 1, 'dn': [2, 6]},
   {'tok': 'you', 'tag': 'PRP', 'dep': 'nsubj', 'up': 6, 'dn': []},
   {'tok': 'have', 'tag': 'VBP', 'dep': 'aux', 'up': 6, 'dn': []},
   {'tok': 'requested', 'tag': 'VBN', 'dep': 'relcl', 'up': 3, 'dn': [4, 5]},
   {'tok': '.', 'tag': '.', 'dep': 'punct', 'up': 1, 'dn': []}]},
 {'rt': 0,
  'toks': [{'tok': 'Thanks', 'tag': 'NNS', 'dep': 'ROOT', 'dn': [1, 4]},
   {'tok': ',', 'tag': ',', 'dep': 'punct', 'up': 0, 'dn': [2]},
   {'tok': '\n', 'tag': '_SP', 'dep': 'dep', 'up': 1, 'dn': []},
   {'tok': 'Brad', 'tag': 'NNP', 'dep': 'compound', 'up': 4, 'dn': []},
   {'tok': 'Jones', 'tag': 'NNP', 'dep': 'appos', 'up': 0, 'dn': [3]}]}]

### __Saving created datasets__
To complete the final step of dataset conversion, we save the corpus so that it can be loaded later for reuse. You may want to specify a name. By default, saved corpora are written to __~/.convokit/saved-corpora__ (under your home directory), but you can also specify a different location via `base_path`, as done below.

In [71]:
# Save the corpus under the CSV's name minus its extension.
# The dot is escaped and the pattern anchored to the end of the name: the old
# pattern ".csv" matched any character followed by "csv" anywhere in the
# string (e.g. "my_csv_file.csv" would have become "my_file").
email_corpus.dump(re.sub(r"\.csv$", "", file_name), base_path="/content/drive/MyDrive/COMP550/convokit_corpora")