 # Clean data

Running the following cells reads data of the form

    {"author": "name_speaker1", "quote":"quote1"}
    {"author": "name_speaker2", "quote": "quote2"}
    ...

from a JSON Lines file called ``raw_data.jl`` and saves a text file called ``clean_data.txt`` of the form

    agent_name_speaker1 cleaned_quote1 
    agent_name_speaker2 cleaned_quote2 
    ...
    
Cleaning includes lowercasing, removing non-essential punctuation, and joining frequently co-occurring words into n-grams.

The input file has to be in the same folder as this notebook.

The code streams the data line by line instead of loading it into memory all at once, so it works on large files; up to 5GB of text should not be a problem. 

In [1]:
import json
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
import string
import os

## 1. Remove punctuation, make lower case, and prepend speaker token to utterances

In [2]:
# Typographic quotes, the ellipsis and the en dash are separate Unicode characters
# that the ASCII-only string.punctuation does not cover, so they are listed explicitly.
_TYPOGRAPHIC_MARKS = "“”’‘‚…–"

# Symbols that must survive cleaning: "_" joins the parts of a token,
# while "#" and "@" are kept so they are preserved in tweets.
_PROTECTED_SYMBOLS = "_#@"

# Every character in this string is deleted from the text during cleaning.
PUNCTUATION = "".join(
    symbol for symbol in string.punctuation if symbol not in _PROTECTED_SYMBOLS
) + _TYPOGRAPHIC_MARKS

In [3]:
# Translation table that deletes every character listed in PUNCTUATION.
strip_punctuation = str.maketrans("", "", PUNCTUATION)

# Stream the raw file row by row; this is preferred to loading it
# into a dataframe when files are very big.
# Both files are managed by a single `with`, so they are closed even if a row raises.
# The input is opened first, so a missing raw_data.jl does not truncate an existing output.
# The output is opened once in write mode: re-running this cell replaces
# clean_data.txt instead of appending a second copy of every row.
with open("raw_data.jl", "r") as data_loader, open("clean_data.txt", "w") as clean_file:
    for idx, row in enumerate(data_loader):

        try:
            # retrieve the text
            row_dict = json.loads(row)
            text = row_dict["quote"]

        except json.decoder.JSONDecodeError:
            # a malformed row stops processing; rows written so far are kept
            print(f"Decoding problem in row {idx} with content <{row}>. Breaking here.")
            break

        # put the speaker token in front of the utterance;
        # spaces inside the name become underscores so it stays one token
        text = "agent_" + str(row_dict["author"]).strip().replace(" ", "_") + " " + text

        # remove punctuation
        text = text.translate(strip_punctuation)

        # make lower case and collapse runs of whitespace into single spaces
        text = " ".join(word.lower() for word in text.split())

        if text:
            clean_file.write('%s\n' % text)

## 2. Make n-grams

In [4]:
def sentence_generator(path):
    """Lazily yield each line of the file at ``path`` as a list of tokens.

    Every line is stripped and split on whitespace, so a blank line
    yields an empty list. The file is read one line at a time.
    """
    with open(path, 'r') as handle:
        yield from (raw_line.strip().split() for raw_line in handle)

def _merge_phrases(source_path, target_path):
    """Train a phrase model on ``source_path`` and write the merged text to ``target_path``.

    The file is read twice through ``sentence_generator``: once to collect
    co-occurrence statistics, once to rewrite every sentence with detected
    phrases joined into single tokens. Returns the frozen phrase model.
    """
    phrase_model = Phrases(sentence_generator(source_path),
                           min_count=70,
                           threshold=10,
                           max_vocab_size=2000000,
                           connector_words=ENGLISH_CONNECTOR_WORDS)

    # freeze() returns a new, lighter FrozenPhrases object rather than
    # modifying the model in place, so the result has to be kept
    phrase_model = phrase_model.freeze()

    with open(target_path, 'w') as f:
        for sentence in sentence_generator(source_path):
            f.write(" ".join(phrase_model[sentence]) + "\n")

    return phrase_model


print("Making bigrams...")

# first pass: write sentences with bigrams into a temporary file
gram_model = _merge_phrases("clean_data.txt", "clean_data-temp.txt")

print("...and trigrams...")

# second pass on the processed text joins bigrams with words or other bigrams,
# so the original file is overwritten with sentences containing up to 4-grams
gram_model = _merge_phrases("clean_data-temp.txt", "clean_data.txt")

# delete temporary file once both files have been closed
os.remove("clean_data-temp.txt")
print("...done.")

Making bigrams...
...and trigrams...
...done.
