In [1]:
from utils import *

# Load the message table; the first CSV column becomes the DataFrame index.
messages = pd.read_csv(filepath_or_buffer="data/messages.csv", index_col=0)

# Data preprocessing

## Spell check, segmentation

In [2]:
# Shared SymSpell instance used by correctText() and segmentText() below.
symspell = SymSpell(prefix_length=10)

# DICTIONARY_PATH is provided by `utils` (per the original note, it is sourced
# from an environment variable). The file is read as a frequency dictionary
# with the term in column 0 and its count in column 1.
#
# load_dictionary() reports a missing file by returning False rather than
# raising, which would leave the spell checker without a vocabulary and let
# every later cell run on uncorrected text. Fail loudly instead.
if not symspell.load_dictionary(DICTIONARY_PATH, term_index=0, count_index=1):
    raise FileNotFoundError(
        f"SymSpell dictionary could not be loaded from DICTIONARY_PATH={DICTIONARY_PATH!r}"
    )

def correctText(text):
    """Return the spell-corrected form of ``text``.

    Runs SymSpell's compound lookup (edit distance up to 2) on the whole
    string using the module-level ``symspell`` instance and returns the term
    of the first suggestion.
    """
    suggestions = symspell.lookup_compound(text, max_edit_distance=2)
    topSuggestion = suggestions[0]
    return topSuggestion.term

def segmentText(text):
    """Return ``text`` after SymSpell word segmentation.

    Uses the module-level ``symspell`` instance with ``max_edit_distance=0``
    and returns the ``corrected_string`` field of the segmentation result.
    """
    composition = symspell.word_segmentation(text, max_edit_distance=0)
    return composition.corrected_string

# Both derived columns are computed independently from the raw 'content' column:
# one holds the word-segmented text, the other the spell-corrected text.
messages['segmentedContent'] = messages['content'].apply(segmentText)
messages['correctedContent'] = messages['content'].apply(correctText)

## Automated sentiment analysis

In [3]:
# One analyzer instance is reused for every message.
sia = SentimentIntensityAnalyzer()

# Keep only the 'compound' entry of the polarity scores for each message.
# NOTE(review): scoring runs on the spell-corrected text, not the raw
# 'content' column — confirm that is the intended input.
messages['autoSentiment'] = messages['correctedContent'].apply(
    lambda text: sia.polarity_scores(text)['compound']
)

## Word/character counts

In [4]:
# Size features, both measured on the spell-corrected text.
# wordCount: number of whitespace-separated tokens.
messages['wordCount'] = messages['correctedContent'].apply(
    lambda text: len(text.split())
)
# charCount: total string length, whitespace included.
messages['charCount'] = messages['correctedContent'].apply(len)

## Informativeness score calculation

In [5]:
def informativeness(text):
    """Return the summed surprisal, in bits, of the words in ``text``.

    The text is lower-cased and tokenised with a regex: a token is either two
    runs of word characters joined by one apostrophe, or a plain run of word
    characters. Each token contributes ``-log2(f)``, where ``f`` is the value
    returned by ``word_frequency`` for English using the 'large' wordlist.
    Tokens whose reported frequency is 0 contribute nothing to the total.

    Returns 0 when the text contains no tokens.
    """
    bits = 0
    for token in re.findall(r"\b\w+'\w+|\w+\b", text.lower()):
        freq = word_frequency(token, 'en', wordlist='large', minimum=0.0)
        if freq == 0:
            # log2(0) is undefined; such tokens are counted as zero surprisal.
            continue
        bits -= math.log2(freq)
    return bits

In [6]:
# Score every spell-corrected message with informativeness().
messages['informativeness'] = messages['correctedContent'].apply(informativeness)

In [7]:
# Persist the enriched table. This writes to the same path the notebook
# loaded from in its first cell, so the input file is overwritten in place.
messages.to_csv(path_or_buf="data/messages.csv")