### Importing Messages Into Pandas

In [2]:
import sys
# Add the parent directory to the module search path
# (presumably so project-local modules can be imported — none are imported in the visible cells).
sys.path.append('..')

In [3]:
import pandas as pd

In [94]:
# Load only the columns used below; 'date' becomes the (parsed) index.
df = pd.read_csv(
    '../data/Message_Backup_2019-05-17.csv',
    usecols=['phone_number', 'text', 'is_from_me', 'date'],
    index_col='date',
    parse_dates=True,
    # nrows=100,  # uncomment to read only the first 100 rows
)

In [95]:
# Summarize the loaded frame: index range, per-column non-null counts, dtypes, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 248900 entries, 2016-04-15 21:54:26 to 2019-05-17 11:30:51
Data columns (total 3 columns):
phone_number    246536 non-null object
text            248900 non-null object
is_from_me      248900 non-null int64
dtypes: int64(1), object(2)
memory usage: 7.6+ MB


### Text Preprocessing

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [37]:
from collections import Counter
from itertools import chain

In [82]:
def tokenize_text(text):
    """ splits lowercased text into word tokens, keeping only purely alphabetic ones """
    alphabetic = []
    for token in word_tokenize(text.lower()):
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

In [83]:
def remove_stopwords(tokens, corpus='english'):
    """ filters out tokens according to corpus of stopwords

    tokens: iterable of (hashable) word tokens
    corpus: name of the nltk stopwords corpus to filter against
    returns: list of the tokens not present in the stopword corpus, in original order
    """
    # Build the stopword collection once per call instead of once per token,
    # and use a set so each membership test is O(1) rather than a list scan.
    stop_set = set(stopwords.words(corpus))
    return [t for t in tokens if t not in stop_set]

In [84]:
# Single module-level lemmatizer instance, reused by lemmatize() for every message.
wordnet_lemmatizer = WordNetLemmatizer()

In [85]:
def lemmatize(tokens):
    """ maps each token through the shared wordnet_lemmatizer and returns the results as a list """
    return list(map(wordnet_lemmatizer.lemmatize, tokens))

In [96]:
# split every message's text into a list of sentences
df['sentences'] = df['text'].map(sent_tokenize)

In [97]:
# split every message's text into lowercased, alphabetic word tokens
df['words'] = df['text'].map(tokenize_text)

In [98]:
# drop stopwords from each message's word tokens
df['nostop_words'] = df['words'].map(remove_stopwords)

In [99]:
# lemmatize the stopword-free tokens of each message
df['lemmatized'] = df['nostop_words'].map(lemmatize)

### Yearly Word Frequency Counts

In [131]:
# brief digression into flattening lists

def flatten1(lol):
    """ flattens a list of lists by extending an accumulator list with each sublist """
    flat = []
    for sublist in lol:
        flat.extend(sublist)
    return flat

def flatten2(lol):
    """ lazily yields every item of every sublist, in order (generator) """
    for sublist in lol:
        yield from sublist

def flatten3(lol):
    """ flattens a list of lists by unpacking itertools.chain.from_iterable into a list """
    return [*chain.from_iterable(lol)]

In [39]:
def flatten(lol):
    """ concatenates the sublists of a list of lists (lol) into a single flat list """
    flat = []
    flat.extend(chain.from_iterable(lol))
    return flat

In [132]:
# for each year, we have a collection of messages
# each message has a list of processed tokens
# we want one list that contains all of the tokens for each year
# -> group the rows by the year of the datetime index; the per-year reduction happens in a later cell
yearly = df.groupby(df.index.year)

In [102]:
# collapse each year's per-message token lists into one flat token list
full_tokens = yearly['lemmatized'].agg(flatten)

In [133]:
# build a token-frequency Counter per year, keyed by year
counters = {year: Counter(tokens) for year, tokens in full_tokens.items()}

In [134]:
# report the five most frequent tokens for each year
for year in counters:
    top_five = counters[year].most_common(5)
    print("{} Top 5 Words:\n\t{}".format(year, top_five))

2016 Top 5 Words:
	[('like', 3085), ('get', 2184), ('good', 2056), ('yeah', 1928), ('know', 1572)]
2017 Top 5 Words:
	[('like', 4093), ('lol', 3731), ('get', 2770), ('yeah', 2159), ('good', 1925)]
2018 Top 5 Words:
	[('like', 1737), ('get', 1336), ('u', 1303), ('yeah', 1215), ('good', 1151)]
2019 Top 5 Words:
	[('like', 249), ('love', 223), ('good', 203), ('loved', 185), ('know', 176)]
