# Firsts

If we consider all the messages ever sent to, and received by, _the corpus_, when did each word enter the corpus? Who put it there? What does it say about a person if they put a lot of new words into the corpus, and what even is a word?

---

Load up a tonne of libraries

In [None]:
import datetime
import json
import os
import pickle
import random
import re
import textwrap
from pathlib import Path
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.spatial import ConvexHull

import message_helpers as mh
from hangouts_loader import load_hangouts


In [None]:
# Notebook-wide plot defaults: large figures, and a sans-serif font that can
# render emoji in labels.
plt.rcParams.update(
    {
        "figure.figsize": (20, 10),
        "font.sans-serif": ["Segoe UI Emoji"],
    }
)


In [None]:
# Location of the pickled conversation dataframe, relative to the working
# directory; pickle_name is the bare file name of that path.
pickle_path = Path("all_convo.pickle")
pickle_name = pickle_path.name


Set your name here. This is so that you can take yourself out of some of the graphs. Because these are conversations, naively, they go A B A B and so on, so you'll be roughly 50% of the messages, which makes other trends hard to see.

In [None]:
# The notebook owner's name; set this to your own name so you can be taken
# out of some of the graphs.
MY_NAME = "Ben Doherty"


In [None]:
def first_word(x):
    """Return the first whitespace-separated token of ``x``.

    Best-effort: if ``x`` has no ``split`` method (e.g. ``None`` or a float)
    or contains no tokens (empty or whitespace-only string), ``x`` itself is
    returned unchanged.
    """
    try:
        return x.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: x is not string-like; IndexError: nothing to split.
        return x


In [None]:
# Load the combined conversation dataframe from the pickle at pickle_path.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that you generated yourself.
all_convo_df = pd.read_pickle(pickle_path)
print(f"done: all_convo_df has {all_convo_df.shape[0]} rows")
# Last expression in the cell: shows the first few rows as a sanity check.
all_convo_df.head()


In [None]:
# One-sentence summary of the dataset: message count, number of distinct
# senders, time span covered, and number of platforms.
corpus_start = all_convo_df.datetime.min()
corpus_end = all_convo_df.datetime.max()
# Read the whole-day count from the timedelta itself rather than parsing its
# string form, whose layout differs between timedelta implementations.
span_in_days = (corpus_end - corpus_start).days
print(
    f"Overall, there are {len(all_convo_df)} messages in this dataset. "
    f"These come from about {len(all_convo_df.sender_name.unique())} people, "
    f"covering a period of {span_in_days} days "
    f"between {corpus_start:%B, %Y} and {corpus_end:%B, %Y}. "
    f"Over {len(all_convo_df.platform.unique())} platforms:"
)
# Last expression in the cell: message count per platform.
all_convo_df.platform.value_counts()


## Love, want, hate

What do we love, hate, want, and want to do? Let's look into the text content of the messages a bit.

In [None]:
# Lower-case every message once (non-string content is stringified first) so
# the phrase checks below are case-insensitive.
_lowered_content = [str(message).lower() for message in all_convo_df.content]


def _messages_containing(phrase):
    """Return the rows of all_convo_df whose lower-cased content contains phrase."""
    return all_convo_df[[phrase in text for text in _lowered_content]]


love_df = _messages_containing("i love")
want_df = _messages_containing("i want")
hate_df = _messages_containing("i hate")
# "i want you" is a subset of the "i want" messages.
want_you_df = _messages_containing("i want you")

# Report how many messages matched each phrase.
print("love", len(love_df))
print("want", len(want_df))
print("hate", len(hate_df))
print("want you", len(want_you_df))


In [None]:
# For each "i love" message keep the text that follows the first "i love"
# (up to any second occurrence), lower-cased, with "." and "!" removed and
# surrounding whitespace trimmed.
things_to_love = [
    message.lower().split("i love")[1].replace(".", "").replace("!", "").strip()
    for message in love_df.content
]
# Horizontal bar chart of the 50 most frequent completions.
pd.Series(things_to_love).value_counts().head(50).plot.barh()


In [None]:
# For each "i want" message keep the text that follows the first "i want"
# (up to any second occurrence), lower-cased, with "." and "!" removed.
# Unlike things_to_love, surrounding whitespace is not trimmed here.
things_to_want = [
    message.lower().split("i want")[1].replace(".", "").replace("!", "")
    for message in want_df.content
]
# Horizontal bar chart of the 50 most frequent completions.
pd.Series(things_to_want).value_counts().head(50).plot.barh()


In [None]:
def first_word(x):
    """Return the first whitespace-separated token of ``x``.

    Best-effort: if ``x`` has no ``split`` method (e.g. ``None`` or a float)
    or contains no tokens (empty or whitespace-only string), ``x`` itself is
    returned unchanged.
    """
    try:
        return x.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: x is not string-like; IndexError: nothing to split.
        return x


# Count how often each word appears directly after "i love".
love_vc = pd.Series(
    [first_word(completion).strip() for completion in things_to_love]
).value_counts()
# Plot the 50 most frequent of those words.
love_vc.head(50).plot.barh()
plt.title("what word comes straight after 'i love'?\n(All messages, in and out)")


In [None]:
# Count how often each word appears directly after "i want".
want_vc = pd.Series(
    [first_word(x).strip().replace(".", "").replace(" ", "") for x in things_to_want]
).value_counts()
# Draw the 50 most frequent words; without this the title below would be
# applied to an empty figure.
want_vc[:50].plot.barh()
plt.title("what word comes straight after 'i want'?\n(All messages, in and out)")

In [None]:
# For each "i want you" message keep the text that follows the first
# "i want you" (up to any second occurrence), lower-cased, with "." and "!"
# removed.
things_to_want_you = [
    message.lower().split("i want you")[1].replace(".", "").replace("!", "")
    for message in want_you_df.content
]
# Horizontal bar chart of the 50 most frequent completions.
pd.Series(things_to_want_you).value_counts().head(50).plot.barh()


In [None]:
# Number of "i want you" messages per sender.
want_you_df["sender_name"].value_counts()


In [None]:
# Show the text, sender and timestamp of the "i want you" messages sent by
# one particular person.
want_you_df[want_you_df["sender_name"] == "Charles Ogilvie"][
    [
        "content",
        "sender_name",
        "datetime",
    ]
]


In [None]:
# Rebuild the list of "i want ..." completions: the text that follows the
# first "i want" (up to any second occurrence), lower-cased, with "." and "!"
# removed.
things_to_want = [
    message.lower().split("i want")[1].replace(".", "").replace("!", "")
    for message in want_df.content
]
# Horizontal bar chart of the 50 most frequent completions.
pd.Series(things_to_want).value_counts().head(50).plot.barh()


In [None]:
# For each "i hate" message keep the text that follows the first "i hate"
# (up to any second occurrence), lower-cased, with "." and "!" removed, cut to
# at most 100 characters and trimmed. Messages containing the exact
# (case-sensitive) phrase "sending out impersonal" are left out --
# presumably a repeated boilerplate message; confirm against the data.
things_to_hate = [
    message.lower().split("i hate")[1].replace(".", "").replace("!", "")[:100].strip()
    for message in hate_df.content
    if "sending out impersonal" not in message
]
# Horizontal bar chart of the 50 most frequent completions.
pd.Series(things_to_hate).value_counts().head(50).plot.barh()
plt.title("I hate ...\nPulled from all messages, in and out")


In [None]:
# Count how often each word appears directly after "i love".
love_vc = pd.Series(
    [first_word(completion).strip() for completion in things_to_love]
).value_counts()
# Plot the 50 most frequent of those words.
love_vc.head(50).plot.barh()
plt.title("what word comes straight after 'i love'?\n(All messages, in and out)")


In [None]:
# Count how often each word appears directly after "i want"; any remaining
# "." or space characters are removed from the word first.
want_vc = pd.Series(
    [
        first_word(completion).strip().replace(".", "").replace(" ", "")
        for completion in things_to_want
    ]
).value_counts()
# Plot the 50 most frequent of those words.
want_vc.head(50).plot.barh()
plt.title("what word comes straight after 'i want'?\n(All messages, in and out)")


In [None]:
# Plot only the words that follow "i love" more than three times.
love_vc[love_vc > 3].plot(kind="barh")


Let's look into that list of things we love in a lot more detail. Not the common stuff; this is the unusual:

In [None]:
# Write every "i love ..." completion that occurs fewer than three times to
# love.txt as an alphabetically sorted bullet list, one "- item" per line.
lvc = pd.Series(things_to_love).value_counts()
with open("love.txt", "w", encoding="utf-8") as f:
    # Prefix each entry individually so the first line gets its bullet too;
    # an empty list still produces an empty file.
    f.write("\n".join(f"- {item}" for item in sorted(lvc[lvc < 3].index.to_list())))
