# Firsts

If we consider all the messages ever sent to, and received by, _the corpus_, when did each word enter the corpus? Who put it there? What does it say about a person if they put a lot of new words into the corpus, and what even is a word? 

---

Load up a tonne of libraries

In [None]:
import datetime
import json
import os
import pickle
import random
import re
import textwrap
from pathlib import Path
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.spatial import ConvexHull

import message_helpers as mh
from hangouts_loader import load_hangouts


In [None]:
# Notebook-wide matplotlib defaults: a large canvas for every figure, and the
# "Segoe UI Emoji" font for sans-serif text (presumably so emoji in labels
# render).
plt.rcParams.update(
    {
        "figure.figsize": (20, 10),
        "font.sans-serif": ["Segoe UI Emoji"],
    }
)


In [None]:
# File name of the pickled conversation DataFrame. It is a bare file name, so
# it resolves relative to the kernel's current working directory.
pickle_name = "all_convo.pickle"
pickle_path = Path(pickle_name)


Set your name here. This is so that you can take yourself out of some of the graphs. Because these are conversations, naively, they go A B A B and so on, so you'll be roughly 50% of the messages, which makes other trends hard to see.

In [None]:
# The notebook owner's name as it appears in the data. It is compared against
# sender names further down so the owner can be left out of some charts.
MY_NAME = "Ben Doherty"


In [None]:
# Load the whole corpus into memory.
# NOTE: unpickling can execute arbitrary code, so only load a pickle that you
# produced yourself.
all_convo_df = pd.read_pickle(pickle_path)
print(f"done: all_convo_df has {len(all_convo_df)} rows")
all_convo_df.head()


In [None]:
# One-paragraph overview of the corpus: size, number of senders, time span and
# platforms, followed by the per-platform message counts.
first_message = all_convo_df.datetime.min()
last_message = all_convo_df.datetime.max()
# `.days` works for both pandas and stdlib timedeltas; parsing the string form
# breaks on the singular "1 day" that a stdlib timedelta prints.
span_in_days = (last_message - first_message).days
print(
    f"Overall, there are {len(all_convo_df)} messages in this dataset. "
    f"These come from about {len(all_convo_df.sender_name.unique())} people, "
    f"covering a period of {span_in_days} days "
    f"between {first_message:%B, %Y} and {last_message:%B, %Y}. "
    f"Over {len(all_convo_df.platform.unique())} platforms:"
)
all_convo_df.platform.value_counts()


In [None]:
def time_to_seconds(dt):
    """Return the number of whole seconds between midnight and ``dt``.

    Only the time-of-day part of ``dt`` is used; the date and any sub-second
    component are ignored.

    Args:
        dt: An object with a ``time()`` method whose result exposes ``hour``,
            ``minute`` and ``second`` (for example ``datetime.datetime``).

    Returns:
        int: ``hour * 3600 + minute * 60 + second``.
    """
    clock = dt.time()
    return clock.hour * 3600 + clock.minute * 60 + clock.second


# Derive three columns from each message's timestamp: the calendar date, the
# wall-clock time, and the time of day expressed as seconds since midnight.
derived_columns = {
    "date": lambda stamp: stamp.date(),
    "time": lambda stamp: stamp.time(),
    "seconds": time_to_seconds,
}
for column, extract in derived_columns.items():
    all_convo_df[column] = all_convo_df.datetime.apply(extract)
all_convo_df.sample(5)


In [None]:
# Substrings that mark a message as "sexy". Matching is by substring, so a stem
# also catches longer words and multi-word entries match as phrases.
# NOTE(review): "masterbat" only matches that exact spelling; "masturbat..." is
# not matched - confirm this is intended.
SEXY_WORDS = [
    "balls",
    "clit",
    "cock",
    "dick",
    "dildo",
    "fuck me",
    "fuck you",
    # "fuck", # fuck overwhelms everything (at least for me)
    "head",
    "lick",
    "lips",
    "masterbat",
    "nipple",
    "orgasm",
    "play",
    "pussy",
    "spank",
    "suck",
    "toys",
    "vibrator",
    "wand",
    "wank",
]


def is_sexy(content):
    """Label a message as ``"sexy"`` or ``"not"``.

    A message is ``"sexy"`` when any entry of ``SEXY_WORDS`` occurs in it as a
    substring. The comparison ignores case, so ``"Fuck me"`` and ``"PLAY"``
    match just like their lower-case forms.

    Args:
        content: The message text. Anything that is not a ``str`` (for example
            a missing value such as ``None`` or ``NaN``) is labelled ``"not"``.

    Returns:
        str: ``"sexy"`` or ``"not"``.
    """
    # Messages without text cannot contain any of the words.
    if not isinstance(content, str):
        return "not"
    lowered = content.lower()
    if any(word.lower() in lowered for word in SEXY_WORDS):
        return "sexy"
    return "not"


In [None]:
# Tag every message, then scatter time of day against date: tagged messages are
# drawn as prominent red crosses, everything else as a faint blue cloud.
all_convo_df["sexy"] = all_convo_df.content.apply(is_sexy)

marker_styles = {
    "sexy": dict(s=30, alpha=0.5, c="red", marker="x"),
    "not": dict(s=10, alpha=0.1, c="blue", marker="."),
}

fig, ax = plt.subplots()
for name, data in all_convo_df.groupby("sexy"):
    # Any label other than "sexy" is drawn with the background style.
    style = marker_styles.get(name, marker_styles["not"])
    ax.scatter(data.date, data.seconds, **style)

# The y axis runs over one full day, measured in seconds.
seconds_in_a_day = 24 * 60 * 60
ax.yaxis.set_major_locator(plt.MaxNLocator(30))
ax.set_ylim([0, seconds_in_a_day])

ax.xaxis.set_major_locator(plt.MaxNLocator(30))
fig.autofmt_xdate()

fig.suptitle("When do we talk sexy?")
word_list = ", ".join(SEXY_WORDS)
ax.set_title("\n".join(textwrap.wrap(f"Occurance of {word_list}", 100)))
ax.set_ylabel("seconds after midnight GMT")

plt.show()


In [None]:
# How many of the most frequent words to show in the bar charts.
top = 50

# Join every flagged message into one string, replace whatever
# `mh.PUNCTUATION_REGEX` matches (presumably punctuation) with a space, then
# count the lower-cased words that are not in `mh.STOP_WORDS`.
sexy_content = all_convo_df.loc[all_convo_df.sexy == "sexy", "content"]
pool = " ".join(sexy_content.to_list())
clean = re.sub(mh.PUNCTUATION_REGEX, " ", pool, flags=re.VERBOSE)
lowered = (w.lower() for w in clean.split())
stopped = [w for w in lowered if w not in mh.STOP_WORDS]
vc = pd.Series(stopped).value_counts()
vc.iloc[:top].plot.barh()
plt.title(f'Top {top} most common words in "sexy" messages')


In [None]:
# Word frequencies over every message in the corpus.
# Rows without content are dropped first: otherwise str() turns each missing
# value into the literal word "nan", which then shows up in the counts.
pool = " ".join(str(x) for x in all_convo_df.content.dropna())
# Replace whatever `mh.PUNCTUATION_REGEX` matches with a single space.
clean = re.sub(mh.PUNCTUATION_REGEX, " ", pool, flags=re.VERBOSE)
lowered = (w.lower() for w in clean.split())
stopped = [w for w in lowered if w not in mh.STOP_WORDS]
vc = pd.Series(stopped).value_counts()
vc[:top].plot.barh()
plt.title(f"Top {top} most common words in all messages")


In [None]:
# For each sender with more than 1000 messages, compute the ratio of messages
# tagged "sexy" to messages tagged "not", then chart the senders in ascending
# order of that ratio.
ratios = {}
for name, df in all_convo_df.groupby("sender_name"):
    if df.shape[0] > 1000:
        vc = df.sexy.value_counts()
        # A sender with no "sexy" messages has a ratio of 0, not 1 / n.
        ratios[name] = vc.get("sexy", 0) / vc["not"]
highly_sexy = pd.Series(ratios).sort_values()
highly_sexy.plot.barh()
plt.title('Ratio of "sexy" to "not" messages (senders with more than 1000 messages)')


In [None]:
# Keep only the messages whose sender appears in `highly_sexy`. `isin` does the
# membership test in one vectorised pass instead of rebuilding a list of names
# for every row.
highly_sexy_df = all_convo_df[all_convo_df.sender_name.isin(highly_sexy.index)]


In [None]:
# For every sender, count how often each entry of SEXY_WORDS occurs as a
# substring of all of that sender's messages joined together. The result is a
# list of dicts: {"name": sender, word: count, ...}.
occurances = []
for name, df in highly_sexy_df.groupby("sender_name"):
    # Lower-case the text so capitalised occurrences ("Play", "FUCK ME") are
    # counted too; the SEXY_WORDS entries are all lower case.
    pool = " ".join(str(x) for x in df.content).lower()
    d = {"name": name}
    d.update({w: pool.count(w) for w in SEXY_WORDS})
    occurances.append(d)


In [None]:
# Turn the per-sender count records into a table: one row per sender (indexed
# by name), one column per search term.
sdf = pd.DataFrame(occurances).set_index("name")
sdf.head()


In [None]:
# Divide each sender's counts by that sender's total, so rows become shares of
# that person's own usage and people with very different message volumes can be
# compared. A sender whose total is zero ends up with NaN in every column.
row_totals = sdf.sum(axis="columns")
sdf_normed = sdf.div(row_totals, axis="index")
sdf_normed.plot.barh(edgecolor="none")
plt.title("Occurances of these words (normalised per person)")


In [None]:
# Raw counts for everyone except the notebook owner, whose volume would
# otherwise dominate the chart.
everyone_else = sdf.index != MY_NAME
sdf.loc[everyone_else].plot.barh(edgecolor="none")
plt.title("Occurances of these words (not normalised per person)")


In [None]:
# Hand-picked senders to compare side by side. Every name must be present in
# `sdf_normed`'s index, otherwise the `.loc` lookup raises a KeyError.
p = [MY_NAME] + [
    "Irina Belova",
    "Ivana Kuzmanovska",
    "Lucy Rimmer",
    "Maddie Johanson",
    "Meike Wijers",
]
selected = sdf_normed.loc[p]
selected.plot.barh(edgecolor="none", width=0.7)
plt.title(f"Occurances of these words in messages from \n{p} (normalised)")


In [None]:
# Total count of each search term, summed over all senders in `sdf`.
word_totals = sdf.sum(axis="index")
word_totals.plot.barh()
plt.title("General occurance of these words")
