# Firsts

If we consider all the messages ever sent to, and received by, _the corpus_, when did each word enter the corpus? Who put it there? What does it say about a person if they put a lot of new words into the corpus, and what even is a word? 

---

Load up a tonne of libraries

In [None]:
import datetime
import json
import os
import pickle
import random
import re
import textwrap
from pathlib import Path
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.spatial import ConvexHull

import message_helpers as mh
from hangouts_loader import load_hangouts


In [None]:
# Notebook-wide plot defaults: large figures, and a sans-serif font named
# "Segoe UI Emoji" (presumably so emoji in labels render - confirm on your OS).
plt.rcParams.update(
    {
        "figure.figsize": (20, 10),
        "font.sans-serif": ["Segoe UI Emoji"],
    }
)


In [None]:
# The corpus is read from a single pickle file in the working directory.
pickle_path = Path("all_convo.pickle")
pickle_name = pickle_path.name


Set your name here. This is so that you can take yourself out of some of the graphs. Because these are conversations, naively, they go A B A B and so on, so you'll be roughly 50% of the messages, which makes other trends hard to see.

In [None]:
# Name of the notebook's owner. Later cells compare it against the sender
# names in the data so that this person can be left out of a chart.
MY_NAME = "Ben Doherty"


In [None]:
# Load the whole corpus from the pickle; it is not built in the cells shown here.
# NOTE(review): unpickling runs arbitrary code, so only load files you created.
all_convo_df = pd.read_pickle(pickle_path)
print(f"done: all_convo_df has {len(all_convo_df)} rows")
# Last expression of the cell: preview the first few rows.
all_convo_df.head()


In [None]:
# Headline summary of the corpus: message count, distinct senders, the time
# span covered, and how many platforms contributed.
corpus_start = all_convo_df.datetime.min()
corpus_end = all_convo_df.datetime.max()
print(
    # The original text had a stray comma after the message count.
    f"Overall, there are {len(all_convo_df)} messages in this dataset. "
    f"These come from about {len(all_convo_df.sender_name.unique())} people, "
    # Read the day count straight off the timedelta instead of splitting its
    # string representation on " days".
    f"covering a period of {(corpus_end - corpus_start).days} days "
    f"between {corpus_start:%B, %Y} and {corpus_end:%B, %Y}. "
    f"Over {len(all_convo_df.platform.unique())} platforms:"
)
# Last expression of the cell: per-platform message counts, shown under the text.
all_convo_df.platform.value_counts()


In [None]:
# Build (or reload from cache) the table of "firsts": one row per distinct
# word, recording the message, time and sender of its earliest use.
# Delete firsts.pickle to force a rebuild after the corpus changes.
if os.path.isfile("firsts.pickle"):
    firsts_df = pd.read_pickle("firsts.pickle")
else:
    # A set gives O(1) membership tests; the previous list made the whole
    # loop quadratic in the size of the vocabulary.
    seen_words = set()
    firsts_dicts = []
    # Compiled once, outside the per-message loop.
    url_pattern = re.compile(
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    )
    non_word_pattern = re.compile(r"[^\w]")

    # "First" only means something if messages are visited oldest-first, so
    # order by timestamp explicitly. The stable sort keeps the existing row
    # order for messages that share a timestamp.
    chronological = all_convo_df.sort_values("datetime", kind="stable")

    for _row_label, row in chronological.iterrows():
        text = str(row.content)
        # Drop apostrophes first so "don't" becomes "dont" rather than two
        # tokens, then blank out URLs and every remaining non-word character.
        no_urls = url_pattern.sub(" ", text.replace("'", ""))
        tokens = non_word_pattern.sub(" ", no_urls).split()
        # dict.fromkeys de-duplicates while keeping reading order, so the
        # output no longer depends on set iteration order.
        for w in dict.fromkeys(token.lower() for token in tokens):
            if w.isnumeric() or w in seen_words:
                continue
            seen_words.add(w)
            firsts_dicts.append(
                dict(
                    word=w,
                    message=text,
                    datetime=row.datetime,
                    by=row.sender_name,
                    # The key is misspelled, but later cells read the column
                    # under this exact name, so it has to stay.
                    intials=row.initials,
                )
            )

    # Naming the columns keeps set_index working even when nothing was found.
    firsts_df = pd.DataFrame(
        firsts_dicts, columns=["word", "message", "datetime", "by", "intials"]
    ).set_index("datetime")
    firsts_df.to_pickle("firsts.pickle")

firsts_df


In [None]:
# value_counts orders people from most to fewest first-use words, so tail(5)
# shows the five smallest contributors; slice with [:30] instead for the top 30.
firsts_df["by"].value_counts().tail(5)


In [None]:
# Look up the row that records the first use of the word "my".
firsts_df.loc[firsts_df["word"] == "my"]

In [None]:
# Every word first introduced by one particular person.
firsts_df.loc[firsts_df["by"] == "Ayelen Moure"]

## Most unique message

Which message has the most first-time uses in it? In my case it's 

> At the cost of punching myself in the chest so hard I nearly broke a rib/stopped my heart, I managed to do a couple of good superman-carves into flat 180. They look rad, if I don't crash (about ⅛ of the time)

which if I capitalise the firsts, is:

> At the COST of PUNCHING MYSELF in the CHEST so HARD I NEARLY BROKE a RIB STOPPED MY HEART  I MANAGED to do a COUPLE of good SUPERMAN CARVES INTO FLAT 180  They LOOK RAD  IF I don t CRASH  ABOUT ⅛ of the TIME 

Pretty amazing that nobody said "my" until 2018 

In [None]:
# fc = firsts_df.message.value_counts()
# fc[[(len(x)<500) for x in fc.index]]

In [None]:
# most_unique_message = "At the cost of punching myself in the chest so hard I nearly broke a rib/stopped my heart, I managed to do a couple of good superman-carves into flat 180. They look rad, if I don't crash (about ⅛ of the time)"
# # um = most_unique_message.split(" /-")
# um = re.split("[ -/]", most_unique_message, flags=re.IGNORECASE)
# for i, word in enumerate(um):
#     try:
#         m = firsts_df[firsts_df.word == word].message[0]
#         # print(word, m)
#         if m == most_unique_message:
#             um[i] = word.upper()
#     except:
#         pass
# " ".join(um)

In [None]:
# ts= pd.Timestamp("2018-01-27 12:42:56.523")
# start = ts-pd.Timedelta(minutes=1)
# end = ts+pd.Timedelta(minutes=25)
# all_convo_df[(all_convo_df.datetime >start) & (all_convo_df.datetime < end)][["sender_name", "content","source_convo"]]

In [None]:
# Count new words per week, plot the series, and label the busiest week.
grp = firsts_df.groupby(pd.Grouper(freq="w"))
f = grp.count().word
peak_period = f.idxmax()
peak_count = f.max()
f.plot()
plt.annotate(
    f"Busiest period ({peak_period})\n{peak_count} new words",
    xy=(peak_period, peak_count),
)
# Keep the rows of that week so the next cell can list its words.
busiest = grp.get_group(peak_period)
busiest

In [None]:
# All the words that first appeared during the busiest week, as one string.
", ".join(busiest["word"].tolist())

In [None]:
# For every month, keep the top 20% of contributors (by number of new words)
# and build a months-by-people table of their counts.
people_per_period = []
index = []
grp = firsts_df.groupby(pd.Grouper(freq="m"))
for period, period_df in grp:
    index.append(period)
    vc = period_df.by.value_counts()
    # value_counts is sorted descending, so the head is the top fifth.
    # With fewer than five contributors this rounds down to nobody and the
    # month becomes an all-NaN row.
    top_fifth = int(len(vc) * 0.2)
    people_per_period.append(vc.iloc[:top_fifth].to_dict())

new_pp_df = pd.DataFrame(people_per_period, index=index)
# Use the MY_NAME setting rather than a hard-coded name, so the notebook
# works for whoever runs it.
new_pp_df = new_pp_df.drop(columns=[MY_NAME], errors="ignore")
new_pp_df.head(3)


In [None]:
# Stacked area chart of the table above, clipped to a fixed date window.
new_pp_df.plot(kind="area", stacked=True)
plt.xlim("2013-01-01", "2021-07-01")
plt.legend(ncol=5)


In [None]:
# For every two-month bucket, keep the five people who introduced the most
# new words and draw the result as stacked bars.
people_per_period = []
index = []
grp = firsts_df.groupby(pd.Grouper(freq="2m"))
for period, period_df in grp:
    index.append(period)
    vc = period_df.by.value_counts()
    people_per_period.append(vc.head(5).to_dict())

new_pp_df = pd.DataFrame(people_per_period, index=index)
# Use the MY_NAME setting rather than a hard-coded name.
new_pp_df = new_pp_df.drop(columns=[MY_NAME], errors="ignore")
ax = new_pp_df.plot.bar(stacked=True)

plt.legend(ncol=5)
for container in ax.containers:
    # Only label segments taller than 25 so small slivers stay uncluttered;
    # missing segments (NaN height) fail the comparison and get no label.
    labels = [f"{h:.0f}" if (h := v.get_height()) > 25 else "" for v in container]

    ax.bar_label(container, labels=labels, label_type="center")


In [None]:
# Build a long-format ranking table: for each period, every contributor's
# rank by number of new words introduced (1 = most).
n_top_ranked = 20
freq = "4m"

grp = firsts_df.groupby(pd.Grouper(freq=freq))
d = []
for period, period_df in grp:
    # Counts per (name, initials) pair, sorted from most to fewest, so the
    # position in this series is the rank. Ties get consecutive ranks.
    vc = period_df.value_counts(subset=["by", "intials"])
    # Series.iteritems() was removed in pandas 2.0; .items() is the same
    # iterator and exists in older versions too.
    for rank, ((name, initials), _count) in enumerate(vc.items(), start=1):
        d.append(
            {
                "period": period,
                "name": name,
                "rank": rank,
                "intials": initials,
            }
        )

# Naming the columns keeps the later cells working on an empty result.
df = pd.DataFrame(d, columns=["period", "name", "rank", "intials"])
df.head()

In [None]:
# Rank chart: one line per person showing their rank in each period.
# People who are in the top n_top_ranked of the latest period are drawn bold
# and named on the right-hand axis.
latest_period = df["period"].max()
top_sources = df[df["period"] == latest_period].nsmallest(n_top_ranked, "rank")
top_names = top_sources["name"].to_list()

fig, ax = plt.subplots(subplot_kw={"ylim": (0.5, 0.5 + n_top_ranked)})

ax.xaxis.set_major_locator(MultipleLocator(365))
ax.yaxis.set_major_locator(MultipleLocator(1))

# Secondary axis: tick at each top-ranked person's final rank, labelled by name.
yax2 = ax.secondary_yaxis("right")
yax2.yaxis.set_major_locator(FixedLocator(top_sources["rank"].to_list()))
yax2.yaxis.set_major_formatter(FixedFormatter(top_names))

for name, name_df in df.groupby("name"):
    if name_df.empty:
        continue
    # The person's initials, rendered as a mathtext marker.
    marker_initials = f"${name_df.intials.iloc[0]}$"
    if name in top_names:
        # Random dash pattern to help tell the bold lines apart.
        ls, markersize, lw = random.sample(["-", "--", "-.", ":"], 1)[0], 15, 5
    else:
        ls, markersize, lw = "-", 8, 1
    ax.plot(
        "period",
        "rank",
        data=name_df,
        marker=marker_initials,
        markersize=markersize,
        mfc="w",
        lw=lw,
        ls=ls,
        solid_capstyle="round",
    )

# Rank 1 belongs at the top of the chart.
ax.invert_yaxis()
ax.set(
    xlabel="Period",
    ylabel="Rank",
    title="Ranking of number of new words introduced into the corpus",
)
ax.grid(axis="x")
plt.tight_layout()


In [None]:
# One line per person (excluding MY_NAME) with more than 300 first-use words:
# how many new words they introduced in each quarter.
# The loop variable is deliberately not called `df`: that name holds the
# ranking table an earlier cell plots, and overwriting it here would break
# that cell when it is re-run.
for name, person_df in firsts_df.groupby("by"):
    if name != MY_NAME and person_df.shape[0] > 300:
        person_df.groupby(pd.Grouper(freq="q")).by.count().plot(label=name)
plt.legend()
plt.title(
    "When do people introduce new words?\n"
    "(filtered by over 300 new words total, "
    "aggregated over quarterly periods)"
)
plt.xlim(["2013-01-01", "2021-07-01"])
plt.ylim([0, 400])


But what if there are a lot of words like "heyyyyyy"? Can we take them out?

In this sample set, it leaves 1659 of 2946

In [None]:
# Split one person's first-use words into "real" and "silly" (stretched,
# e.g. "heyyyyyy") using a repetition score: len(word) / len(set(word)).
# More repetitive words have bigger scores, so raise `thresh` to let more of
# them count as real.
thresh = 1.6
# person = "Meike Wijers"
# person = "Ivana Kuzmanovska"
person = "Byron Sullivan"
person_words = firsts_df[firsts_df.by == person].word.to_list()
# The two tests are exact complements, so every word lands in one list.
# Before, strict `<` and `>` dropped words sitting exactly on the threshold:
# "tomorrow" has 8 letters and 5 distinct ones, and 5 * 1.6 == 8.0.
real_words = [w for w in person_words if len(w) <= len(set(w)) * thresh]
silly_words = [w for w in person_words if len(w) > len(set(w)) * thresh]
print(
    '\n"real" words:',
    ", ".join(real_words),
    '\n\n"silly" words:',
    ", ".join(silly_words),
)


In [None]:
# Tag every first-use word with whether it has a truthy entry in the JSON
# word list ("yes") or not ("no").
with open("words_dictionary.json") as dictionary_file:
    the_dictionary = json.load(dictionary_file)
firsts_df["in_the_dictionary"] = [
    "yes" if the_dictionary.get(word) else "no" for word in firsts_df.word
]
# Spot-check three random rows.
firsts_df.sample(3)


In [None]:
# Per person: how many of their first-use words are in the dictionary, how
# many are not, the percentage that are, and how many messages they sent.
pc_real = {}
real_words_dicts = []
# Count messages per sender once, instead of rescanning the corpus for
# every person inside the loop.
message_counts = all_convo_df.sender_name.value_counts()
for name, person_df in firsts_df.groupby("by"):
    # Despite the name, this is the number of first-use words for the person.
    num_messages = person_df.shape[0]
    vc = person_df.in_the_dictionary.value_counts()
    # .get with a default: someone whose words are all in (or all out of)
    # the dictionary has only one of the two labels, and vc["yes"] / vc["no"]
    # would raise KeyError for them.
    num_in_dict = vc.get("yes", 0)
    num_not_in_dict = vc.get("no", 0)
    pc = (num_in_dict / num_messages) * 100
    pc_real[name] = pc
    amc = message_counts.get(name, 0)
    real_words_dicts.append(
        {
            "name": name,
            "total": num_messages,
            "real_count": num_in_dict,
            "unreal_count": num_not_in_dict,
            "pc": pc,
            "all_message_count": amc,
        }
    )
real_words_df = pd.DataFrame(real_words_dicts)
real = pd.Series(pc_real).sort_values(ascending=False)
real.plot.barh()
# real.head(50)

In [None]:
# Among people with more than 200 first-use words, show the 15 with the
# lowest share of dictionary words.
(
    real_words_df.loc[real_words_df["total"] > 200]
    .sort_values("pc", ascending=False)
    .tail(15)
)

In [None]:
# 3D scatter, one point per person:
# x = dictionary words, y = non-dictionary words, z = messages sent.
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure()
ax = plt.axes(projection="3d")

ax.scatter(
    real_words_df["real_count"],
    real_words_df["unreal_count"],
    real_words_df["all_message_count"],
)
# Fixed axis limits chosen by hand for this data set.
ax.set_xlim([0, 1100])
ax.set_ylim([0, 2500])
ax.set_zlim(0, 10000)

In [None]:

# Same 3D scatter as above, but drawn one person at a time so the most
# prolific people can be labelled by name.
# real_words_df.plot.scatter("real_count", "unreal_count")
from mpl_toolkits import mplot3d

fig = plt.figure()
ax = plt.axes(projection="3d")

for _row_label, row in real_words_df.iterrows():
    ax.scatter(
        xs=row.real_count,
        ys=row.unreal_count,
        zs=row.all_message_count,
        label=row["name"],
    )
    if row.real_count > 100 or row.unreal_count > 250:
        # annotate() takes 2D coordinates, which on 3D axes do not land on
        # the point; text() takes x, y, z and follows the 3D projection.
        # row["name"] is used because row.name is the Series' own label.
        ax.text(row.real_count, row.unreal_count, row.all_message_count, row["name"])
ax.set_xlim([0, 1100])
ax.set_ylim([0, 2500])
ax.set_zlim(0, 10000)
ax.set_xlabel("count of words found in the dictionary")
ax.set_ylabel("count of words not found in the dictionary")
ax.set_zlabel("count of all messages sent by this person")
plt.show()

In [None]:
# One person's first-use words that were not found in the dictionary.
", ".join(
    firsts_df.loc[
        (firsts_df["by"] == "Charles Ogilvie")
        & (firsts_df["in_the_dictionary"] == "no"),
        "word",
    ]
)


In [None]:
# Table with one column per month listing the words first used in it,
# written out to words_and_months.csv.
d = {}
for period, period_df in firsts_df.groupby(pd.Grouper(freq="m")):
    d[period] = [x for x in period_df.word.to_list() if not x.isnumeric()]
# Months have different numbers of words; from_dict pads the short ones with
# missing values, and the transpose turns months into columns.
words_in_period = pd.DataFrame.from_dict(d, orient="index").T
# Blank out the padding so the table prints and exports cleanly. fillna
# covers both None and NaN, and replaces the deprecated applymap call.
words_and_months = words_in_period.fillna("")
words_and_months.to_csv("words_and_months.csv")
words_and_months
# The idea here was to make a printed bar chart where the words were the bars, 
# but at A0, each line is about 0.3mm high, so the word is about half that.