# Firsts

If we consider all the messages ever sent to, and received by, _the corpus_, when did each word enter the corpus? Who put it there? What does it say about a person if they put a lot of new words into the corpus, and what even is a word? 

---

Load up a tonne of libraries

In [None]:
import datetime
import json
import os
import pickle
import random
import re
import textwrap
from pathlib import Path
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.spatial import ConvexHull

import message_helpers as mh
from hangouts_loader import load_hangouts


In [None]:
# Notebook-wide matplotlib defaults: 20x10 figures and "Segoe UI Emoji" as
# the sans-serif font family.
plt.rcParams.update(
    {
        "figure.figsize": (20, 10),
        "font.sans-serif": ["Segoe UI Emoji"],
    }
)


In [None]:
# Location of the pickled DataFrame that is read into all_convo_df below;
# pickle_name is the bare file name of that path.
pickle_path = Path("all_convo.pickle")
pickle_name = pickle_path.name


Set your name here. This is so that you can take yourself out of some of the graphs. Because these are conversations, naively, they go A B A B and so on, so you'll be roughly 50% of the messages, which makes other trends hard to see.

In [None]:
# The sender_name value that identifies the notebook owner; later cells
# compare sender_name against it to select or exclude your own messages.
MY_NAME = "Ben Doherty"


In [None]:
# Load the combined message table from disk.
# NOTE(review): unpickling can execute arbitrary code, so only point
# pickle_path at a file you generated yourself.
all_convo_df = pd.read_pickle(pickle_path)
print(f"done: all_convo_df has {len(all_convo_df)} rows")
all_convo_df.head()


In [None]:
# Headline numbers for the dataset: message count, distinct senders, the
# span of time covered and the number of platforms.
earliest = all_convo_df.datetime.min()
latest = all_convo_df.datetime.max()
print(
    f"Overall, there are {len(all_convo_df)} messages in this dataset. "
    f"These come from about {len(all_convo_df.sender_name.unique())} people, "
    # .days gives the whole-day count directly rather than parsing it out
    # of the timedelta's string form.
    f"covering a period of {(latest - earliest).days} days "
    f"between {earliest:%B, %Y} and {latest:%B, %Y}. "
    f"Over {len(all_convo_df.platform.unique())} platforms:"
)
# Message count per platform, shown as the cell output.
all_convo_df.platform.value_counts()


# Plotting time that might show sleep

This is a bit nasty, I don't really like it, but it seems to work. There _must_ be a better way.

Convert the time component of the datetime to just-date and just-time (but in seconds, as a number) and then plot that.

The y axis is nasty because it doesn't show as time, just as numbers. I assume that they're seconds after midnight GMT.

In [None]:
def time_to_seconds(dt):
    """Return the time-of-day part of ``dt`` as whole seconds since midnight.

    ``dt`` must provide a ``.time()`` method (e.g. ``datetime.datetime``);
    sub-second precision is discarded.
    """
    clock = dt.time()
    return clock.hour * 3600 + clock.minute * 60 + clock.second


# Derive three helper columns from each message's datetime: the calendar
# date, the wall-clock time, and that time as seconds after midnight.
all_convo_df["date"] = all_convo_df.datetime.map(lambda stamp: stamp.date())
all_convo_df["time"] = all_convo_df.datetime.map(lambda stamp: stamp.time())
all_convo_df["seconds"] = all_convo_df.datetime.map(time_to_seconds)
# Show a random handful of rows to eyeball the new columns.
all_convo_df.sample(5)


In [None]:
# Scatter every message by calendar date (x) against seconds after midnight
# (y), one colour per platform.
platform_colours = {"Facebook": "blue", "Hangouts": "green", "Instagram": "orange"}
fig = plt.figure()
ax = fig.add_subplot(111)
for name, df in all_convo_df.groupby("platform"):
    ax.scatter(
        df.date,
        df.seconds,
        s=20,
        alpha=0.04,
        label=name,
        # .get() so a platform missing from the mapping falls back to
        # matplotlib's default colour instead of raising KeyError.
        c=platform_colours.get(name),
    )
# NOTE(review): the y values are plain second counts, so yaxis_date() reads
# them as matplotlib date numbers; a tick formatter that renders HH:MM would
# label this axis properly.
ax.yaxis_date()
fig.autofmt_xdate()
plt.title("All messages in and out, coloured by platform")
plt.ylabel("seconds after midnight GMT")
leg = plt.legend()
# The points are nearly transparent, so make the legend markers opaque.
# Legend.legendHandles was renamed legend_handles in matplotlib 3.7 and the
# old name was later removed; support both.
legend_handles = getattr(leg, "legend_handles", None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for lh in legend_handles:
    lh.set_alpha(1)
plt.show()


Looking at this, we can see pretty clearly that there's a stripe where I sleep, except for 2016&mdash;2018 where I'm moving back and forth between Sydney and Vancouver, so the sleep and the timezones are all messed up.

The pale area on the left is because there's a missing dataset from the Skype era. Google Hangouts bursts onto the scene in 2013, then disappears almost as fast in 2017, with Instagram making a little showing, but not in earnest until very recently.

In [None]:
# Same date-vs-seconds scatter, one series per value of the "gender" column,
# leaving out the group labelled "me".
fig = plt.figure()
ax = fig.add_subplot(111)
z = 4
for name, df in all_convo_df.groupby("gender"):
    if name != "me":
        ax.scatter(df.date, df.seconds, s=10, alpha=0.04, label=name, zorder=z)
        z -= 1  # this reverses the default plotting order
# NOTE(review): the y values are plain second counts, so yaxis_date() reads
# them as matplotlib date numbers; a tick formatter that renders HH:MM would
# label this axis properly.
ax.yaxis_date()
fig.autofmt_xdate()
plt.title("all messages in and out, coloured by gender")
plt.ylabel("seconds after midnight GMT")
leg = plt.legend()
# The points are nearly transparent, so make the legend markers opaque.
# Legend.legendHandles was renamed legend_handles in matplotlib 3.7 and the
# old name was later removed; support both.
legend_handles = getattr(leg, "legend_handles", None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for lh in legend_handles:
    lh.set_alpha(1)
plt.show()


In [None]:
# Joint distribution of message date against seconds after midnight, drawn
# with small "." markers (a hue="platform" split is a possible extension).
sns.jointplot(
    x="date",
    y="seconds",
    data=all_convo_df,
    s=5,
    marker=".",
)


So, this makes sense to me, but it's a bit gross. The y axis is all messed up, it should be showing time, not number of seconds. I'm also not sure if time goes forward up or down the page.

It'd be good to work out a way of identifying sleep periods. Maybe that's a period that starts after 10pm that's offline for more than 4 hours?

In [None]:
# plt.rcParams["figure.figsize"] = (30, 30)

# One scatter series per sender. Senders with more than `cut` messages get a
# legend entry and their initials as the marker; everyone else is drawn as
# an unlabelled "x".
cut = 500
labels = []
odd_df = None  # last high-volume sender found with empty initials, if any

fontP = FontProperties()
fontP.set_size("x-small")
all_initials = []

fig = plt.figure()
ax = fig.add_subplot(111)
for name, df in all_convo_df.groupby("sender_name"):
    if df.shape[0] > cut:
        initials = df.initials.iloc[0]
        label = f"{name} ({initials}, {df.shape[0]})"

        if name == MY_NAME:
            # My own messages: tiny pixel markers so they don't swamp the plot.
            marker = ","
            ax.scatter(
                df.date,
                df.seconds,
                s=0.3,
                alpha=0.3,
                linewidth=0,
                label=label,
                marker=marker,
            )
        elif len(initials) > 0:
            # Mathtext marker: the sender's initials are drawn as the point.
            marker = f"${initials}$"
            ax.scatter(
                df.date,
                df.seconds,
                s=10 if len(initials) == 2 else 15,
                alpha=0.2,
                linewidth=0,
                label=label,
                marker=marker,
            )
        else:
            # No initials to draw: nothing is plotted, the frame is kept
            # aside for inspection.
            # marker = "1"
            # print(name, "odd one", df.content.head(10))
            odd_df = df

        labels.append(label)
    else:
        ax.scatter(
            df.date,
            df.seconds,
            s=15,
            alpha=0.1,
            marker="x",
        )
# NOTE(review): the y values are plain second counts, so yaxis_date() reads
# them as matplotlib date numbers; a tick formatter that renders HH:MM would
# label this axis properly.
ax.yaxis_date()
fig.autofmt_xdate()
plt.title("all messages in and out")
plt.ylabel("seconds after midnight GMT")
leg = plt.legend(
    title=f"People with more than {cut} messages",
    bbox_to_anchor=(-0.1, -0.1),
    # bbox_to_anchor=(1.05, 1),
    loc="upper left",
    prop=fontP,
    ncol=9,
    markerscale=3,
)
# The points are nearly transparent, so make the legend markers opaque.
# Legend.legendHandles was renamed legend_handles in matplotlib 3.7 and the
# old name was later removed; support both.
legend_handles = getattr(leg, "legend_handles", None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for lh in legend_handles:
    lh.set_alpha(1)

# plt.xlim(["2012-01-01", "2021-05-01"])

plt.savefig("all_messages.svg")
# plt.show()
# print(labels)


# Small multiples (four per row): one date-vs-seconds scatter per sender,
# restricted to rows where the boolean "chatty_people" column is set.
# Swap `data` for all_convo_df to plot everybody.
plot = sns.lmplot(
    data=all_convo_df[all_convo_df["chatty_people"]],
    x="date",
    y="seconds",
    col="sender_name",
    col_wrap=4,
    hue="sender_name",
    palette="Set2",
    legend=True,
    fit_reg=False,  # scatter only, no regression line
    scatter_kws={"s": 50, "alpha": 0.2},
)


In [None]:
# One large square scatter of every message not sent by MY_NAME, coloured by
# sender, saved as both SVG and PNG.
plot = sns.lmplot(
    data=all_convo_df[all_convo_df.sender_name != MY_NAME],
    x="date",
    y="seconds",
    hue="sender_name",
    palette="Set1",
    markers="x",
    legend=False,
    fit_reg=False,  # scatter only, no regression line
    scatter_kws={"s": 30, "alpha": 0.1},
    height=20,
    aspect=1.0,
)
plt.savefig("all_incoming.svg")
plt.savefig("all_incoming.png")


# Incoming messages only (sender is not MY_NAME), then narrowed to the rows
# flagged in the "chatty_people" column.
data = all_convo_df.loc[all_convo_df.sender_name != MY_NAME]
data = data.loc[data.chatty_people]
print(data.shape)
data.sample(3)


In [None]:
# Date-vs-seconds scatter of `data` on a 10x10 figure, coloured by sender.
# The legend is off; g.legend(bbox_to_anchor=(1.5, 1)) would place one
# outside the axes.
sns.set(rc={"figure.figsize": (10, 10)})
fig, ax = plt.subplots()
g = sns.scatterplot(
    data=data,
    x="date",
    y="seconds",
    hue="sender_name",
    palette="Set1",
    legend=False,
    alpha=0.1,
    s=30,
    ax=ax,
)


In [None]:
def encircle(x, y, ax=None, **kw):
    """Draw the convex hull of the points ``(x, y)`` as a polygon patch.

    Args:
        x: x coordinates of the points.
        y: y coordinates of the points, same length as ``x``.
        ax: Axes to add the patch to; the current axes (``plt.gca()``) when
            omitted.
        **kw: Passed straight through to ``plt.Polygon`` (for example
            ``ec`` and ``fc``).
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of an Axes object.
    if ax is None:
        ax = plt.gca()
    points = np.c_[x, y]
    hull = ConvexHull(points)
    # hull.vertices indexes the input points that form the hull outline.
    ax.add_patch(plt.Polygon(points[hull.vertices, :], **kw))


# For every sender in `data` with more than 4000 messages, draw (all in one
# colour): a straight-line fit of seconds-after-midnight against
# timestamp_ms, the messages themselves, the convex hull around them, and
# the sender's name at the centroid.
fig, ax = plt.subplots()
for x, d in data.groupby("sender_name"):
    if d.shape[0] <= 4000:
        continue
    m, b = np.polyfit(d.timestamp_ms, d.seconds, 1)
    ax.plot(d.timestamp_ms, m * d.timestamp_ms + b)
    # Reuse whatever colour matplotlib picked for the fitted line.
    col = ax.lines[-1].get_color()
    sc = ax.scatter(d.timestamp_ms, d.seconds, s=10, alpha=0.4, label=x, color=col)
    encircle(d.timestamp_ms, d.seconds, ax=ax, ec=col, fc="none")
    ax.annotate(x, (d.timestamp_ms.mean(), d.seconds.mean()), size=10, color=col)

plt.show()


In [None]:
# Messages sent by MY_NAME, counted in two-month bins and drawn as a line.
bd_out_df = all_convo_df[all_convo_df.sender_name == MY_NAME]
(
    bd_out_df.set_index("datetime")
    .groupby(pd.Grouper(freq="2M"))
    .count()["sender_name"]
    .plot()
)
plt.title("Messages sent per 2 Months")


In [None]:
# One line per platform: number of messages in each two-month bin.
for name, df in all_convo_df.groupby("platform"):
    binned = df.set_index("datetime").groupby(pd.Grouper(freq="2M")).count()
    binned["sender_name"].plot(label=name)
plt.legend()
plt.title("Messages sent per 2 Months")


In [None]:
# Stacked area chart of two-month message counts, one band per platform.
count_series = []
for name, df in all_convo_df.groupby("platform"):
    binned = df.set_index("datetime").groupby(pd.Grouper(freq="2M")).count()
    count_series.append({"data": binned.sender_name, "label": name})

# Put the per-platform series side by side as columns, then stack them.
stacked = pd.concat(
    [entry["data"] for entry in count_series],
    axis=1,
    keys=[entry["label"] for entry in count_series],
)
stacked.plot.area()
plt.title("Messages sent per 2 Months")


In [None]:
# One line per value of the "gender" column: number of messages in each
# two-month bin.
for name, df in all_convo_df.groupby("gender"):
    binned = df.set_index("datetime").groupby(pd.Grouper(freq="2M")).count()
    binned["sender_name"].plot(label=name)
plt.legend()
plt.title("Messages sent per 2 Months")


In [None]:
# Stacked area chart of two-month message counts, one band per value of the
# "gender" column.
count_series = []
for name, df in all_convo_df.groupby("gender"):
    binned = df.set_index("datetime").groupby(pd.Grouper(freq="2M")).count()
    count_series.append({"data": binned.sender_name, "label": name})

# Put the per-group series side by side as columns, then stack them.
stacked = pd.concat(
    [entry["data"] for entry in count_series],
    axis=1,
    keys=[entry["label"] for entry in count_series],
)
stacked.plot.area()
plt.title("Messages sent per 2 Months")


In [None]:
# One line per sender (other than MY_NAME) with more than 4000 messages:
# message count per two-month bin, with the sender's name at their peak.
fig, ax = plt.subplots()
for x, d in all_convo_df.groupby("sender_name"):
    if d.shape[0] > 4000 and x != MY_NAME:
        # "2M" (upper case) is the month-based alias used by the other
        # cells in this notebook; the lower-case "2m" spelling is deprecated
        # in recent pandas.
        per_period = (
            d.set_index("datetime").groupby(pd.Grouper(freq="2M")).count().sender_name
        )
        per_period.plot(label=x)
        # Reuse the line's colour for its annotation.
        col = plt.gca().lines[-1].get_color()
        plt.annotate(x, (per_period.idxmax(), per_period.max()), size=10, color=col)
plt.legend()
plt.title("Messages sent per 2 months")


In [None]:
# Stacked area chart of messages per period (`freq`) for each sender, other
# than MY_NAME, with more than 4000 messages.
count_series = []
freq = "Q"
for name, df in all_convo_df.groupby("sender_name"):
    if df.shape[0] > 4000 and name != MY_NAME:
        count_series.append(
            {
                "data": df.set_index("datetime")
                .groupby(pd.Grouper(freq=freq))
                .count()
                .sender_name,
                "label": name,
            }
        )
# Order the senders by total volume; this fixes the stacking order.
count_series.sort(key=lambda x: x["data"].sum())
res = pd.concat(
    [d["data"] for d in count_series], axis=1, keys=[d["label"] for d in count_series]
)
res.plot.area()
plt.title(f"Messages sent per {freq}")
plt.ylabel(f"Number of messages sent in that {freq}")
plt.xlim(["2012-01-01", "2021-05-01"])

# Annotations
# Walk up the stack in the second-to-last period and label every band taller
# than 200 messages at its vertical midpoint.
running_total = 0
# Series.iteritems() was removed in pandas 2.0; items() is the replacement.
for name, count in res.iloc[-2].items():
    if pd.isna(count):
        # This sender has no data in that period, so they add nothing to
        # the stack (replaces the former bare `except: pass`).
        continue
    running_total = running_total + int(count)
    if count > 200:
        # print(name, count, type(count), running_total)
        plt.annotate(
            "—" + name, (res.index[-2], running_total - (count / 2)), fontsize=7
        )

# Hand-placed labels for bands that are prominent earlier in the chart.
plt.annotate("Irina Belova", xy=("2015-07-01", 3000), fontsize=7, ha="center")
plt.annotate("Charlie", xy=("2017-04-01", 2500), fontsize=7, ha="center")
plt.annotate("Julz", xy=("2020-06-01", 2800), fontsize=7, ha="center")
plt.annotate("David Wilcox", xy=("2014-04-01", 500), fontsize=7, ha="center")


The above graph is pretty interesting.

Some notes: 

- It's aggregated by quarter because if done by month or number of months, it will start the aggregation by the first month that a person messaged. That means that some people will get aggregated in even months, others in odd. Strangely, most people are in one group, so it is super bumpy.
- Annotating area charts is really difficult. There's not an obvious way to do it, so this method is a bit of a hack, but seems neat enough. It'd be good to add in some other annotations too.

In [None]:
# Only the messages sent by these two people.
im = all_convo_df[all_convo_df.sender_name.isin(["Meike Wijers", "Irina Belova"])]


In [None]:
# Load the trips table, parsing its first two columns as dates, and add each
# trip's length ("to" minus "from") as a "duration" column.
trips = pd.read_csv("trips.csv", parse_dates=[0, 1])
trips["duration"] = trips["to"] - trips["from"]
trips.head()


In [None]:
# One line per sender in `im`: messages per week, with the sender's name
# placed at their busiest week.
for x, d in im.groupby("sender_name"):
    # "W" (upper case) is the canonical weekly alias; the lower-case
    # spelling relies on pandas' case-insensitive alias handling.
    per_period = (
        d.set_index("datetime").groupby(pd.Grouper(freq="W")).count().sender_name
    )
    per_period.plot(label=x)
    # Reuse the line's colour for its annotation.
    col = plt.gca().lines[-1].get_color()
    plt.annotate(x, (per_period.idxmax(), per_period.max()), size=10, color=col)
plt.legend()
plt.title("Messages sent per week")


def time_window_annotate(start_date, end_date, y, text, size=7, facecolor="black"):
    """Mark a time window on the current axes with a capped bar and a label.

    Args:
        start_date: Start of the window; anything ``pd.Timestamp`` accepts
            (a date string, a ``datetime`` or a ``Timestamp``).
        end_date: End of the window, same accepted types as ``start_date``.
        y: Data y coordinate at which the bar is drawn.
        text: Label, centred on the window and offset 20 points below ``y``.
        size: Font size of the label.
        facecolor: Colour of the bar.
    """
    # Normalise first: the midpoint needs date arithmetic, which raises
    # TypeError when the arguments are plain strings.
    start = pd.Timestamp(start_date)
    end = pd.Timestamp(end_date)
    midpoint = start + (end - start) / 2

    # Label, centred horizontally on the window.
    plt.annotate(
        text,
        xy=(midpoint, y),
        xycoords="data",
        ha="center",
        xytext=(0, -20),
        textcoords="offset points",
        fontsize=size,
    )
    # The bar itself: an empty annotation whose "arrow" is a |-| bracket
    # running from start to end.
    plt.annotate(
        "",
        xy=(end, y),
        xytext=(start, y),
        xycoords="data",
        textcoords="data",
        arrowprops={"arrowstyle": "|-|,widthA=0.2,widthB=0.2", "color": facecolor},
    )


# Call out a single event with a curved arrow.
plt.annotate(
    "Tiki\nParty",
    xy=("2018-02-24", 550),
    xycoords="data",
    xytext=(10, 50),
    textcoords="offset points",
    fontsize=7,
    arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=-0.2", "color": "k"},
)

# Hand-picked time windows.
time_window_annotate("2019-07-30", "2021-07-01", 100, "Meike away")
time_window_annotate("2014-04-17", "2014-05-03", 100, "Anthea's\nwedding")
time_window_annotate("2016-01-07", "2016-02-15", 700, "IB in Canada\nalone")

# Every trip longer than a week, each bar raised by 20 per row index so
# that the bars sit at different heights.
for i, row in trips[trips.duration > datetime.timedelta(days=7)].iterrows():
    time_window_annotate(
        row["from"], row["to"], 100 + (i * 20), f"{row.city}\n{row.purpose}"
    )
