# Firsts

If we consider all the messages ever sent to, and received by, _the corpus_, when did each word enter the corpus? Who put it there? What does it say about a person if they put a lot of new words into the corpus, and what even is a word?

---

Load up a tonne of libraries

In [1]:
import datetime
import json
import os
import pickle
import random
import re
import textwrap
from pathlib import Path
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.spatial import ConvexHull

import message_helpers as mh
from hangouts_loader import load_hangouts


In [2]:
# Notebook-wide plotting defaults: large figures, and "Segoe UI Emoji" as the
# sans-serif font (presumably so emoji in names/labels render).
plt.rcParams.update(
    {
        "figure.figsize": (20, 10),
        "font.sans-serif": ["Segoe UI Emoji"],
    }
)


In [3]:
# File name (relative to the working directory) of the pickled conversation
# DataFrame, and the same location as a Path object.
pickle_name = "all_convo.pickle"
pickle_path = Path(pickle_name)


Set your name here. This is so that you can take yourself out of some of the graphs. Because these are conversations, they naively go A B A B and so on, so you'll be roughly 50% of the messages, which makes other trends hard to see.

In [4]:
# Sender name of the notebook owner; the streak plot drops this column so the
# owner's own messages do not dominate the graph.
MY_NAME = "Ben Doherty"


In [5]:
# Read the cached conversation table from disk, report its size, and preview it.
all_convo_df = pd.read_pickle(pickle_path)
row_count = len(all_convo_df)
print(f"done: all_convo_df has {row_count} rows")
all_convo_df.head()


done: all_convo_df has 951556 rows


Unnamed: 0,sender_name,timestamp_ms,content,type,is_unsent,users,source_convo,datetime,message_length,platform,...,ip,gifs,files,audio_files,missed,event_id,input_names,initials,gender,clean_content
0,Ben Doherty,1521518000000.0,This probably isn't the best name for this,Generic,False,,1161remedialsessions_b_fdlwp_va,2018-03-20 14:47:48.721,42,Facebook,...,,,,,,,Ben Doherty,BD,me,probably isn't best name
1,Ben Doherty,1521503000000.0,You waved hello to the group.,Share,False,,1161remedialsessions_b_fdlwp_va,2018-03-20 10:44:18.115,29,Facebook,...,,,,,,,Ben Doherty,BD,me,waved hello group
2,Ben Doherty,1600076000000.0,Oops,Generic,False,,40thbirthdaydinner_xskxnp_jva,2020-09-14 19:40:35.115,4,Facebook,...,,,,,,,Ben Doherty,BD,me,oops
3,Ben Doherty,1600076000000.0,This poll is no longer available.,Generic,False,,40thbirthdaydinner_xskxnp_jva,2020-09-14 19:40:24.749,33,Facebook,...,,,,,,,Ben Doherty,BD,me,poll longer available
4,Ben Doherty,1600076000000.0,This poll is no longer available.,Generic,False,,40thbirthdaydinner_xskxnp_jva,2020-09-14 19:40:10.299,33,Facebook,...,,,,,,,Ben Doherty,BD,me,poll longer available


In [6]:
# Print a one-paragraph summary of the dataset, then show messages per platform.
# The first/last timestamps are computed once and reused; the span in whole
# days comes straight from the Timedelta instead of parsing its string form.
first_message = all_convo_df.datetime.min()
last_message = all_convo_df.datetime.max()
print(
    f"Overall, there are {len(all_convo_df)} messages in this dataset. "
    f"These come from about {len(all_convo_df.sender_name.unique())} people, "
    f"covering a period of {(last_message - first_message).days} days "
    f"between {first_message:%B, %Y} and {last_message:%B, %Y}. "
    f"Over {len(all_convo_df.platform.unique())} platforms:"
)
all_convo_df.platform.value_counts()


Overall, there are 951556, messages in this dataset. These come from about 87 people, covering a period of 5472 days between January, 2007 and December, 2021. Over 3 platforms:


Facebook     750279
Hangouts     129683
Instagram     71594
Name: platform, dtype: int64

In [10]:
# The twenty senders with the most messages, most prolific first.
all_convo_df.sender_name.value_counts().head(20)

Ben Doherty             492887
Meike Wijers             67526
Irina Belova             50729
Ivana Kuzmanovska        44311
Lucy Rimmer              39013
Jenn Martin              34331
Bree-Danielle Wyatt      21717
Maddie Johanson          12670
Charles Ogilvie          12314
Natalie Barnes           10703
David Wilcox              9482
Annisa Rivera Rizal       9253
Katherine Withnell        8627
Erika Bloomingdale        7588
Alessandra Moschella      7557
Elizabeth Deacon          7468
Jülz Milthorpe            7273
Byron Sullivan            6880
Sarah Maloof              5700
Nazmul Azim Khan          5568
Name: sender_name, dtype: int64

# Finding streaks

The goal here is to see how long communication is maintained over different-sized periods.

Using [this](https://joshdevlin.com/blog/calculate-streaks-in-pandas/) as a guide, let's start by looking at streaks in just one person's comms.

In [None]:
# Sender whose messages are analysed in the next few cells, and the bin width
# (a pandas frequency string) used when counting their messages.
person_of_intrerest = "Meike Wijers"
period = "1d"

In [None]:
# Keep only the rows sent by the chosen person and preview them.
is_from_person = all_convo_df.sender_name == person_of_intrerest
mw_df = all_convo_df[is_from_person]
mw_df.head()

In [None]:
# Count messages per `period` bin. Selecting the column before aggregating
# avoids counting every other column only to discard the results.
mpd = (
    mw_df.set_index("datetime")
    .sender_name.groupby(pd.Grouper(freq=period))
    .count()
)
mpd.plot()
plt.title(f"Number of messages per day received from {person_of_intrerest}")


In [None]:
# Build a streak table for one person, one row per period bin.
m_df = mpd.to_frame(name="message_count")
# True for bins that contain at least one message.
m_df["message_flag"] = m_df.message_count > 0
# A new run starts wherever the flag differs from the previous bin's flag.
m_df["start_of_streak"] = m_df.message_flag.ne(m_df.message_flag.shift())
# The running total of run starts gives each run its own id.
m_df["streak_id"] = m_df.start_of_streak.cumsum()
# Position within the run, counted from 1.
m_df["streak_counter"] = m_df.groupby("streak_id").cumcount() + 1
# Runs of empty bins are not streaks, so zero them out (vectorised, no row-wise apply).
m_df["streak_counter"] = m_df.streak_counter.where(m_df.message_flag, 0)
m_df.head()


In [None]:
# Plot the running streak length for the chosen person.
streak_title = (
    f"Length of streaks of conversations with {person_of_intrerest}\n"
    f"Where the period is {period}"
)
m_df["streak_counter"].plot()
plt.title(streak_title)
plt.ylabel(f"number of {period}s of continuous communication")

In [None]:
def make_streak_df(convo_df, period="1d"):
    """Build a table of running streak lengths, one column per sender.

    For every distinct ``sender_name`` the messages are counted in bins of
    width ``period``. A run of consecutive bins that each contain at least one
    message is numbered 1, 2, 3, ...; a run of consecutive empty bins is
    numbered -1, -2, -3, ....

    Args:
        convo_df: DataFrame with a ``sender_name`` column and a ``datetime``
            column that can serve as a time index for ``pd.Grouper``.
        period: pandas frequency string giving the bin width.

    Returns:
        DataFrame indexed by bin label with one column per sender. Bins where
        a sender has no value of their own hold 0, because the final
        aggregation is a sum. An empty DataFrame is returned when
        ``convo_df`` contains no senders.

    Note:
        Bin labels are snapped to midnight before the senders are combined,
        so ``period`` is expected to be one day or longer.
    """
    per_sender = []
    for sender in convo_df.sender_name.unique():
        sender_rows = convo_df[convo_df.sender_name == sender]
        # Select the column before aggregating so only one column is counted.
        message_count = (
            sender_rows.set_index("datetime")["sender_name"]
            .groupby(pd.Grouper(freq=period))
            .count()
        )
        active = message_count > 0
        # A new run starts wherever the flag differs from the previous bin;
        # the running total of those starts gives each run its own id.
        run_id = active.ne(active.shift()).cumsum()
        run_position = active.groupby(run_id).cumcount() + 1
        # Positive inside streaks, negative inside gaps (vectorised).
        streak = run_position.where(active, -run_position)
        streak.index = streak.index.normalize()
        per_sender.append(streak.to_frame(name=sender))

    if not per_sender:
        # pd.concat raises on an empty list; no senders means no streaks.
        return pd.DataFrame()

    # Stacking along the index leaves one row per (sender, bin) with NaN in
    # every other sender's column. Re-binning and summing collapses those rows
    # into a single row per bin; it also aligns senders whose bins are anchored
    # on different start dates (e.g. 7-day bins). NaN sums to 0.
    stacked = pd.concat(per_sender, axis="index")
    return stacked.groupby(pd.Grouper(freq=period)).sum()


# Daily streak table for every sender in the dataset.
everyone_df = make_streak_df(all_convo_df, period="1d")


In [None]:
def plot_streak_graph(
    df,
    period="1d",
    period_name="days",
    time_span=("2013-01-01", "2022-01-05"),
    ylim=(0, 500),
):
    """Plot every sender's running streak length and label each peak.

    Args:
        df: Streak table with a time index and one column per sender, as
            produced by ``make_streak_df``.
        period: Frequency string the table was binned with; shown in the title.
        period_name: Human-readable plural of the period (e.g. ``"days"``),
            used in the annotations and the y-axis label.
        time_span: ``(start, end)`` x-axis limits. Pass a falsy value to keep
            the automatic limits; the title then reports the first and last
            index values of ``df``.
        ylim: ``(bottom, top)`` y-axis limits. Pass a falsy value to keep the
            automatic limits; each sender's minimum is then annotated as well.

    Side effects:
        Switches the global matplotlib style to "ggplot" and calls
        ``plt.show()``.
    """
    plt.style.use("ggplot")

    # Drop the notebook owner's column if present, and fill gaps once so the
    # annotations are computed from exactly the values that are drawn.
    trim = df.drop(MY_NAME, axis="columns", errors="ignore").fillna(0)
    trim.plot(kind="line", style="-", legend=None)

    if time_span:
        plt.xlim(time_span)
    else:
        time_span = [df.index[0], df.index[-1]]
    if ylim:
        plt.ylim(ylim)
    else:
        # With no y-limit the negative side is visible, so label each
        # sender's lowest point too.
        for col in trim:
            trough = trim[col].min()
            trough_idx = trim[col].idxmin()
            plt.annotate(
                f"{col} ({trough} {period_name})",
                (trough_idx, trough),
                rotation=90,
                ha="left",
            )
    # Rotate text (https://matplotlib.org/3.1.1/gallery/text_labels_and_annotations/text_rotation_relative_to_line.html)
    # Convert a 45 degree angle in data space to the matching on-screen angle.
    raw_ang = np.array((45,))
    l2 = np.array((1, 1))
    trans_ang = plt.gca().transData.transform_angles(raw_ang, l2.reshape((1, 2)))[0]

    for col in trim:
        peak = trim[col].max()
        peak_idx = trim[col].idxmax()
        plt.annotate(
            f"{col} ({int(peak)} {period_name}) {peak_idx.strftime('%b %d')}",
            (peak_idx, peak),
            rotation=trans_ang,
            ha="left",
        )

    plt.title(
        f"Length of streaks of conversations during the period {time_span[0]} to {time_span[1]}\n"
        f"Where the period is {period}"
    )
    # Use the human-readable name; f"{period}s" rendered as e.g. "1ds".
    plt.ylabel(f"number of {period_name} of continuous communication")
    plt.show()


# Daily streaks over the default time span, with the y-axis capped at 400.
plot_streak_graph(everyone_df, period="1d", period_name="days", ylim=[0, 400])


In [None]:
# Daily streaks: first across the full extent of the data, then over the
# default time span. Both plots share the same labelling and y-limits.
everyone_df = make_streak_df(all_convo_df, "1d")
daily_kwargs = dict(period="1d", period_name="days", ylim=[0, 400])
plot_streak_graph(everyone_df, time_span=False, **daily_kwargs)
plot_streak_graph(everyone_df, **daily_kwargs)

In [None]:
# Weekly streaks: once with automatic y-limits, once capped at 250.
# The period label passed to the plots matches the "7d" binning used to build
# the table, so both titles report the same period.
everyone_df = make_streak_df(all_convo_df, "7d")
plot_streak_graph(everyone_df, period="7d", period_name="weeks", ylim=False)
plot_streak_graph(everyone_df, period="7d", period_name="weeks", ylim=[0, 250])

In [None]:
# Monthly streaks with automatic y-limits.
# NOTE(review): "1m" is intended as a monthly bin; confirm the installed pandas
# version reads this alias as months rather than minutes.
everyone_df = make_streak_df(all_convo_df, period="1m")
plot_streak_graph(everyone_df, period="1m", period_name="months", ylim=False)