# Importing all the conversations

This is a bit trickier as you need to do something with all the conversations you're loading up.

In [None]:
import datetime
import json
import os
import pickle
import random
import re
import textwrap
from pathlib import Path
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.spatial import ConvexHull

import message_helpers as mh


In [None]:
# Notebook-wide plotting defaults: a 20x20 figure and "Segoe UI Emoji" as the
# sans-serif face (presumably so emoji in names/messages render - confirm).
plt.rcParams.update(
    {
        "figure.figsize": (20, 20),
        "font.sans-serif": ["Segoe UI Emoji"],
    }
)


In [None]:
# Where the combined conversation DataFrame is cached between runs.
pickle_path = Path("all_convo.pickle")
pickle_name = pickle_path.name


In [None]:
def parse_obj(obj):
    """Re-decode the text held in one decoded JSON object.

    Every string value, and every string element of a list value, is encoded
    as Latin-1 and decoded again as UTF-8 (presumably to undo mojibake in the
    message export - confirm against a sample file). Values of any other type,
    including nested dicts, are left untouched. Suitable for use as a
    ``json.load`` ``object_hook``.

    Args:
        obj: dict produced by the JSON decoder. It is modified in place.

    Returns:
        The same ``obj``, after modification.

    Raises:
        UnicodeEncodeError: a string contains characters outside Latin-1.
        UnicodeDecodeError: the Latin-1 bytes are not valid UTF-8.
    """
    # Only existing keys are reassigned, so iterating over items() is safe.
    for key, value in obj.items():
        if isinstance(value, str):
            obj[key] = value.encode("latin_1").decode("utf-8")
        elif isinstance(value, list):
            obj[key] = [
                item.encode("latin_1").decode("utf-8")
                if isinstance(item, str)
                else item
                for item in value
            ]
    return obj


def sumarise_convo(name, data, verbose=False):
    """Summarise how much, and how varied, the text in ``data.content`` is.

    Args:
        name: label for this group of messages; returned as ``participant``.
        data: DataFrame with a ``content`` column of message strings. Missing
            values are skipped.
        verbose: when True, print a one-line summary.

    Returns:
        dict with keys ``participant``, ``wordcount`` (number of
        whitespace-separated words), ``unique_words`` (distinct raw words) and
        ``cleaned_unique`` (distinct lower-cased words after punctuation
        removal and stop-word filtering).
    """
    # str.cat drops missing values, so no literal "nan" words sneak in.
    text = data.content.str.cat(sep=" ")

    # split() with no argument splits on any whitespace run, so double spaces
    # and newlines do not produce empty "words" and an empty conversation
    # counts as zero words.
    raw_words = text.split()
    wordcount = len(raw_words)
    unique_words = set(raw_words)

    # Replace punctuation with a single space, then drop stop words.
    clean = re.sub(mh.PUNCTUATION_REGEX, " ", text.lower(), flags=re.VERBOSE)
    stopped = {w for w in clean.split() if w not in mh.STOP_WORDS}

    if verbose:
        print(
            f"{name} wrote {wordcount} words ({len(text)} characters)"
            f" and used {len(stopped)} different words."
        )
    return {
        "participant": name,
        "wordcount": wordcount,
        "unique_words": len(unique_words),
        "cleaned_unique": len(stopped),
    }


def get_message_length(message):
    """Return the character count of ``message``.

    Anything that is not exactly a ``str`` is measured through its ``str()``
    representation.
    """
    text = message if type(message) is str else str(message)
    return len(text)


def replace_typographic_apostrophy(message):
    """Swap curly apostrophes for straight ones; non-strings pass through."""
    if type(message) is not str:
        return message
    return message.replace("’", "'")


In [None]:
def load_whole_inbox(rootdir, platform="Facebook"):
    """Load every message file below ``rootdir`` into DataFrames.

    ``rootdir`` is scanned one level deep: each sub-directory is treated as a
    conversation and each file directly inside it is parsed as JSON with a
    top-level ``"messages"`` list. Nothing is parsed when the module-level
    ``pickle_path`` cache file already exists.

    Each returned frame has these columns added: ``source_convo`` (directory
    name), ``datetime`` (from ``timestamp_ms``), ``message_length``,
    ``content`` (NaN when the file has no such column) and ``platform``.

    Args:
        rootdir: directory holding one sub-directory per conversation.
        platform: label stored in the ``platform`` column.

    Returns:
        list of DataFrames, one per message file that contained messages.
        Empty when the pickle cache exists.
    """
    conversations = os.listdir(rootdir)
    print(f"There are {len(conversations)} conversations to look at from {platform}.")

    convo_df_list = []
    if pickle_path.is_file():
        # The combined frame is already cached, so skip the slow parse.
        return convo_df_list

    for convo in conversations:
        convo_dir = os.path.join(rootdir, convo)
        if not os.path.isdir(convo_dir):
            # Stray files next to the conversation folders used to crash the
            # whole load, because os.listdir() on them ran outside the try.
            continue

        for f in os.listdir(convo_dir):
            path = os.path.join(convo_dir, f)
            if not os.path.isfile(path):
                continue

            # Best effort: report a file that cannot be parsed and carry on.
            try:
                with open(path, "r") as fb_data:
                    messages = json.load(fb_data, object_hook=parse_obj)
                message_list = list(messages["messages"])
                if not message_list:
                    continue

                df = pd.DataFrame(message_list)
                df["source_convo"] = convo
                # NOTE: fromtimestamp() returns naive local time, not UTC.
                df["datetime"] = df.timestamp_ms.apply(
                    lambda x: datetime.datetime.fromtimestamp(x / 1000.0)
                )

                if "content" in df.columns:
                    # Length is measured before the apostrophe replacement.
                    df["message_length"] = df.content.apply(get_message_length)
                    df.content = df.content.apply(replace_typographic_apostrophy)
                else:
                    df["message_length"] = 0
                    df["content"] = np.nan

                df["platform"] = platform
                convo_df_list.append(df)
            except Exception as e:
                print("exception", convo, e)
    return convo_df_list


# Build the export paths with os.path.join: the hand-written "\m" and "\i"
# were invalid escape sequences (a warning on current Python) and only worked
# on Windows. On Windows the resulting strings are unchanged.
fb_rootdir = os.path.join("fb_data", "messages", "inbox")
fb_convo_df_list = load_whole_inbox(fb_rootdir, platform="Facebook")
ig_rootdir = os.path.join("ig_data", "inbox")
ig_convo_df_list = load_whole_inbox(ig_rootdir, platform="Instagram")
convo_df_list = fb_convo_df_list + ig_convo_df_list
len(convo_df_list)


In [None]:
# Reuse the cached frame when there is one; otherwise build it and cache it.
# NOTE: only unpickle a cache file this notebook wrote itself; pickle is not
# safe on files from elsewhere.
if pickle_path.is_file():
    all_convo_df = pd.read_pickle(pickle_path)
else:
    # ignore_index gives every message a unique row label; without it each
    # per-file frame contributes its own 0..n labels and they collide.
    all_convo_df = pd.concat(convo_df_list, ignore_index=True)
    pd.to_pickle(all_convo_df, pickle_path)

# At cell top level so the notebook actually displays it, on both paths.
all_convo_df.sample(10)

In [None]:
def clean_and_stop(content, as_list=False):
    """Lower-case ``content``, strip punctuation and drop stop words.

    Args:
        content: message text. Anything that is not a string (e.g. NaN for a
            message without text) is returned unchanged.
        as_list: return the surviving words as a list instead of a single
            space-joined string.

    Returns:
        The cleaned words (list or string, per ``as_list``), or ``content``
        itself when it is not a string.
    """
    if not isinstance(content, str):
        return content

    # Replace punctuation with a single space.
    clean = re.sub(mh.PUNCTUATION_REGEX, " ", content, flags=re.VERBOSE)
    # The stop words live in message_helpers (mh). The previous reference to
    # an undefined "mg" raised NameError, which a catch-all except hid, so
    # every message came back uncleaned.
    stopped = [w for w in map(str.lower, clean.split()) if w not in mh.STOP_WORDS]
    return stopped if as_list else " ".join(stopped)


all_convo_df["clean_content"] = all_convo_df.content.apply(clean_and_stop)


In [None]:
# words = {}
# for name, data in df.groupby("sender_name"):
#     words[name] = data.content.str.cat(sep=" ")
#     wordcount = len(words[name].split(" "))
#     # print(f"{name} wrote {wordcount} words ({len(words[name])} characters)")


In [None]:
# for name, df in all_convo_df.groupby("platform"):
#     vc = df.sender_name.value_counts()
#     print(vc[vc>100].index)


In [None]:
def fold_names(input_name):
    """Map a known alias onto one canonical sender name.

    Names that are not listed as an alias are returned unchanged.
    """
    aliases_by_person = {
        "Byron Sullivan": ("Byron Sullivan", "Byron"),
        "Charlie": ("Thearlaich Ogilive", "Charles OGILVIE"),
        "Karin": ("Karin Frost", "karin ke"),
        "Ivana Kuzmanovska": ("Ivana Kuzmanovska", "ivana kuzmanovska"),
        "Jülz Milthorpe": ("Jülz", "Jülz Milthorpe"),
        "Jess Howard": ("jesshoward", "Jess Howard"),
        "Jodie Hinton": ("Jodie",),
        "Antonia Sheil": ("Tones",),
        "Annisa Rivera Rizal": ("annisarivera",),
    }
    # Invert to alias -> canonical name for the lookup.
    canonical_for = {
        alias: person
        for person, aliases in aliases_by_person.items()
        for alias in aliases
    }
    return canonical_for.get(input_name, input_name)


# Keep the names as exported, then overwrite sender_name with the folded ones.
all_convo_df["input_names"] = all_convo_df["sender_name"]
all_convo_df["sender_name"] = all_convo_df["sender_name"].apply(fold_names)


In [None]:
# Sanity check: (rows, columns) followed by the first five rows.
frame_shape = all_convo_df.shape
print(frame_shape)
all_convo_df.head(5)

In [None]:
# One summary row per sender.
conv_meta = [
    sumarise_convo(sender, messages)
    for sender, messages in all_convo_df.groupby("sender_name")
]
meta_df = pd.DataFrame(conv_meta)
# Words written per distinct cleaned word; the +1 avoids dividing by zero.
meta_df["ratio"] = meta_df["wordcount"] / (meta_df["cleaned_unique"] + 1)


In [None]:
# Scatter of total words against distinct (cleaned) words, one point per sender.
fig = plt.figure()
ax = plt.gca()
ax.scatter(meta_df.wordcount, meta_df.cleaned_unique)
plt.xlabel("Wordcount")
plt.ylabel("Number of unique words")
plt.xlim([0, 380000])
plt.ylim([0, 18000])

# Label only the heavy writers, each with a randomly placed callout.
for i, row in meta_df.iterrows():
    if row.wordcount <= 15000:
        continue
    # random.choice((-1, 1)) picks a direction. The earlier
    # random.sample(range(-1, 1), k=1) could only give -1 or 0, so labels
    # often got a zero offset and never went right or up.
    x_offset = random.randint(50, 130) * random.choice((-1, 1))
    y_offset = random.randint(50, 60) * random.choice((-1, 1))
    plt.annotate(
        row.participant,
        (row.wordcount, row.cleaned_unique),
        size=10,
        xycoords="data",
        xytext=(x_offset, y_offset),
        textcoords="offset points",
        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.2"),
    )

# # define the true objective function
# def objective(x, a, b, c):
# 	return a * x + b * x**2 + c
# x=meta_df.wordcount
# y=meta_df.cleaned_unique
# # curve fit
# popt, _ = curve_fit(objective, x, y)
# # summarize the parameter values
# a, b, c = popt
# print('y = %.5f * x + %.5f * x^2 + %.5f' % (a, b, c))

# # define a sequence of inputs between the smallest and largest known inputs
# x_line = np.arange(min(x), max(x), 1)
# # calculate the output for the range
# y_line = objective(x_line, a, b, c)
# # create a line plot for the mapping function
# plt.plot(x_line, y_line, '--', color='red')


In [None]:
# Horizontal bars of the wordcount/unique-word ratio, smallest first,
# restricted to senders with more than 5000 words.
big_talkers = meta_df[meta_df.wordcount > 5000]
ratio_by_person = big_talkers.set_index("participant")["ratio"].sort_values()
ratio_by_person.plot.barh()
plt.title(
    "Ratio of wordcount to unique words count\n(only showing those with word counts over 5k)"
)


In [None]:
# Messages per sender, keeping only counts strictly between the two bounds.
vc = all_convo_df.sender_name.value_counts()
lb, ub = (500, 100000)
within_bounds = vc.gt(lb) & vc.lt(ub)
vc[within_bounds].plot.barh(
    title=f"Number of messages sent (between {lb} & {ub})"
)


In [None]:
# Boolean Series indexed by sender: True for anyone with more than 1000 messages.
chatty_people = all_convo_df.sender_name.value_counts() > 1000

# Flag each message whose sender is chatty. isin() does this in one
# vectorised pass and gives False for a missing sender_name, where the
# per-row lookup raised KeyError.
chatty_names = chatty_people[chatty_people].index
all_convo_df["chatty_people"] = all_convo_df.sender_name.isin(chatty_names)
all_convo_df.sample(10)


# Plotting time that might show sleep

This is a bit nasty, I don't really like it, but it seems to work. There _must_ be a better way.

Convert the time component of the datetime to just-date and just-time (but in seconds, as a number) and then plot that.

The y axis is nasty because it doesn't show the time of day, just the raw number of seconds.

In [None]:
def time_to_seconds(dt):
    """Return whole seconds since midnight for the time-of-day part of ``dt``."""
    clock = dt.time()
    return clock.hour * 3600 + clock.minute * 60 + clock.second


# Split each timestamp into a calendar date, a time of day, and that time of
# day expressed as seconds since midnight.
stamps = all_convo_df["datetime"]
derived_columns = {
    "date": lambda stamp: stamp.date(),
    "time": lambda stamp: stamp.time(),
    "seconds": time_to_seconds,
}
for column, derive in derived_columns.items():
    all_convo_df[column] = stamps.apply(derive)
all_convo_df.sample(5)


In [None]:
# Every message as a faint dot: date across, seconds since midnight up,
# one scatter series per platform.
fig, ax = plt.subplots()
for platform_name, platform_df in all_convo_df.groupby("platform"):
    ax.scatter(
        platform_df.date,
        platform_df.seconds,
        s=20,
        alpha=0.04,
        label=platform_name,
    )
ax.yaxis_date()
fig.autofmt_xdate()
ax.set_title("all messages in and out")
ax.set_ylabel("seconds after midnight GMT")

plt.show()


So, this makes sense to me, but it's a bit gross. The y axis is all messed up, it should be showing time, not number of seconds. I'm also not sure if time goes forward up or down the page.

It'd be good to work out a way of identifying sleep periods. Maybe that's a period that starts after 10pm that's offline for more than 4 hours?

In [None]:
# One big scatter of every message (date vs seconds since midnight). Senders
# with more than `cut` messages are drawn with their initials as the marker
# and listed in the legend; everyone else is an anonymous faint cross.
plt.rcParams["figure.figsize"] = (30, 30)

cut = 500
labels = []
odd_df = None  # last frame whose sender name produced no initials, for inspection

fontP = FontProperties()
fontP.set_size("x-small")
all_initials = []

fig = plt.figure()
ax = fig.add_subplot(111)
for name, df in all_convo_df.groupby("sender_name"):
    if df.shape[0] <= cut:
        ax.scatter(df.date, df.seconds, s=15, alpha=0.1, marker="x")
        continue

    name_parts = name.split()
    initials = "".join(part[0] for part in name_parts).upper()
    if initials in all_initials:
        print("uh oh, double up on", initials, name)
        # Disambiguate with the second letter of the last name. The [1:2]
        # slice yields "" for a one-letter last name instead of IndexError.
        last_part = name_parts[-1] if name_parts else ""
        initials = initials + last_part[1:2]
        print("replaced with", initials)
    else:
        all_initials.append(initials)

    label = f"{name} ({initials}, {df.shape[0]})"

    if name == "Ben Doherty":
        # Own messages: tiny pixel markers so they sit in the background.
        ax.scatter(
            df.date,
            df.seconds,
            s=0.3,
            alpha=0.3,
            linewidth=0,
            label=label,
            marker=",",
        )
    elif len(initials) > 0:
        # Mathtext marker, so each point is drawn as the sender's initials.
        ax.scatter(
            df.date,
            df.seconds,
            s=10 if len(initials) == 2 else 15,
            alpha=0.2,
            linewidth=0,
            label=label,
            marker=f"${initials}$",
        )
    else:
        # No usable initials: nothing is plotted, keep the frame for a look.
        odd_df = df

    labels.append(label)

ax.yaxis_date()
fig.autofmt_xdate()
plt.title("all messages in and out")
plt.ylabel("seconds after midnight GMT")
leg = plt.legend(
    title=f"People with more\nthan {cut} messages",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    prop=fontP,
)
# Legend markers inherit the low alpha of the points; make them solid.
# `legend_handles` is the current Matplotlib name; `legendHandles` is the
# deprecated spelling, kept as a fallback for older versions.
legend_handles = getattr(leg, "legend_handles", None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for lh in legend_handles:
    lh.set_alpha(1)

plt.savefig("all_messages.svg")
# plt.show()
print(labels)


In [None]:
# Small multiples (four per row): one date-vs-seconds panel per chatty sender.
chatty_messages = all_convo_df[all_convo_df["chatty_people"]]
facet_options = dict(
    x="date",
    y="seconds",
    hue="sender_name",
    fit_reg=False,
    legend=True,
    palette="Set2",
    col="sender_name",
    col_wrap=4,
    scatter_kws=dict(s=80, alpha=0.2),
)
plot = sns.lmplot(data=chatty_messages, **facet_options)


In [None]:
# Every message not sent by "Ben Doherty", coloured by sender, saved as SVG and PNG.
sns.set(rc={"figure.figsize": (15, 15)})
incoming_messages = all_convo_df[all_convo_df.sender_name != "Ben Doherty"]
incoming_options = dict(
    x="date",
    y="seconds",
    hue="sender_name",
    markers="x",
    fit_reg=False,
    legend=False,
    palette="Set1",
    scatter_kws=dict(s=30, alpha=0.1),
    height=20,
    aspect=20 / 20,
)
plot = sns.lmplot(data=incoming_messages, **incoming_options)
# plt.xticks(rotation=45);
for output_file in ("all_incoming.svg", "all_incoming.png"):
    plt.savefig(output_file)


In [None]:
# Messages from chatty senders other than "Ben Doherty".
is_incoming = all_convo_df.sender_name != "Ben Doherty"
data = all_convo_df[is_incoming & all_convo_df.chatty_people]
print(data.shape)
data.sample(3)


In [None]:
# Same data on a single axes: date vs seconds, coloured by sender, no legend.
sns.set(rc={"figure.figsize": (10, 10)})
fig, ax = plt.subplots()
point_style = dict(hue="sender_name", legend=False, palette="Set1", s=30, alpha=0.1)
g = sns.scatterplot(x="date", y="seconds", data=data, ax=ax, **point_style)
# g.legend(bbox_to_anchor=(1.5, 1))
# g.legend(bbox_to_anchor=(1.5, 1))


In [None]:
def encircle(x, y, ax=None, **kw):
    """Draw the convex hull of the points ``(x, y)`` as a polygon patch.

    Args:
        x, y: equal-length coordinate sequences.
        ax: axes to draw on; defaults to the current axes.
        **kw: forwarded to ``plt.Polygon`` (e.g. ``ec``, ``fc``).
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of an Axes object.
    if ax is None:
        ax = plt.gca()
    points = np.c_[x, y]  # one (x, y) pair per row
    hull = ConvexHull(points)
    # hull.vertices indexes the rows of `points` that form the hull outline.
    outline = plt.Polygon(points[hull.vertices, :], **kw)
    ax.add_patch(outline)


# For each sender with more than 4000 messages: a straight-line fit of
# seconds-since-midnight against timestamp, the points themselves, their
# convex hull, and the sender's name at the centroid - all in one colour.
fig, ax = plt.subplots()
for sender, d in data.groupby("sender_name"):
    if d.shape[0] <= 4000:
        continue
    slope, intercept = np.polyfit(d.timestamp_ms, d.seconds, 1)
    (trend_line,) = ax.plot(d.timestamp_ms, slope * d.timestamp_ms + intercept)
    col = trend_line.get_color()
    sc = ax.scatter(
        d.timestamp_ms, d.seconds, s=10, alpha=0.4, label=sender, color=col
    )
    encircle(d.timestamp_ms, d.seconds, ax=ax, ec=col, fc="none")
    centroid = (d.timestamp_ms.mean(), d.seconds.mean())
    ax.annotate(sender, centroid, size=10, color=col)

plt.show()


In [None]:
# Messages sent by "Ben Doherty", counted per calendar month.
bd_out_df = all_convo_df[all_convo_df.sender_name == "Ben Doherty"]
by_month = bd_out_df.set_index("datetime").groupby(pd.Grouper(freq="M"))
sent_per_month = by_month.count()["sender_name"]
sent_per_month.plot()
plt.title("Messages sent per Month")


In [None]:
# Message counts over time for each heavy sender (more than 4000 messages,
# excluding "Ben Doherty"), labelled at that sender's peak.
fig, ax = plt.subplots()
for sender, d in all_convo_df.groupby("sender_name"):
    if sender == "Ben Doherty" or d.shape[0] <= 4000:
        continue
    # NOTE(review): "2m" presumably means two-month bins - confirm how the
    # installed pandas reads the lower-case alias.
    binned = d.set_index("datetime").groupby(pd.Grouper(freq="2m"))
    per_period = binned.count()["sender_name"]
    per_period.plot()
    col = plt.gca().lines[-1].get_color()
    peak = (per_period.idxmax(), per_period.max())
    plt.annotate(sender, peak, size=10, color=col)


In [None]:
# Substrings that mark a message as "soo".
SOO_WORDS = [
    "poop",
    "home",
    "doughnut",
]


def is_soo(content):
    """Classify a message as ``"soo"`` or ``"not"``.

    A message is "soo" when it contains any of ``SOO_WORDS`` as a substring,
    ignoring case.

    Args:
        content: message text. Non-strings (e.g. NaN for a message without
            text) are classified as ``"not"``.

    Returns:
        ``"soo"`` or ``"not"``.
    """
    if not isinstance(content, str):
        return "not"
    # Lower-case the message as well as the needle; lower-casing only the
    # needle meant "Home" or "POOP" in a message never matched.
    haystack = content.lower()
    if any(word.lower() in haystack for word in SOO_WORDS):
        return "soo"
    return "not"


In [None]:
# Tag every message, then plot "soo" messages as red crosses over the rest
# as faint blue dots.
all_convo_df["soo"] = all_convo_df.content.apply(is_soo)

fig, ax = plt.subplots()
soo_style = dict(s=30, alpha=0.5, c="red", marker="x")
other_style = dict(s=10, alpha=0.1, c="blue", marker=".")
for flag, group in all_convo_df.groupby("soo"):
    style = soo_style if flag == "soo" else other_style
    ax.scatter(group.date, group.seconds, **style)

# ax.yaxis_date()
seconds_in_a_day = 24 * 60 * 60
# At most 30 major ticks on each axis; y spans exactly one day.
ax.yaxis.set_major_locator(plt.MaxNLocator(30))
ax.set_ylim([0, seconds_in_a_day])

ax.xaxis.set_major_locator(plt.MaxNLocator(30))
fig.autofmt_xdate()
# plt.xlim(['2020-07-18', '2021-07-21'])

word_list = ", ".join(SOO_WORDS)
fig.suptitle("When do we talk soo?")
ax.set_title(f"Occurance of {word_list}")

plt.show()


In [None]:
# Most frequent non-stop words across the messages tagged "soo".
top = 50

soo_messages = all_convo_df.loc[all_convo_df.soo == "soo", "content"]
pool = " ".join(soo_messages.to_list())
# Replace punctuation with a single space.
clean = re.sub(mh.PUNCTUATION_REGEX, " ", pool, flags=re.VERBOSE)
stopped = [w for w in map(str.lower, clean.split()) if w not in mh.STOP_WORDS]
vc = pd.Series(stopped).value_counts()
vc.iloc[:top].plot.barh()
plt.title(f'Top {top} most common words in "soo" messages')


In [None]:
# Same word ranking, this time over every message (non-strings go through str()).
pool = " ".join(map(str, all_convo_df.content))
# Replace punctuation with a single space.
clean = re.sub(mh.PUNCTUATION_REGEX, " ", pool, flags=re.VERBOSE)
stopped = [w for w in map(str.lower, clean.split()) if w not in mh.STOP_WORDS]
vc = pd.Series(stopped).value_counts()
vc.iloc[:top].plot.barh()
plt.title(f"Top {top} most common words in all messages")


In [None]:
# Ratio of "soo" to "not" messages for every sender with more than 1000 messages.
ratios = {}
for name, df in all_convo_df.groupby("sender_name"):
    if df.shape[0] > 1000:
        vc = df.soo.value_counts()
        # A category with no messages is simply absent from value_counts, so
        # it counts as 0 (not 1, which overstated senders with no soo at all).
        soo_count = vc.get("soo", 0)
        not_count = vc.get("not", 0)
        # With no "not" messages the ratio is undefined; NaN keeps the sender
        # in the index without crashing on the missing key.
        ratios[name] = soo_count / not_count if not_count else np.nan
highly_soo = pd.Series(ratios).sort_values()
highly_soo.plot.barh()


In [None]:
# print(highly_soo.index)
# Keep only messages from the senders that made it into highly_soo. isin()
# does one vectorised membership test instead of rebuilding
# list(highly_soo.index) for every row.
highly_soo_df = all_convo_df[all_convo_df.sender_name.isin(highly_soo.index)]


In [None]:
# For each sender, count how often each SOO word occurs (case-sensitive
# substring count) in all of their messages joined together.
occurances = []
for name, df in highly_soo_df.groupby("sender_name"):
    pool = " ".join(map(str, df.content))
    word_counts = {w: pool.count(w) for w in SOO_WORDS}
    occurances.append({"name": name, **word_counts})


In [None]:
# One row per sender, one column per SOO word.
sdf = pd.DataFrame(occurances).set_index("name")
sdf.head()


In [None]:
# Divide each row by its own total so every sender's word shares sum to 1.
row_totals = sdf.sum(axis=1)
sdf_normed = sdf.div(row_totals, axis=0)
sdf_normed.plot.barh(edgecolor="none")
plt.title("Occurances of these words (normalised per person)")


In [None]:
# Raw counts for everyone except "Ben Doherty".
everyone_else = sdf.loc[sdf.index != "Ben Doherty"]
everyone_else.plot.barh(edgecolor="none")
plt.title("Occurances of these words (not normalised per person)")


In [None]:
# People to compare. "More People" is a placeholder - replace or extend it.
p = [
    "Ben Doherty",
    "Ivana Kuzmanovska",
    "More People",
]
# .loc raises KeyError for a label that is not in the index, so plot only the
# people who are actually in the data and report the rest.
present = [person for person in p if person in sdf_normed.index]
missing = [person for person in p if person not in sdf_normed.index]
if missing:
    print(f"Not in the dataset, skipped: {missing}")
if present:
    sdf_normed.loc[present].plot.barh(edgecolor="none", width=0.7)
    plt.title(f"Occurances of these words in messages from \n{present} (normalised)")


In [None]:
# Total occurrences of each SOO word across all the selected senders.
word_totals = sdf.sum(axis=0)
word_totals.plot.barh()
plt.title("General occurance of these words")
