In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from datetime import datetime
import pandas as pd

from echolalia.parser import WhatsAppParser
# from echolalia.contextualizer import Contextualizer

In [3]:
parser = WhatsAppParser()

S3_BUCKET_NAME = "smcphers-echolalia"
CHAT_LOG_FILENAME = "data/_chat.txt"

# Parse the chat log into dicts and then into a DataFrame
messages = parser.parse_chat_log(bucket=S3_BUCKET_NAME, chat_log_filename=CHAT_LOG_FILENAME)

In [4]:
messages.iloc[:50]

Unnamed: 0,timestamp,user,message,exception,chatline
0,2022-01-17 23:15:56,Cat,,No content to message,"[1/17/22, 11:15:56 PM] Cat: ‎Messages and call..."
1,2022-01-17 23:15:56,Selwyn-Lloyd McPherson,Hello?!,,"[1/17/22, 11:15:56 PM] Selwyn-Lloyd McPherson:..."
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI"
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ..."
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:..."
5,2022-01-17 23:19:47,Selwyn-Lloyd McPherson,,No content to message,"‎[1/17/22, 11:19:47 PM] Selwyn-Lloyd McPherson..."
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:..."
7,2022-01-17 23:20:17,Cat,yes!,,"[1/17/22, 11:20:17 PM] Cat: yes!"
8,2022-01-17 23:20:40,Cat,can you see this?,,"[1/17/22, 11:20:40 PM] Cat: https://photobucke..."
9,2022-01-17 23:21:12,Selwyn-Lloyd McPherson,Oh!,,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:..."


In [7]:
# Begin to contextualize

# Filter out messages with exceptions
messages = messages[~messages['exception'].apply(lambda x: isinstance(x, str))]

# Sort by timestamp
messages = messages.sort_values(by='timestamp')

# Add time between last messages
messages['time_diff'] = messages['timestamp'].diff()

In [8]:
messages

Unnamed: 0,timestamp,user,message,exception,chatline,time_diff
1,2022-01-17 23:15:56,Selwyn-Lloyd McPherson,Hello?!,,"[1/17/22, 11:15:56 PM] Selwyn-Lloyd McPherson:...",NaT
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI",0 days 00:00:30
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ...",0 days 00:00:58
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:00
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32
...,...,...,...,...,...,...
58163,2024-08-08 13:13:14,Cat,My turn in the er,,"[8/8/24, 1:13:14 PM] Cat: My turn in the er",2 days 00:28:31
58164,2024-08-08 16:41:41,Cat,Ya girls got cirrhosis,,"[8/8/24, 4:41:41 PM] Cat: Ya girls got cirrhosis",0 days 03:28:27
58165,2024-08-08 16:41:49,Cat,Among other things,,"[8/8/24, 4:41:49 PM] Cat: Among other things",0 days 00:00:08
58166,2024-08-11 19:35:39,Cat,Might have a year to live up l,,"[8/11/24, 7:35:39 PM] Cat: Might have a year t...",3 days 02:53:50


In [9]:
# Create a "group" whenever the user changes
messages['group'] = (messages['user'] != messages['user'].shift()).cumsum()
messages['num_messages'] = None

In [10]:
messages

Unnamed: 0,timestamp,user,message,exception,chatline,time_diff,group,num_messages
1,2022-01-17 23:15:56,Selwyn-Lloyd McPherson,Hello?!,,"[1/17/22, 11:15:56 PM] Selwyn-Lloyd McPherson:...",NaT,1,
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI",0 days 00:00:30,2,
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ...",0 days 00:00:58,2,
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:00,3,
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32,3,
...,...,...,...,...,...,...,...,...
58163,2024-08-08 13:13:14,Cat,My turn in the er,,"[8/8/24, 1:13:14 PM] Cat: My turn in the er",2 days 00:28:31,27800,
58164,2024-08-08 16:41:41,Cat,Ya girls got cirrhosis,,"[8/8/24, 4:41:41 PM] Cat: Ya girls got cirrhosis",0 days 03:28:27,27800,
58165,2024-08-08 16:41:49,Cat,Among other things,,"[8/8/24, 4:41:49 PM] Cat: Among other things",0 days 00:00:08,27800,
58166,2024-08-11 19:35:39,Cat,Might have a year to live up l,,"[8/11/24, 7:35:39 PM] Cat: Might have a year t...",3 days 02:53:50,27800,


In [11]:
# Remove the first because I started the conversation (TODO: this is a hack and a half)
messages = messages.iloc[1:]

In [10]:
messages

Unnamed: 0,timestamp,user,message,exception,chatline,time_diff,group,num_messages
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI",0 days 00:00:30,2,
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ...",0 days 00:00:58,2,
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:00,3,
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32,3,
7,2022-01-17 23:20:17,Cat,yes!,,"[1/17/22, 11:20:17 PM] Cat: yes!",0 days 00:00:21,4,
...,...,...,...,...,...,...,...,...
58163,2024-08-08 13:13:14,Cat,My turn in the er,,"[8/8/24, 1:13:14 PM] Cat: My turn in the er",2 days 00:28:31,27798,
58164,2024-08-08 16:41:41,Cat,Ya girls got cirrhosis,,"[8/8/24, 4:41:41 PM] Cat: Ya girls got cirrhosis",0 days 03:28:27,27798,
58165,2024-08-08 16:41:49,Cat,Among other things,,"[8/8/24, 4:41:49 PM] Cat: Among other things",0 days 00:00:08,27798,
58166,2024-08-11 19:35:39,Cat,Might have a year to live up l,,"[8/11/24, 7:35:39 PM] Cat: Might have a year t...",3 days 02:53:50,27798,


In [11]:
# Group by this new "group" column and concatenate the values in "messages"
messages_combined = messages.groupby('group', as_index=False).agg({
    'user': 'first',                 # Take the first value of 'user' for each group
    'timestamp': list,               # Take the first value of 'timestamp' for each group
    'message': ' '.join,             # Concatenate the values of 'message'
    'num_messages': 'size',          # Count the number of messages
    'chatline': '. '.join            # Concatenate the values of 'chatline'
})

In [12]:
messages_combined

Unnamed: 0,group,user,timestamp,message,num_messages,chatline
0,2,Cat,"[2022-01-17 23:16:26, 2022-01-17 23:17:24]",HI oh good i do have it on my computer already,2,"[1/17/22, 11:16:26 PM] Cat: HI. [1/17/22, 11:1..."
1,3,Selwyn-Lloyd McPherson,"[2022-01-17 23:17:24, 2022-01-17 23:19:56]",Success! Ooh pictures work!,2,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:..."
2,4,Cat,"[2022-01-17 23:20:17, 2022-01-17 23:20:40]",yes! can you see this?,2,"[1/17/22, 11:20:17 PM] Cat: yes!. [1/17/22, 11..."
3,5,Selwyn-Lloyd McPherson,"[2022-01-17 23:21:12, 2022-01-17 23:21:18]",Oh! Haha the image that comes up is so random,2,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:..."
4,6,Cat,[2022-01-17 23:21:37],i know lol a friend of mine made me a header f...,1,"[1/17/22, 11:21:37 PM] Cat: i know lol a frien..."
...,...,...,...,...,...,...
27793,27795,Selwyn-Lloyd McPherson,"[2024-08-01 03:51:47, 2024-08-01 03:52:26]",Count that shit up Minus a few for laundry,2,"[8/1/24, 3:51:47 AM] Selwyn-Lloyd McPherson: C..."
27794,27796,Cat,"[2024-08-01 03:57:17, 2024-08-01 04:19:50]",About a dollar of Uzbek money Yeah lol,2,"[8/1/24, 3:57:17 AM] Cat: About a dollar of Uz..."
27795,27797,Selwyn-Lloyd McPherson,"[2024-08-01 04:33:09, 2024-08-01 04:33:18, 202...",I just called al Italia airlines to see what k...,12,"[8/1/24, 4:33:09 AM] Selwyn-Lloyd McPherson: I..."
27796,27798,Cat,"[2024-08-01 16:30:04, 2024-08-04 17:06:35, 202...",Omg I randomly have the worst cough now jfc ha...,7,"[8/1/24, 4:30:04 PM] Cat: Omg I randomly have ..."


### EVERYTHING LOOKS GREAAT! The timestamps are first, which is fine, and there's no time diff, which is fine. It would be nice to know the time diff

In [19]:
# Let's try to take a median of the time_diff
messages_combined['time_diff'] = messages_combined['timestamp'].apply(lambda x: pd.Series(x).diff().median())

In [20]:
messages_combined

Unnamed: 0,group,user,timestamp,message,num_messages,chatline,time_diff
0,2,Cat,"[2022-01-17 23:16:26, 2022-01-17 23:17:24]",HI oh good i do have it on my computer already,2,"[1/17/22, 11:16:26 PM] Cat: HI. [1/17/22, 11:1...",2022-01-17 23:16:55.000
1,3,Selwyn-Lloyd McPherson,"[2022-01-17 23:17:24, 2022-01-17 23:19:56]",Success! Ooh pictures work!,2,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",2022-01-17 23:18:40.000
2,4,Cat,"[2022-01-17 23:20:17, 2022-01-17 23:20:40]",yes! can you see this?,2,"[1/17/22, 11:20:17 PM] Cat: yes!. [1/17/22, 11...",2022-01-17 23:20:28.500
3,5,Selwyn-Lloyd McPherson,"[2022-01-17 23:21:12, 2022-01-17 23:21:18]",Oh! Haha the image that comes up is so random,2,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:...",2022-01-17 23:21:15.000
4,6,Cat,[2022-01-17 23:21:37],i know lol a friend of mine made me a header f...,1,"[1/17/22, 11:21:37 PM] Cat: i know lol a frien...",2022-01-17 23:21:37.000
...,...,...,...,...,...,...,...
27793,27795,Selwyn-Lloyd McPherson,"[2024-08-01 03:51:47, 2024-08-01 03:52:26]",Count that shit up Minus a few for laundry,2,"[8/1/24, 3:51:47 AM] Selwyn-Lloyd McPherson: C...",2024-08-01 03:52:06.500
27794,27796,Cat,"[2024-08-01 03:57:17, 2024-08-01 04:19:50]",About a dollar of Uzbek money Yeah lol,2,"[8/1/24, 3:57:17 AM] Cat: About a dollar of Uz...",2024-08-01 04:08:33.500
27795,27797,Selwyn-Lloyd McPherson,"[2024-08-01 04:33:09, 2024-08-01 04:33:18, 202...",I just called al Italia airlines to see what k...,12,"[8/1/24, 4:33:09 AM] Selwyn-Lloyd McPherson: I...",2024-08-01 04:46:30.500
27796,27798,Cat,"[2024-08-01 16:30:04, 2024-08-04 17:06:35, 202...",Omg I randomly have the worst cough now jfc ha...,7,"[8/1/24, 4:30:04 PM] Cat: Omg I randomly have ...",2024-08-08 13:13:14.000


# Why does this sort not work?

Why is there an NaT in the timediff? Yes because there's only one


In [16]:
messages_combined.sort_values(by=['time_diff'])

Unnamed: 0,group,user,timestamp,message,num_messages,chatline,time_diff
927,929,Selwyn-Lloyd McPherson,"[2022-02-18 20:17:52, 2022-02-18 20:17:52]",onomatopoeia? I loved it. You speak to me in s...,2,onomatopoeia? I loved it. You speak to me in s...,0 days
21333,21335,Selwyn-Lloyd McPherson,"[2024-03-28 01:06:04, 2024-03-28 01:06:04, 202...",Things seem a bit blanker now. I’m staying in ...,3,Things seem a bit blanker now. I’m staying in ...,0 days
25963,25965,Selwyn-Lloyd McPherson,"[2024-06-20 05:55:01, 2024-06-20 05:55:01, 202...",Things seem to come in cycles. I’m sorry you’r...,3,"[6/20/24, 5:55:01 AM] Selwyn-Lloyd McPherson: ...",0 days
25901,25903,Selwyn-Lloyd McPherson,"[2024-06-17 15:58:41, 2024-06-17 15:58:41, 202...",FUCKING GREAT ‎<This message was edited> How's...,14,FUCKING GREAT ‎<This message was edited>. How'...,0 days
22152,22154,Cat,"[2024-04-15 19:55:30, 2024-04-15 20:09:37, 202...",there has been and i hope there will be more p...,18,"[4/15/24, 7:55:30 PM] Cat: there has been and ...",0 days
...,...,...,...,...,...,...,...
27787,27789,Selwyn-Lloyd McPherson,[2024-08-01 03:44:03],"I wouldn’t know, I haven’t been touched since....",1,"[8/1/24, 3:44:03 AM] Selwyn-Lloyd McPherson: I...",NaT
27788,27790,Cat,[2024-08-01 03:44:17],Me either,1,"[8/1/24, 3:44:17 AM] Cat: Me either",NaT
27790,27792,Cat,[2024-08-01 03:44:46],No I don’t,1,"[8/1/24, 3:44:46 AM] Cat: No I don’t",NaT
27792,27794,Cat,[2024-08-01 03:51:39],I have a few quarters in my desk,1,"[8/1/24, 3:51:39 AM] Cat: I have a few quarter...",NaT


# ============
# Dunno what this is down there

In [204]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

# Sample DataFrame with datetime column
data = {'datetime': [datetime.now(), 
                     datetime.now() + timedelta(hours=3), 
                     datetime.now() + timedelta(hours=7),
                     datetime.now() + timedelta(hours=11)]}

df = pd.DataFrame(data)

# Group by dummy column (since we don't have one in this example)
# You can remove this in your actual dataframe or use a real group-by column
df['dummy_group'] = 1

In [205]:
df

Unnamed: 0,datetime,dummy_group
0,2024-09-17 06:54:20.088808,1
1,2024-09-17 09:54:20.088814,1
2,2024-09-17 13:54:20.088819,1
3,2024-09-17 17:54:20.088820,1


# This one has a different syntax

In [111]:
result = df.groupby('dummy_group').agg(
    datetime = ('datetime', 'first'),
    median_time_diff = ('datetime', lambda x: np.median([y.total_seconds() for y in (x - x.shift()).dropna()])),
    median_time_diff_2 = ('datetime', lambda x: np.median([y.total_seconds() for y in (x - x.shift()).dropna()]))
)

print(result)

                              datetime  median_time_diff  median_time_diff_2
dummy_group                                                                 
1           2024-09-17 03:54:12.162632      14400.000002        14400.000002


# Can I apply it? (this is the last one failing)

In [178]:
# Group by this new "group" column and concatenate the values in "messages"
messages_combined_2 = messages.groupby('group', as_index=False).agg(
    user = ('user', 'first'),                 # Take the first value of 'user' for each group
    timestamp = ('timestamp', 'first'),            # Take the first value of 'timestamp' for each group
    message = ('message', ' '.join),           # Concatenate the values of 'message'
    num_messages = ('num_messages', 'size'),          # Count the number of messages
    time_diff = ('time_diff', median_diff_lambda), # Calculate the median time difference between messages
    chatline = ('chatline', '. '.join)            # Concatenate the values of 'chatline'
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [179]:
messages_combined_2

Unnamed: 0,group,user,timestamp,message,num_messages,time_diff,chatline
0,2,Cat,2022-01-17 23:16:26,HI oh good i do have it on my computer already,2,0 days,"[1/17/22, 11:16:26 PM] Cat: HI. [1/17/22, 11:1..."
1,3,Selwyn-Lloyd McPherson,2022-01-17 23:17:24,Success! Ooh pictures work!,2,0 days,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:..."
2,4,Cat,2022-01-17 23:20:17,yes! can you see this?,2,0 days,"[1/17/22, 11:20:17 PM] Cat: yes!. [1/17/22, 11..."
3,5,Selwyn-Lloyd McPherson,2022-01-17 23:21:12,Oh! Haha the image that comes up is so random,2,0 days,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:..."
4,6,Cat,2022-01-17 23:21:37,i know lol a friend of mine made me a header f...,1,0 days,"[1/17/22, 11:21:37 PM] Cat: i know lol a frien..."
...,...,...,...,...,...,...,...
27793,27795,Selwyn-Lloyd McPherson,2024-08-01 03:51:47,Count that shit up Minus a few for laundry,2,0 days,"[8/1/24, 3:51:47 AM] Selwyn-Lloyd McPherson: C..."
27794,27796,Cat,2024-08-01 03:57:17,About a dollar of Uzbek money Yeah lol,2,0 days,"[8/1/24, 3:57:17 AM] Cat: About a dollar of Uz..."
27795,27797,Selwyn-Lloyd McPherson,2024-08-01 04:33:09,I just called al Italia airlines to see what k...,12,0 days,"[8/1/24, 4:33:09 AM] Selwyn-Lloyd McPherson: I..."
27796,27798,Cat,2024-08-01 16:30:04,Omg I randomly have the worst cough now jfc ha...,7,0 days,"[8/1/24, 4:30:04 PM] Cat: Omg I randomly have ..."
