In [21]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (here the local echolalia package) are reloaded before each cell executes;
# edits to the package are then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import re
from datetime import datetime
import pandas as pd

from echolalia.parser import WhatsAppParser
# from echolalia.contextualizer import Contextualizer

In [23]:
# Location of the exported chat log (presumably an S3 bucket and object key,
# judging by the constant name -- confirm against WhatsAppParser)
S3_BUCKET_NAME = "smcphers-echolalia"
CHAT_LOG_FILENAME = "data/_chat.txt"

parser = WhatsAppParser()

# Parse the chat log; the result is shown below as a DataFrame with
# timestamp, user, message, exception and chatline columns
messages = parser.parse_chat_log(
    bucket=S3_BUCKET_NAME,
    chat_log_filename=CHAT_LOG_FILENAME,
)

In [24]:
# Preview the first 50 parsed rows
messages.head(50)

Unnamed: 0,timestamp,user,message,exception,chatline
0,2022-01-17 23:15:56,Cat,,No content to message,"[1/17/22, 11:15:56 PM] Cat: ‎Messages and call..."
1,2022-01-17 23:15:56,Selwyn-Lloyd McPherson,Hello?!,,"[1/17/22, 11:15:56 PM] Selwyn-Lloyd McPherson:..."
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI"
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ..."
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:..."
5,2022-01-17 23:19:47,Selwyn-Lloyd McPherson,,No content to message,"‎[1/17/22, 11:19:47 PM] Selwyn-Lloyd McPherson..."
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:..."
7,2022-01-17 23:20:17,Cat,yes!,,"[1/17/22, 11:20:17 PM] Cat: yes!"
8,2022-01-17 23:20:40,Cat,can you see this?,,"[1/17/22, 11:20:40 PM] Cat: https://photobucke..."
9,2022-01-17 23:21:12,Selwyn-Lloyd McPherson,Oh!,,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:..."


In [25]:
# Begin to contextualize

# Filter out messages with exceptions: a row is considered failed when its
# 'exception' cell holds a string (e.g. "No content to message"); anything
# else (None / NaN) means the row parsed cleanly.
has_exception = messages['exception'].apply(lambda x: isinstance(x, str))
messages = messages[~has_exception]

# Sort by timestamp. The sort must be stable: several messages share the same
# second-resolution timestamp, and the default quicksort gives no guarantee
# about the relative order of ties, which would scramble the chat order and
# therefore the turn grouping and time_diff computed from it.
messages = messages.sort_values(by='timestamp', kind='stable')

# Add time between last messages (NaT for the very first row, which has no
# predecessor)
messages['time_diff'] = messages['timestamp'].diff()

In [26]:
# Inspect the filtered, time-sorted frame; the first row has no predecessor,
# so its time_diff is NaT.
messages

Unnamed: 0,timestamp,user,message,exception,chatline,time_diff
1,2022-01-17 23:15:56,Selwyn-Lloyd McPherson,Hello?!,,"[1/17/22, 11:15:56 PM] Selwyn-Lloyd McPherson:...",NaT
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI",0 days 00:00:30
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ...",0 days 00:00:58
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:00
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32
...,...,...,...,...,...,...
58163,2024-08-08 13:13:14,Cat,My turn in the er,,"[8/8/24, 1:13:14 PM] Cat: My turn in the er",2 days 00:28:31
58164,2024-08-08 16:41:41,Cat,Ya girls got cirrhosis,,"[8/8/24, 4:41:41 PM] Cat: Ya girls got cirrhosis",0 days 03:28:27
58165,2024-08-08 16:41:49,Cat,Among other things,,"[8/8/24, 4:41:49 PM] Cat: Among other things",0 days 00:00:08
58166,2024-08-11 19:35:39,Cat,Might have a year to live up l,,"[8/11/24, 7:35:39 PM] Cat: Might have a year t...",3 days 02:53:50


In [27]:
# Number the conversational turns: a new "group" starts at every row whose
# sender differs from the sender of the row directly before it
previous_user = messages['user'].shift()
speaker_changed = messages['user'] != previous_user
messages['group'] = speaker_changed.cumsum()

# Empty placeholder column; the per-group aggregation later fills a column of
# this name with the number of messages in each turn
messages['num_messages'] = None

In [28]:
# Check the new columns: 'group' increases by one each time the sender
# changes, and 'num_messages' is still an empty placeholder.
messages

Unnamed: 0,timestamp,user,message,exception,chatline,time_diff,group,num_messages
1,2022-01-17 23:15:56,Selwyn-Lloyd McPherson,Hello?!,,"[1/17/22, 11:15:56 PM] Selwyn-Lloyd McPherson:...",NaT,1,
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI",0 days 00:00:30,2,
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ...",0 days 00:00:58,2,
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:00,3,
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32,3,
...,...,...,...,...,...,...,...,...
58163,2024-08-08 13:13:14,Cat,My turn in the er,,"[8/8/24, 1:13:14 PM] Cat: My turn in the er",2 days 00:28:31,27798,
58164,2024-08-08 16:41:41,Cat,Ya girls got cirrhosis,,"[8/8/24, 4:41:41 PM] Cat: Ya girls got cirrhosis",0 days 03:28:27,27798,
58165,2024-08-08 16:41:49,Cat,Among other things,,"[8/8/24, 4:41:49 PM] Cat: Among other things",0 days 00:00:08,27798,
58166,2024-08-11 19:35:39,Cat,Might have a year to live up l,,"[8/11/24, 7:35:39 PM] Cat: Might have a year t...",3 days 02:53:50,27798,


In [29]:
# Remove the opening turn because I started the conversation
# (TODO: this is a hack and a half)
# Filtering on the group id rather than slicing off one row keeps this cell
# idempotent: re-running it can no longer eat one more message each time.
# 'group' is a cumulative count that starts at 1, so group 1 is the opening
# turn. Note this drops the whole opening turn, not just its first message
# (identical for this chat, where the opening turn is a single message).
messages = messages[messages['group'] != 1]

In [30]:
# Confirm the opening row was dropped; in the saved output the frame now
# starts at index 2 (group 2).
messages

Unnamed: 0,timestamp,user,message,exception,chatline,time_diff,group,num_messages
2,2022-01-17 23:16:26,Cat,HI,,"[1/17/22, 11:16:26 PM] Cat: HI",0 days 00:00:30,2,
3,2022-01-17 23:17:24,Cat,oh good i do have it on my computer already,,"[1/17/22, 11:17:24 PM] Cat: oh good i do have ...",0 days 00:00:58,2,
4,2022-01-17 23:17:24,Selwyn-Lloyd McPherson,Success!,,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:00,3,
6,2022-01-17 23:19:56,Selwyn-Lloyd McPherson,Ooh pictures work!,,"[1/17/22, 11:19:56 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32,3,
7,2022-01-17 23:20:17,Cat,yes!,,"[1/17/22, 11:20:17 PM] Cat: yes!",0 days 00:00:21,4,
...,...,...,...,...,...,...,...,...
58163,2024-08-08 13:13:14,Cat,My turn in the er,,"[8/8/24, 1:13:14 PM] Cat: My turn in the er",2 days 00:28:31,27798,
58164,2024-08-08 16:41:41,Cat,Ya girls got cirrhosis,,"[8/8/24, 4:41:41 PM] Cat: Ya girls got cirrhosis",0 days 03:28:27,27798,
58165,2024-08-08 16:41:49,Cat,Among other things,,"[8/8/24, 4:41:49 PM] Cat: Among other things",0 days 00:00:08,27798,
58166,2024-08-11 19:35:39,Cat,Might have a year to live up l,,"[8/11/24, 7:35:39 PM] Cat: Might have a year t...",3 days 02:53:50,27798,


In [31]:
# Collapse every turn (one "group") into a single row.
# How each column is reduced within a turn:
turn_aggregations = {
    'user': 'first',           # the sender (constant within a turn)
    'timestamp': list,         # keep ALL timestamps of the turn as a list
    'message': ' '.join,       # message texts joined with a single space
    'num_messages': 'size',    # number of rows (messages) in the turn
    'chatline': '. '.join,     # raw chat lines joined with ". "
}

messages_combined = (
    messages
    .groupby('group', as_index=False)
    .agg(turn_aggregations)
)

In [32]:
# One row per turn: 'timestamp' is the list of the turn's message times and
# 'num_messages' is how many messages were merged into it.
messages_combined

Unnamed: 0,group,user,timestamp,message,num_messages,chatline
0,2,Cat,"[2022-01-17 23:16:26, 2022-01-17 23:17:24]",HI oh good i do have it on my computer already,2,"[1/17/22, 11:16:26 PM] Cat: HI. [1/17/22, 11:1..."
1,3,Selwyn-Lloyd McPherson,"[2022-01-17 23:17:24, 2022-01-17 23:19:56]",Success! Ooh pictures work!,2,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:..."
2,4,Cat,"[2022-01-17 23:20:17, 2022-01-17 23:20:40]",yes! can you see this?,2,"[1/17/22, 11:20:17 PM] Cat: yes!. [1/17/22, 11..."
3,5,Selwyn-Lloyd McPherson,"[2022-01-17 23:21:12, 2022-01-17 23:21:18]",Oh! Haha the image that comes up is so random,2,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:..."
4,6,Cat,[2022-01-17 23:21:37],i know lol a friend of mine made me a header f...,1,"[1/17/22, 11:21:37 PM] Cat: i know lol a frien..."
...,...,...,...,...,...,...
27793,27795,Selwyn-Lloyd McPherson,"[2024-08-01 03:51:47, 2024-08-01 03:52:26]",Count that shit up Minus a few for laundry,2,"[8/1/24, 3:51:47 AM] Selwyn-Lloyd McPherson: C..."
27794,27796,Cat,"[2024-08-01 03:57:17, 2024-08-01 04:19:50]",About a dollar of Uzbek money Yeah lol,2,"[8/1/24, 3:57:17 AM] Cat: About a dollar of Uz..."
27795,27797,Selwyn-Lloyd McPherson,"[2024-08-01 04:33:09, 2024-08-01 04:33:18, 202...",I just called al Italia airlines to see what k...,12,"[8/1/24, 4:33:09 AM] Selwyn-Lloyd McPherson: I..."
27796,27798,Cat,"[2024-08-01 16:30:04, 2024-08-04 17:06:35, 202...",Omg I randomly have the worst cough now jfc ha...,7,"[8/1/24, 4:30:04 PM] Cat: Omg I randomly have ..."


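### Everything looks great! Each turn keeps its message timestamps as a list (first message first), which is fine, and there is no `time_diff` column after aggregation, which is also fine. It would still be nice to know the time difference between the messages within a turn.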

## Median time difference between messages within a turn?

In [50]:
from echolalia._utils import median_diff

In [55]:
# Apply median_diff to each turn's timestamp list and store the result
# per turn (the saved output shows Timedelta-like values)
turn_timestamps = messages_combined['timestamp']
messages_combined['median_diff'] = turn_timestamps.apply(median_diff)

In [56]:
# NOTE(review): the saved output for this cell shows doubly nested timestamp
# lists and num_messages == 1 on every row, which the cells visible in this
# notebook would not produce, and the execution counts jump from 32 to 50-56.
# Presumably messages_combined was re-aggregated out of order -- restart the
# kernel and run all cells to confirm the result.
messages_combined

Unnamed: 0,group,user,timestamp,message,num_messages,chatline,median_diff
0,2,Cat,"[[2022-01-17 23:16:26, 2022-01-17 23:17:24]]",HI oh good i do have it on my computer already,1,"[1/17/22, 11:16:26 PM] Cat: HI. [1/17/22, 11:1...",0 days 00:00:58
1,3,Selwyn-Lloyd McPherson,"[[2022-01-17 23:17:24, 2022-01-17 23:19:56]]",Success! Ooh pictures work!,1,"[1/17/22, 11:17:24 PM] Selwyn-Lloyd McPherson:...",0 days 00:02:32
2,4,Cat,"[[2022-01-17 23:20:17, 2022-01-17 23:20:40]]",yes! can you see this?,1,"[1/17/22, 11:20:17 PM] Cat: yes!. [1/17/22, 11...",0 days 00:00:23
3,5,Selwyn-Lloyd McPherson,"[[2022-01-17 23:21:12, 2022-01-17 23:21:18]]",Oh! Haha the image that comes up is so random,1,"[1/17/22, 11:21:12 PM] Selwyn-Lloyd McPherson:...",0 days 00:00:06
4,6,Cat,[[2022-01-17 23:21:37]],i know lol a friend of mine made me a header f...,1,"[1/17/22, 11:21:37 PM] Cat: i know lol a frien...",0 days 00:00:00
...,...,...,...,...,...,...,...
27793,27795,Selwyn-Lloyd McPherson,"[[2024-08-01 03:51:47, 2024-08-01 03:52:26]]",Count that shit up Minus a few for laundry,1,"[8/1/24, 3:51:47 AM] Selwyn-Lloyd McPherson: C...",0 days 00:00:39
27794,27796,Cat,"[[2024-08-01 03:57:17, 2024-08-01 04:19:50]]",About a dollar of Uzbek money Yeah lol,1,"[8/1/24, 3:57:17 AM] Cat: About a dollar of Uz...",0 days 00:22:33
27795,27797,Selwyn-Lloyd McPherson,"[[2024-08-01 04:33:09, 2024-08-01 04:33:18, 20...",I just called al Italia airlines to see what k...,1,"[8/1/24, 4:33:09 AM] Selwyn-Lloyd McPherson: I...",0 days 00:01:30
27796,27798,Cat,"[[2024-08-01 16:30:04, 2024-08-04 17:06:35, 20...",Omg I randomly have the worst cough now jfc ha...,1,"[8/1/24, 4:30:04 PM] Cat: Omg I randomly have ...",1 days 22:03:19.500000
