In [17]:
import os
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import re
import zipfile

from datetime import datetime

## Extract text data from zip file

In [2]:
# Unpack the chat export only when the text file is not already on disk,
# so re-running the cell reuses the previously extracted copy.
chat_already_extracted = os.path.isfile('../data/_chat.txt')
if not chat_already_extracted:
    archive = zipfile.ZipFile('../data/WhatsApp Chat - Entrenos M-J 13-15.zip')
    with archive:
        archive.extract('_chat.txt', path='../data')

# Load the export as a list of physical lines; each entry keeps its line
# terminator, exactly as produced by iterating over the file object.
chat_lines = []
with open('../data/_chat.txt', mode='r', encoding='utf8') as chat_file:
    chat_lines = list(chat_file)

In [3]:
# Header of a chat line: "[d/m/yy, HH:MM:SS] sender: text".
_MESSAGE_PATTERN = re.compile(
    r'\[(\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2}:\d{2})\] (.+?): (.+)'
)
# Invisible characters that can sit in front of the opening "[": a byte-order
# mark (if the export starts with one) and the left-to-right mark (U+200E).
_LEADING_MARKS = '\ufeff\u200e'


def parse_line(line):
    """Split one physical line of the chat export into its components.

    Parameters
    ----------
    line : str
        A raw line as read from the export, line terminator included.

    Returns
    -------
    tuple
        ``(timestamp, sender, message)`` when the line starts with a
        ``[d/m/yy, HH:MM:SS] sender: text`` header. ``timestamp`` is a
        ``datetime``; ``message`` has every U+200E removed and carries no
        trailing newline (``.`` in the pattern does not match ``\\n``).
        ``(None, None, line)`` with the line untouched when there is no
        header, i.e. the line continues the previous message.

    Raises
    ------
    ValueError
        If the header matches the pattern but its date/time is not a valid
        ``%d/%m/%y, %H:%M:%S`` value.
    """
    # The pattern is anchored at the start of the string, so invisible marks
    # in front of "[" must be dropped first; otherwise a genuine header line
    # would be mistaken for a continuation of the previous message.
    match = _MESSAGE_PATTERN.match(line.lstrip(_LEADING_MARKS))
    if match is None:
        return None, None, line

    timestamp_str, sender, text = match.groups()
    timestamp = datetime.strptime(timestamp_str, '%d/%m/%y, %H:%M:%S')
    return timestamp, sender, text.replace('\u200e', '')

## Parse text lines into messages

In [15]:
# Fold the physical lines of the export into logical messages: a line that
# parses as a header starts a new message, any other line is a continuation
# of the message before it.
timestamps = []
senders = []
messages = []
for line in chat_lines:
    timestamp, sender, message = parse_line(line)
    if timestamp is not None:
        timestamps.append(timestamp)
        senders.append(sender)
        messages.append(message)
    elif messages:
        # The header line is stored without its line terminator, so the line
        # break has to be re-inserted explicitly; otherwise the continuation
        # is glued straight onto the last word of the previous line. The
        # continuation's own terminator is dropped so that multi-line
        # messages end the same way single-line ones do.
        messages[-1] += '\n' + message.rstrip('\n')
    # else: text before the first header has no message to attach to; it is
    # skipped instead of failing with IndexError on an empty list.

df = pd.DataFrame({'timestamp': timestamps, 'sender': senders, 'message': messages})

## Decorate data to facilitate querying

In [26]:
from unidecode import unidecode
from math import ceil
def week_of_month(dt):
    """Return the 1-based week of the month that ``dt`` falls in.

    Weeks are counted as Monday-first calendar rows: the days before the
    month's first Monday belong to week 1, and the count goes up by one at
    every following Monday.
    """
    # Weekday of the 1st of the month (Monday == 0): how many cells of the
    # first calendar row are taken by the previous month.
    leading_offset = dt.replace(day=1).weekday()

    # Position of ``dt`` in the Monday-first grid, then integer ceiling
    # division by the 7 days of a week.
    grid_position = dt.day + leading_offset
    return (grid_position + 6) // 7

In [23]:
# One more than the number of '\n' characters found in each message.
df['newline_count'] = df['message'].str.count('\n') + 1

In [27]:
# Week-of-month for every message, computed row by row from its timestamp.
week_numbers = df['timestamp'].apply(week_of_month)
df['week_of_month'] = week_numbers

In [28]:
# Day-of-week index of every message, read from each timestamp's
# ``day_of_week`` attribute.
weekday_indices = df['timestamp'].apply(lambda ts: ts.day_of_week)
df['day_of_week'] = weekday_indices

## Save data frame to a parquet file

In [29]:
# Convert the decorated frame to an Arrow table and write it out as a
# Parquet file alongside the raw export.
table = pa.Table.from_pandas(df)
pq.write_table(
    table,
    where='../data/chat.parquet',
)