# Data Cleaning
Since I'm using chat and conversational data from, let's say, WhatsApp, need to remove all the metadata and have only the text in it.

In [24]:
import re
import pandas as pd

In [25]:
def read_chat(file_path: str) -> pd.DataFrame:
    encryption_message = "Messages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them."
    media_pattern = "<Media omitted>"
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    edited_message = "<This message was edited>"
    deleted_message = "You deleted this message"
    null_message = "null"
    created_group_message = "created group"
    added_you_to_group_message = "added you"
    tagging_pattern = r'@[\w]+'

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    filtered_lines = []
    for line in lines:
        if (
            encryption_message not in line and
            deleted_message not in line and
            null_message != line.split(" ")[-1] and
            media_pattern not in line and
            created_group_message not in line and
            added_you_to_group_message not in line and
            not re.search(email_pattern, line) and
            not re.search(url_pattern, line)
        ):
            line = line.replace(edited_message, "").strip()
            line = re.sub(tagging_pattern, "", line).strip()
            filtered_lines.append(line)
    
    content = '\n'.join(filtered_lines)
    content = content.replace('\u202f', ' ')
    content = re.sub(
        r'\[(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?::\d{2})?\s?[APap][Mm])\]',
        r'\1',
        content
    )
    content = content.replace('\u200E', '').replace('\u200F', '')

    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?::\d{2})?(?:\s?[APap][Mm])?)\s?(?:-|\~)?\s?(.*?): (.*?)(?=\n\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}|$)'
    messages = re.findall(pattern, content, re.DOTALL)
    df = pd.DataFrame(messages, columns=['timestamp', 'sender', 'message'])

    timestamps = []
    for timestamp in df['timestamp']:
        try:
            timestamp = pd.to_datetime(
                timestamp, format='mixed', errors='coerce')
        except Exception as e:
            print(f"Error parsing timestamp '{timestamp}': {e}")
            timestamp = pd.NaT
        timestamps.append(timestamp)

    df['timestamp'] = timestamps
    return df

In [26]:
from pathlib import Path
all_chats = {}
data_directory = Path("../data/private")

for file in data_directory.glob('*.txt'):
    file_name = file.stem
    all_chats[file_name] = read_chat(file)
    # print(all_chats[file_name].head(5))

In [27]:
encryption_message = "Messages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them."

text_sequence = ""

for file_name in all_chats.keys():
    df = all_chats[file_name]
    
    # Drop nulls first
    filtered = df['message'].dropna()

    # Exclude messages containing any of the unwanted patterns
    filtered = filtered[
        ~filtered.str.contains("document omitted", case=False) &
        ~filtered.str.contains("<Media omitted>", case=False) &
        (filtered != encryption_message)
    ]
    
    text_sequence += " ".join(filtered.values) + " "

text_sequence = re.sub(r'\s+', ' ', text_sequence).strip()
print(len(text_sequence))

3673021


In [28]:
with open("../output/combined_text.txt", "w", encoding="utf-8") as f:
    f.write(text_sequence)