In [None]:
import csv
import os
import re
import numpy as np

In [None]:
# Load contacts.csv into two lookups:
#   contacts        -- maps each row's 'number' column to its 'name' column
#   names_substring -- every space-separated word of every name, lower-cased,
#                      stripped, and suffixed with a single space
contacts = {}
names_substring = set()
with open('contacts.csv', newline='') as csvfile:
    for entry in csv.DictReader(csvfile):
        full_name = entry['name']
        contacts[entry['number']] = full_name
        names_substring.update(
            word.lower().strip() + " " for word in full_name.split(' ')
        )

print(contacts)
print(names_substring)

In [None]:
# Check whether a filename identifies exactly one known contact
def numbers_in_contacts(filename):
    """Return True only if `filename` holds exactly one run of digits and that
    run is a key of the module-level `contacts` mapping.

    Filenames with no digits, or with more than one digit run, yield False.
    """
    digit_runs = re.findall(r'\d+', filename)
    if len(digit_runs) != 1:
        return False
    return digit_runs[0] in contacts

In [None]:
def is_date_line(line):
    """Tell whether `line` begins, at column 0, with a date stamp.

    The expected shape is three letters, a space, two digits, a comma and
    space, then four digits (e.g. ``Jan 01, 2024``). Anything may follow.
    Indented lines do not match because the pattern is anchored at the start.
    """
    date_prefix = re.match(r'^[A-Za-z]{3} \d{2}, \d{4}.*', line)
    return date_prefix is not None

In [None]:
def clean_block(block):
    """Return a new list with the empty and whitespace-only entries of `block`
    removed; the surviving entries keep their order and their own whitespace."""
    kept = []
    for entry in block:
        if entry.strip() == '':
            continue
        kept.append(entry)
    return kept

def get_msg_blocks(lines):
    """Group the lines of a transcript into message blocks.

    A new block is started when a date line (per ``is_date_line``) arrives and
    the block being built already holds at least three lines; otherwise the
    line is appended to the current block. Every line is right-stripped, so
    leading indentation is preserved. Finished blocks are passed through
    ``clean_block`` before being stored.

    Args:
        lines: iterable of str, one entry per line of the file.

    Returns:
        list of blocks, each block being a list of str.
    """
    msg_blocks = []

    msg_block = []
    for line in lines:
        if is_date_line(line) and len(msg_block) >= 3:
            msg_blocks.append(clean_block(msg_block))
            msg_block = [line.rstrip()]
        else:
            msg_block.append(line.rstrip())

    # The loop only flushes a block when the *next* date line shows up, so the
    # final block of the input was previously dropped. Flush it here, but only
    # when it still has the three non-blank lines needed to form a message.
    trailing_block = clean_block(msg_block)
    if len(trailing_block) >= 3:
        msg_blocks.append(trailing_block)

    return msg_blocks

In [None]:
def ret_msg_content(sender_name, msg):
    """Build a message record whose content has its first character lower-cased.

    Args:
        sender_name: value stored under the 'sender_name' key.
        msg: message text; may be the empty string.

    Returns:
        dict with the keys 'sender_name' and 'content'.
    """
    # Slice instead of indexing: msg[0] raises IndexError on '', while
    # msg[:1] simply yields '' and leaves an empty message empty.
    msg = msg[:1].lower() + msg[1:]
    return {'sender_name' : sender_name, 'content' : msg}

# Substrings read from params/remove_substrs.txt, one per line. parse_msg
# compares them against lower-cased message text.
# NOTE(review): entries are used as written, so an entry containing upper-case
# letters can never match -- confirm the file is all lower-case.
remove_substring = []
with open('params/remove_substrs.txt', 'r') as substrs:
    for line in substrs:
        pattern = line.strip()
        # Skip blank lines: '' is a substring of every string, so a single
        # empty entry would make the filter match every message.
        if pattern:
            remove_substring.append(pattern)

# Message Dict
def parse_msg(block):
    """Turn one message block into a ``{'sender_name', 'content'}`` dict.

    Args:
        block: list of str. ``block[0]`` is not read here, ``block[1]`` is the
            sender, and ``block[2:]`` are the body lines (message text plus
            any annotation lines).

    Returns:
        A dict with the keys 'sender_name' and 'content', or None when the
        block has fewer than three lines, or when the sender contains 'Me' and
        any lower-cased body line contains an entry of ``remove_substring``.
    """
    # Check the length before touching block[1]; indexing first raised
    # IndexError on blocks with fewer than two lines.
    if len(block) < 3:
        return None

    sender_name = block[1]
    body_lines = block[2:]

    if 'Me' in sender_name and any(
        substr in line.lower() for substr in remove_substring for line in body_lines
    ):
        return None

    if len(block) == 3:
        return ret_msg_content(sender_name, block[2])

    # An 'Edited' line takes priority: the message is whatever follows the
    # first ':' on that line (the whole line when there is no ':').
    for line in body_lines:
        if 'Edited' in line:
            edited_text = line[line.find(':') + 1:].strip()
            return ret_msg_content(sender_name, edited_text)

    # Otherwise the message is everything before the first annotation line.
    # Body lines are concatenated with no separator between them.
    text_additions = ['Reactions', 'This message responded to an earlier message', 'Sent with']
    for offset, line in enumerate(body_lines):
        if any(mod in line for mod in text_additions):
            return ret_msg_content(sender_name, "".join(body_lines[:offset]))

    # NOTE(review): unlike every other path, this one does not go through
    # ret_msg_content, so the first character keeps its case -- confirm
    # whether that is intentional.
    return {'sender_name' : sender_name, 'content' : "".join(body_lines)}

# List of Messages
def parse_text_chain(chain):
    """Parse a block that holds several nested messages into a list of records.

    Every line has each run of four spaces removed and is then stripped. The
    resulting lines are regrouped into sub-blocks, a new one starting at each
    date line (per ``is_date_line``) once the current sub-block is non-empty.
    Sub-blocks are parsed in order with ``parse_msg``; parsing stops at the
    first sub-block for which it returns None.

    Args:
        chain: list of str, the lines of one block.

    Returns:
        list of the dicts produced by ``parse_msg`` up to, but not including,
        the first sub-block it rejected. Empty when `chain` is empty.
    """
    blocks = []
    block = []
    for line in chain:
        stripped_line = line.replace("    ", "").strip()
        if is_date_line(stripped_line) and len(block) > 0:
            blocks.append(clean_block(block))
            block = [stripped_line]
        else:
            block.append(stripped_line)

    # Clean the trailing sub-block like all the others, and do not hand an
    # empty one to parse_msg (that happened whenever `chain` was empty).
    final_block = clean_block(block)
    if final_block:
        blocks.append(final_block)

    messages = []
    for msg_block in blocks:
        # Parse once and reuse the result instead of parsing every block twice.
        parsed = parse_msg(msg_block)
        if parsed is None:
            return messages
        messages.append(parsed)

    return messages
        

In [None]:
# Function to parse a text file and extract messages

def parse_text_file(filepath):
    """Read a UTF-8 transcript and return the messages it contains.

    The file's lines are grouped with ``get_msg_blocks``. A block with any
    line starting with four spaces is handled by ``parse_text_chain``; every
    other block goes to ``parse_msg`` and is kept unless that returns None.

    Args:
        filepath: path of the text file to read.

    Returns:
        list of dicts with the keys 'sender_name' and 'content', in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    messages = []
    for block in get_msg_blocks(lines):
        if any(m.startswith('    ') for m in block):
            # extend() with an empty list is a no-op, so no length check needed.
            messages.extend(parse_text_chain(block))
        else:
            msg = parse_msg(block)
            if msg is not None:
                # Reuse the parsed record rather than parsing the block again.
                messages.append(msg)

    return messages

In [None]:
# Write one CSV of parsed messages for every eligible transcript in texts_txt.
# A file is eligible when it ends in '.txt' and numbers_in_contacts accepts it.
for filename in os.listdir('texts_txt'):
    messages = []
    eligible = filename.endswith('.txt') and numbers_in_contacts(filename)
    if not eligible:
        continue

    filepath = os.path.join('texts_txt', filename)
    messages += parse_text_file(filepath)

    with open(f'messages_{filename}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['sender_name', 'content'])
        writer.writeheader()
        writer.writerows(messages)
