In [9]:
import os
import pandas as pd
import datetime
import re

DELAY_THRESHOLD = 15  # minutes in delay threshold

# Regular expression to identify standard messages
standard_msg_regex = re.compile(r'^\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}\s[ap]m - .*:')

def read_chat_file(file_path):
    messages = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Check if the line matches the format of a standard message
            if standard_msg_regex.match(line):
                messages.append(line.strip())
    return messages

def extract_timestamp_and_sender(message):
    # Splitting the message to extract timestamp and sender
    timestamp_str, sender_and_message = message.split(' - ', 1)
    sender = sender_and_message.split(':', 1)[0]
    timestamp = pd.to_datetime(timestamp_str, format='%d/%m/%y, %I:%M %p', errors='coerce')

    # Exclude lines where sender is a phone number
    if sender.strip().startswith('+') or 'added' in sender:
        sender = "Unknown"
    return timestamp, sender



# Function to aggregate messages by date
def aggregate_messages_by_date(root_directory):
    aggregated_messages = {}
    for date_folder in os.listdir(root_directory):
        date_path = os.path.join(root_directory, date_folder)
        messages_for_date = []
        for team_folder in os.listdir(date_path):
            team_path = os.path.join(date_path, team_folder)
            for person_folder in os.listdir(team_path):
                person_path = os.path.join(team_path, person_folder)
                for file in os.listdir(person_path):
                    if file.endswith('.txt'):
                        chat_file_path = os.path.join(person_path, file)
                        messages = read_chat_file(chat_file_path)
                        messages_for_date.extend(messages)
        aggregated_messages[date_folder] = messages_for_date
    return aggregated_messages

# Function to check for delay and calculate delay time
def check_for_delay_and_calculate_time(messages):
    delays = []
    for i in range(len(messages) - 8):  # Iterate with enough messages left for last 7
        current_msg = messages[i]
        next_msg = messages[i + 1]
        current_timestamp, current_sender = extract_timestamp_and_sender(current_msg)
        next_timestamp, next_sender = extract_timestamp_and_sender(next_msg)

        if current_sender == next_sender:  # Check delay for the same sender
            time_diff = (next_timestamp - current_timestamp).total_seconds() / 60
            if time_diff > DELAY_THRESHOLD:
                delay_info = (i, time_diff)  # Store index and time difference
                delays.append(delay_info)
    
    return delays

def extract_group_name(file_name):
    # Assuming file_name is like "WhatsApp Chat with GroupName.txt"
    group_name = file_name.replace('WhatsApp Chat with ', '').replace('.txt', '')
    return group_name


def main_analysis(root_directory):
    analysis_table = pd.DataFrame(columns=['Date', 'Chat Group Name', 'Person', 'Delay Detected', 'Delay Time', 'Last 7 Messages'])
    aggregated_messages = aggregate_messages_by_date(root_directory)
    
    for date, messages in aggregated_messages.items():
        delays = check_for_delay_and_calculate_time(messages)
        for delay_index, delay_time in delays:
            sender = extract_timestamp_and_sender(messages[delay_index])[1]

            # Skip adding to dataframe if sender is 'Unknown'
            if sender == "Unknown":
                continue

            last_7_messages = ' | '.join(messages[delay_index - 6:delay_index + 1])
            group_name = extract_group_name(file_name)  # Make sure file_name is correctly provided

            analysis_table = analysis_table.append({
                'Date': date,
                'Chat Group Name': group_name,
                'Person': sender,
                'Delay Detected': True,
                'Delay Time': delay_time,
                'Last 7 Messages': last_7_messages
            }, ignore_index=True)

    return analysis_table




# Run the analysis
root_directory = 'C:\\Users\\maurice\\Documents\\Chat-Analyzer-V2\\Test\\filtered_chats'  # Replace with the actual path
delay_analysis_table = main_analysis(root_directory)
delay_analysis_table.head(5)



Unnamed: 0,Date,Chat Group Name,Person,Delay Detected,Delay Time,Last 7 Messages
0,2023-11-28,Arshita: Source: Nirnoy EWYL23E0617,Arshita,True,100.0,
1,2023-11-28,Arshita: *Points to remember*,Arshita,True,919.0,"27/11/23, 6:55 pm - Arshita: <Media omitted> |..."
2,2023-11-28,Arshita: Greetings!,Arshita,True,543.0,"27/11/23, 6:56 pm - Arshita: <Media omitted> |..."
3,2023-11-28,Arshita: <Media omitted>,Arshita,True,1012.0,"13/11/23, 6:30 pm - Arshita: Michelle EWYL23E0..."
4,2023-11-28,"Arshita: Good Morning,",Arshita,True,480.0,"13/11/23, 6:32 pm - Arshita: <Media omitted> |..."


In [10]:
delay_analysis_table.to_csv("tab3_v2.csv",index=False)