In [14]:
import os
import pandas as pd
import datetime
import re

DELAY_THRESHOLD = 15  # minutes in delay threshold

# Regex pattern for message start and standard message identification
regex_pattern = r'^\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}\s[ap]m -'
standard_msg_regex = re.compile(r'^\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}\s[ap]m - .*:')

# Function to read chat files and extract standard messages
def read_chat_file(file_path):
    messages = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if standard_msg_regex.match(line):
                messages.append(line.strip())
    return messages

# Helper function to extract timestamp and sender from a message
def extract_timestamp_and_sender(message):
    timestamp_str, sender_and_message = message.split(' - ', 1)
    sender = sender_and_message.split(':', 1)[0]
    timestamp = pd.to_datetime(timestamp_str, format='%d/%m/%y, %I:%M %p', errors='coerce')

    # Exclude lines where sender is a phone number or 'Unknown'
    if sender.strip().startswith('+') or 'added' in sender:
        sender = "Unknown"
    return timestamp, sender

# Function to aggregate messages by date
def aggregate_messages_by_date(root_directory):
    aggregated_messages = {}
    for date_folder in os.listdir(root_directory):
        date_path = os.path.join(root_directory, date_folder)
        messages_for_date = []
        for team_folder in os.listdir(date_path):
            team_path = os.path.join(date_path, team_folder)
            for person_folder in os.listdir(team_path):
                person_path = os.path.join(team_path, person_folder)
                for file in os.listdir(person_path):
                    if file.endswith('.txt'):
                        chat_file_path = os.path.join(person_path, file)
                        messages = read_chat_file(chat_file_path)
                        messages_for_date.extend(messages)
        aggregated_messages[date_folder] = messages_for_date
    return aggregated_messages

# Function to check for delay and calculate delay time
def check_for_delay_and_calculate_time(messages):
    delays = []
    for i in range(len(messages) - 8):  # Iterate with enough messages left for last 7
        current_msg = messages[i]
        next_msg = messages[i + 1]
        current_timestamp, current_sender = extract_timestamp_and_sender(current_msg)
        next_timestamp, next_sender = extract_timestamp_and_sender(next_msg)

        if current_sender == next_sender and current_sender != "Unknown":  # Check delay for the same known sender
            time_diff = (next_timestamp - current_timestamp).total_seconds() / 60
            if time_diff > DELAY_THRESHOLD:
                delay_info = (i, time_diff)  # Store index and time difference
                delays.append(delay_info)
    
    return delays

# Helper function to extract the group name from the file name
def extract_group_name(file_name):
    group_name = file_name.replace('WhatsApp Chat with ', '').replace('.txt', '')
    return group_name

# Main analysis process
def main_analysis(root_directory):
    analysis_table = pd.DataFrame(columns=['Date', 'Chat Group Name', 'Person', 'Delay Detected', 'Delay Time', 'Last 7 Messages'])
    aggregated_messages = aggregate_messages_by_date(root_directory)
    
    for date, messages in aggregated_messages.items():
        delays = check_for_delay_and_calculate_time(messages)
        for delay_index, delay_time in delays:
            last_7_messages = ' | '.join(messages[delay_index - 6:delay_index + 1])
            sender = extract_timestamp_and_sender(messages[delay_index])[1]
            
            if sender != "Unknown":  # Skip entries where the sender is 'Unknown'
                file_name = messages[delay_index].split(' - ')[-1]
                group_name = extract_group_name(file_name)
                analysis_table = analysis_table.append({
                    'Date': date,
                    'Chat Group Name': group_name,
                    'Person': sender,
                    'Delay Detected': True,
                    'Delay Time': delay_time,
                    'Last 7 Messages': last_7_messages
                }, ignore_index=True)

    return analysis_table

# Run the analysis
root_directory = 'C:\\Users\\maurice\\Documents\\Chat-Analyzer-V2\\Test\\filtered_chats'  # Replace with the actual path
delay_analysis_table = main_analysis(root_directory)
print(delay_analysis_table)


            Date                                    Chat Group Name   Person  \
0     2023-11-28                Arshita: Source: Nirnoy EWYL23E0617  Arshita   
1     2023-11-28                      Arshita: *Points to remember*  Arshita   
2     2023-11-28                                Arshita: Greetings!  Arshita   
3     2023-11-28                           Arshita: <Media omitted>  Arshita   
4     2023-11-28                             Arshita: Good Morning,  Arshita   
...          ...                                                ...      ...   
5825  2023-11-28                            Tushti: <Media omitted>   Tushti   
5826  2023-11-28  Tushti: Are the parents ready to join the sess...   Tushti   
5827  2023-11-28  Tushti: *As we celebrate the festival of Diwal...   Tushti   
5828  2023-11-28  Tushti: Hello , Good afternoon, Hope you are d...   Tushti   
5829  2023-11-28                            Tushti: <Media omitted>   Tushti   

     Delay Detected  Delay Time  \
0   

In [16]:
delay_analysis_table.to_csv("tab3_v2.csv",index=False)

In [None]:
root_directory = 'C:\\Users\\maurice\\Documents\\Chat-Analyzer-V2\\Test\\filtered_chats'  # Replace with the actual path


In [18]:
import os
import pandas as pd
import datetime
import re

DELAY_THRESHOLD = 15  # minutes in delay threshold
standard_msg_regex = re.compile(r'^\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}\s[ap]m - .*:')

# Function to read chat files and extract standard messages
def read_chat_file(file_path):
    messages = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if standard_msg_regex.match(line):
                messages.append(line.strip())
    return messages

# Helper function to extract timestamp and sender from a message
def extract_timestamp_and_sender(message):
    timestamp_str, sender_and_message = message.split(' - ', 1)
    sender = sender_and_message.split(':', 1)[0]
    timestamp = pd.to_datetime(timestamp_str, format='%d/%m/%y, %I:%M %p', errors='coerce')

    # Exclude lines where sender is a phone number or 'Unknown'
    if sender.strip().startswith('+') or 'added' in sender:
        sender = "Unknown"
    return timestamp, sender

def aggregate_messages_by_date(root_directory):
    aggregated_messages = {}
    for date_folder in os.listdir(root_directory):
        date_path = os.path.join(root_directory, date_folder)
        messages_for_date = {}
        for team_folder in os.listdir(date_path):
            team_path = os.path.join(date_path, team_folder)
            for person_folder in os.listdir(team_path):
                person_path = os.path.join(team_path, person_folder)
                for file in os.listdir(person_path):
                    if file.endswith('.txt'):
                        chat_file_path = os.path.join(person_path, file)
                        messages = read_chat_file(chat_file_path)
                        messages_for_date[file] = messages  # Store messages with filename as key
        aggregated_messages[date_folder] = messages_for_date
    return aggregated_messages


def extract_group_name(file_name):
    group_name = file_name.split('WhatsApp Chat with ')[-1].split('.txt')[0]
    return group_name

def check_for_delay_and_calculate_time(messages):
    delays = []
    for i in range(7, len(messages)):  # Start from the 8th message
        current_msg = messages[i]
        current_timestamp, current_sender = extract_timestamp_and_sender(current_msg)

        # Check for delays in the preceding 7 messages
        for j in range(1, 8):
            prev_msg = messages[i - j]
            prev_timestamp, prev_sender = extract_timestamp_and_sender(prev_msg)

            if current_sender == prev_sender and current_sender != "Unknown":
                time_diff = (current_timestamp - prev_timestamp).total_seconds() / 60
                if time_diff > DELAY_THRESHOLD:
                    delay_info = (i, time_diff)  # Store index and time difference
                    delays.append(delay_info)
                    break  # Break to avoid multiple entries for the same delay

    return delays

def main_analysis(root_directory):
    analysis_table = pd.DataFrame(columns=['Date', 'Chat Group Name', 'Person', 'Delay Detected', 'Delay Time', 'Last 7 Messages'])
    aggregated_messages = aggregate_messages_by_date(root_directory)
    
    for date, messages in aggregated_messages.items():
        for file_name, file_messages in messages.items():
            group_name = extract_group_name(file_name)
            delays = check_for_delay_and_calculate_time(file_messages)
            for delay_index, delay_time in delays:
                last_7_messages = ' | '.join(file_messages[delay_index - 7:delay_index])
                sender = extract_timestamp_and_sender(file_messages[delay_index])[1]
                if sender != "Unknown":
                    analysis_table = analysis_table.append({
                        'Date': date,
                        'Chat Group Name': group_name,
                        'Person': sender,
                        'Delay Detected': True,
                        'Delay Time': delay_time,
                        'Last 7 Messages': last_7_messages
                    }, ignore_index=True)

    return analysis_table

# Run the analysis
root_directory = 'C:\\Users\\maurice\\Documents\\Chat-Analyzer-V2\\Test\\filtered_chats'  # Replace with the actual path
delay_analysis_table = main_analysis(root_directory)
print(delay_analysis_table)


AttributeError: 'list' object has no attribute 'items'