In [None]:
import os
import pandas as pd
import datetime
import re
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to list all chat files in the directory structure
def list_chat_files(date_directory):
    chat_files = []
    for date_folder in os.listdir(date_directory):
        date_path = os.path.join(date_directory, date_folder)
        if os.path.isdir(date_path):
            for team_folder in os.listdir(date_path):
                team_path = os.path.join(date_path, team_folder)
                if os.path.isdir(team_path):
                    for person_folder in os.listdir(team_path):
                        person_path = os.path.join(team_path, person_folder)
                        if os.path.isdir(person_path):
                            for file in os.listdir(person_path):
                                if file.endswith('.txt'):
                                    chat_files.append(os.path.join(person_path, file))
    return chat_files

def is_expected_group_format(group_name, team_name):
    if team_name.lower() == 'sales':
        pattern = re.compile(r'.+?_(EDOOFA|edoofa|Edoofa|EA)|\(\d{2}_\d{2}\)')
        return bool(pattern.search(group_name))
    else:
        parts = group_name.split()
        return len(parts) >= 2 and not (re.search(r'[^A-Za-z0-9]', parts[0]) or re.search(r'[^A-Za-z0-9]', parts[1]))

def parse_chat_group_name(file_path):
    team_name = os.path.basename(os.path.dirname(os.path.dirname(file_path)))
    file_name = os.path.basename(file_path)
    
    # Remove the "WhatsApp Chat with " prefix and the file extension
    group_name_with_extension = file_name.replace("WhatsApp Chat with ", "")
    group_name = os.path.splitext(group_name_with_extension)[0]
    
    # Check for duplicates like (1), (2) etc., and remove them
    group_name = re.sub(r'\(\d+\)$', '', group_name)

    # Check if the group name is in the expected format
    expected_format = is_expected_group_format(group_name, team_name)
    return group_name, expected_format

def parse_chat_file_for_delay_analysis(file_path):
    person_name = os.path.basename(os.path.dirname(file_path))
    folder_date_str = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(file_path))))
    expected_date_minus_one = pd.to_datetime(folder_date_str, format='%Y-%m-%d').date() - datetime.timedelta(days=1)

    delay_messages = []

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    last_person_msg_time = None

    for i, line in enumerate(lines):
        message_match = re.match(r'(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m) - (.*?): (.*)', line)
        if message_match:
            date_time_str, sender, message = message_match.groups()
            date_time = pd.to_datetime(date_time_str, format='%d/%m/%y, %I:%M %p')

            if date_time.date() != expected_date_minus_one:
                continue

            is_person = sender is not None and not (sender.strip().isnumeric() or sender.startswith('+'))

            if is_person:
                if last_person_msg_time is not None:
                    time_diff_seconds = (date_time - last_person_msg_time).total_seconds()
                    time_diff_minutes = time_diff_seconds / 60
                    delay = time_diff_minutes > 15
                else:
                    delay = False
                last_person_msg_time = date_time
            else:
                delay = False

            if delay:
                delay_messages.append((date_time, sender, message))

    return delay_messages

def process_all_files(chat_files):
    all_delay_messages = []

    for file_path in chat_files:
        logging.info(f"Processing file: {file_path}")  # Debug line
        delay_messages = parse_chat_file_for_delay_analysis(file_path)

        for message in delay_messages:
            timestamp, sender, msg = message
            chat_group_name, expected_format = parse_chat_group_name(file_path)
            if expected_format:
                all_delay_messages.append((timestamp, sender, chat_group_name, True))

    return all_delay_messages

def create_delay_data_dataframe(all_delay_data):
    columns = ['Date', 'Person', 'Chat Group Name', 'Delay']
    df_data = []

    for delay_instance in all_delay_data:
        date_time, sender, chat_group_name, delay = delay_instance
        df_data.append([date_time, sender, chat_group_name, delay])

    delay_df = pd.DataFrame(df_data, columns=columns)
    return delay_df

# Main Execution
date_directory = "C:\\Users\\mauriceyeng\\Python\\Daily-Reports\\Chat Folder from Drive\\drive-download-20231201T052455Z-001"
chat_files = list_chat_files(date_directory)
all_delay_data = process_all_files(chat_files)
delay_analysis_df = create_delay_data_dataframe(all_delay_data)

# Display the first few rows of the DataFrame
print(delay_analysis_df.head())


In [None]:
delay_analysis_df.to_csv("delay analyzed.csv",index=False)

# Expected Format Checks

In [None]:
delay_analysis_df.to_csv("delay_1.csv",index=False)

In [None]:
import os
import pandas as pd
import datetime
import re
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Function to list all chat files in the directory structure
def list_chat_files(date_directory):
    chat_files = []
    for date_folder in os.listdir(date_directory):
        date_path = os.path.join(date_directory, date_folder)
        if os.path.isdir(date_path):
            for team_folder in os.listdir(date_path):
                team_path = os.path.join(date_path, team_folder)
                if os.path.isdir(team_path):
                    for person_folder in os.listdir(team_path):
                        person_path = os.path.join(team_path, person_folder)
                        if os.path.isdir(person_path):
                            for file in os.listdir(person_path):
                                if file.endswith('.txt'):
                                    chat_files.append(os.path.join(person_path, file))
    return chat_files

def is_expected_group_format(group_name, team_name):
    if team_name.lower() == 'sales':
        pattern = re.compile(r'.+?_(EDOOFA|edoofa|Edoofa|EA)|\(\d{2}_\d{2}\)')
        return bool(pattern.search(group_name))
    else:
        parts = group_name.split()
        return len(parts) >= 2 and not (re.search(r'[^A-Za-z0-9]', parts[0]) or re.search(r'[^A-Za-z0-9]', parts[1]))

def parse_chat_group_name(file_path):
    team_name = os.path.basename(os.path.dirname(os.path.dirname(file_path)))
    file_name = os.path.basename(file_path)
    
    # Remove the "WhatsApp Chat with " prefix and the file extension
    group_name_with_extension = file_name.replace("WhatsApp Chat with ", "")
    group_name = os.path.splitext(group_name_with_extension)[0]
    
    # Check for duplicates like (1), (2) etc., and remove them
    group_name = re.sub(r'\(\d+\)$', '', group_name)

    # Check if the group name is in the expected format
    expected_format = is_expected_group_format(group_name, team_name)
    return group_name, expected_format

def parse_chat_file_for_delay_analysis(file_path):
    person_name = os.path.basename(os.path.dirname(file_path))
    folder_date_str = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(file_path))))
    expected_date_minus_one = pd.to_datetime(folder_date_str, format='%Y-%m-%d').date() - datetime.timedelta(days=1)

    delay_messages = []

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    last_person_msg_time = None

    for i, line in enumerate(lines):
        message_match = re.match(r'(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m) - (.*?): (.*)', line)
        if message_match:
            date_time_str, sender, message = message_match.groups()
            date_time = pd.to_datetime(date_time_str, format='%d/%m/%y, %I:%M %p')

            if date_time.date() != expected_date_minus_one:
                continue

            is_person = sender is not None and not (sender.strip().isnumeric() or sender.startswith('+'))

            if is_person:
                if last_person_msg_time is not None:
                    time_diff_seconds = (date_time - last_person_msg_time).total_seconds()
                    time_diff_minutes = time_diff_seconds / 60
                    delay = time_diff_minutes > 15
                else:
                    delay = False
                last_person_msg_time = date_time
            else:
                delay = False

            if delay:
                delay_messages.append((date_time, sender, message))

    return delay_messages

def process_all_files(chat_files):
    all_delay_messages = []

    for file_path in chat_files:
        logging.info(f"Processing file: {file_path}")  # Debug line
        delay_messages = parse_chat_file_for_delay_analysis(file_path)

        for message in delay_messages:
            timestamp, sender, msg = message
            chat_group_name, expected_format = parse_chat_group_name(file_path)
            all_delay_messages.append((timestamp, sender, chat_group_name, expected_format, True))

    return all_delay_messages

def create_delay_data_dataframe(all_delay_data):
    columns = ['Date', 'Person', 'Chat Group Name', 'Expected Format', 'Delay']
    df_data = []

    for delay_instance in all_delay_data:
        date_time, sender, chat_group_name, expected_format, delay = delay_instance
        df_data.append([date_time, sender, chat_group_name, expected_format, delay])

    delay_df = pd.DataFrame(df_data, columns=columns)
    return delay_df

# Main Execution
date_directory = "C:\\Users\\maurice\\Documents\\Chat-Analyzer-V2\\Chat Folder from Drive\\drive-download-20231130T060206Z-001"
chat_files = list_chat_files(date_directory)
all_delay_data = process_all_files(chat_files)
delay_analysis_df = create_delay_data_dataframe(all_delay_data)

# Display the first few rows of the DataFrame
print(delay_analysis_df.head())


In [None]:
delay_analysis_df.to_csv("delay_2.csv",index=False)

# integrating with table 4

In [None]:
import os
import pandas as pd
import datetime
import re

def list_chat_files(date_directory):
    chat_files = []
    for date_folder in os.listdir(date_directory):
        date_path = os.path.join(date_directory, date_folder)
        if os.path.isdir(date_path):
            for team_folder in os.listdir(date_path):
                team_path = os.path.join(date_path, team_folder)
                if os.path.isdir(team_path):
                    for person_folder in os.listdir(team_path):
                        person_path = os.path.join(team_path, person_folder)
                        if os.path.isdir(person_path):
                            for file in os.listdir(person_path):
                                if file.endswith('.txt'):
                                    chat_files.append(os.path.join(person_path, file))
    return chat_files

def parse_chat_file(file_path):
    chat_data = []
    today_minus_one = datetime.datetime.now().date() - datetime.timedelta(days=1)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            message_match = re.match(r'(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m) - (.*?): (.*)', line)
            system_match = re.match(r'(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m) - (.*)', line)
            if message_match:
                date_time_str, sender, message = message_match.groups()
            elif system_match:
                date_time_str, info = system_match.groups()
                sender = None
            else:
                continue

            date_time = pd.to_datetime(date_time_str, format='%d/%m/%y, %I:%M %p')

            if date_time.date() != today_minus_one:
                continue

            is_person = sender is not None and re.match(r'^[+\d\s-]+$', sender) is None
            chat_data.append((date_time, sender, is_person))
    return chat_data

def create_template_dataframe():
    times = [datetime.datetime(2000, 1, 1, 0, 0) + datetime.timedelta(minutes=1 * i) for i in range(1440)]
    intervals = [time.strftime('%I:%M %p') for time in times]
    df = pd.DataFrame(index=intervals)
    df['Number of Active Chats'] = 0  # Initialize 'Number of Active Chats' column
    return df


def populate_dataframe(df, parsed_data, group_name, last_msg_info_dict):
    delay_threshold = 15  # Delay threshold in minutes
    group_name_str = str(group_name)  # Ensure group_name is a string

    print(f"Processing chat group: {group_name_str}")  # Debug line

    personal_msg_col_index = len(df.columns)
    system_msg_col_index = personal_msg_col_index + 1
    delay_col_index = personal_msg_col_index + 2

    # Add new columns for this chat
    df[group_name_str + " Personal"] = 0  # Column for personal messages
    df[group_name_str + " System"] = 0    # Column for system messages
    df[group_name_str + " Delay"] = False # Column for delay status

    for entry in parsed_data:
        date_time, sender, is_person = entry
        interval_index = min((date_time.hour * 60 + date_time.minute) // 1, 1439)
        interval = df.index[interval_index]  # Adjust if necessary for the DataFrame structure

        print(f"Processing message at {date_time} by {'person' if is_person else 'system'}")  # Debug line

        if is_person:
            df.at[interval, group_name_str + " Personal"] = 1
            print(f"Added personal message for {group_name_str} at {interval}")  # Debug line

            if sender in last_msg_info_dict:
                last_msg_time, was_delayed = last_msg_info_dict[sender]
                time_diff = (date_time - last_msg_time).total_seconds() / 60
                if time_diff > delay_threshold and not was_delayed:
                    df.at[interval, group_name_str + " Delay"] = True
                    print(f"Delay detected for {group_name_str} at {interval}")  # Debug line
                    was_delayed = True
                else:
                    was_delayed = False
            else:
                was_delayed = False
            last_msg_info_dict[sender] = (date_time, was_delayed)
        else:
            df.at[interval, group_name_str + " System"] = 1
            print(f"Added system message for {group_name_str} at {interval}")  # Debug line

    # Update 'Number of Active Chats' for each interval
    print("Updating 'Number of Active Chats' for each interval")  # Debug line
    active_chats_col = df.columns.get_loc('Number of Active Chats')
    for i in range(len(df)):
        active_chats = 0
        for j in range(2, len(df.columns), 3):  # Iterate over personal message columns
            if df.iat[i, j] == 1 or df.iat[i, j + 1] == 1:  # Check personal and system messages
                active_chats = 1
                break
        df.iat[i, active_chats_col] = active_chats
        if active_chats:
            print(f"Active chat detected at {df.index[i]}")  # Debug line

    return df




def process_person_chats(chat_files):
    dataframes = {}
    last_msg_info_dict = {}

    for file in chat_files:
        parts = file.split(os.sep)
        date_folder, person = parts[-4], parts[-2]

        try:
            expected_date = pd.to_datetime(date_folder).date()
        except ValueError:
            continue

        key = f"{expected_date.strftime('%Y-%m-%d')}_{person}"

        if key not in dataframes:
            dataframes[key] = create_template_dataframe()

        parsed_data = parse_chat_file(file)
        group_name = "some_group_name"  # Replace with actual logic to determine group name
        dataframes[key] = populate_dataframe(dataframes[key], parsed_data, group_name, last_msg_info_dict)
    
    return dataframes


# Main script execution
date_directory = "C:\\Users\\mauriceyeng\\Python\\Daily-Reports\\Chat Folder from Drive\\drive-download-20231201T052455Z-001"
chat_files = list_chat_files(date_directory)
person_dataframes = process_person_chats(chat_files)


In [None]:
for key, df in person_dataframes.items():
    print(f"Summary for {key}:")
    print(df.describe())  # Or any other analysis you want


In [None]:
for key, df in person_dataframes.items():
    df.to_csv(f"{key}.csv")  # This will save each dataframe as a CSV file
    print(f"{key} has been saved as csv")

In [None]:
for key, df in person_dataframes.items():
    print(f"Head of dataframe for {key}:")
    print(df.head(5))
    print("\n")  # This adds an extra line for better readability between dataframes


# FIXING BUGS AND FINAL INTEGRATION

In [2]:
import os
import pandas as pd
import datetime
import re

def list_chat_files(date_directory):
    chat_files = []
    for date_folder in os.listdir(date_directory):
        date_path = os.path.join(date_directory, date_folder)
        if os.path.isdir(date_path):
            for team_folder in os.listdir(date_path):
                team_path = os.path.join(date_path, team_folder)
                if os.path.isdir(team_path):
                    for person_folder in os.listdir(team_path):
                        person_path = os.path.join(team_path, person_folder)
                        if os.path.isdir(person_path):
                            for file in os.listdir(person_path):
                                if file.endswith('.txt'):
                                    chat_files.append(os.path.join(person_path, file))
    return chat_files

def parse_chat_file(file_path):
    chat_data = []
    today_minus_one = datetime.datetime.now().date() - datetime.timedelta(days=1)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            message_match = re.match(r'(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m) - (.*?): (.*)', line)
            system_match = re.match(r'(\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [ap]m) - (.*)', line)
            if message_match:
                date_time_str, sender, message = message_match.groups()
            elif system_match:
                date_time_str, info = system_match.groups()
                sender = None
            else:
                continue

            date_time = pd.to_datetime(date_time_str, format='%d/%m/%y, %I:%M %p')

            if date_time.date() != today_minus_one:
                continue

            is_person = sender is not None and re.match(r'^[+\d\s-]+$', sender) is None
            chat_data.append((date_time, sender, is_person))
    return chat_data

def create_template_dataframe():
    times = [datetime.datetime(2000, 1, 1, 0, 0) + datetime.timedelta(minutes=1 * i) for i in range(1440)]
    intervals = [time.strftime('%I:%M %p') for time in times]
    df = pd.DataFrame(index=intervals)
    df['Number of Active Chats'] = 0  # Initialize 'Number of Active Chats' column
    return df

def populate_dataframe(df, parsed_data, start_column_index, last_msg_info_dict, delay_threshold=15):
    new_columns = {}  # Dictionary to hold new data before concatenation

    for entry in parsed_data:
        date_time, sender, is_person = entry
        interval_index = min((date_time.hour * 60 + date_time.minute) // 1, 1439)
        interval = df.index[interval_index]

        # Debug line: Print each processed entry
        print(f"Processing entry: {entry}, Interval: {interval}")

        # Initialize columns in new_columns dictionary if not exist
        for i in range(3):  # We need 3 columns for Personal, System, and Delay
            if (start_column_index + i not in new_columns):
                new_columns[start_column_index + i] = pd.Series(0, index=df.index, dtype=int)

        # Populate the new_columns dictionary
        if is_person:
            new_columns[start_column_index].at[interval] = 1  # Personal message column

            # Delay calculation
            was_delayed = 0
            if sender in last_msg_info_dict:
                last_msg_time = last_msg_info_dict[sender]
                time_diff = (date_time - last_msg_time).total_seconds() / 60
                if time_diff > delay_threshold:
                    was_delayed = 1
                    # Debug line: Print when a delay is detected
                    print(f"Delay detected for {sender} at {interval}")

            last_msg_info_dict[sender] = date_time
            new_columns[start_column_index + 2].at[interval] = was_delayed
        else:
            new_columns[start_column_index + 1].at[interval] = 1  # System message column

    # Concatenate new columns to the DataFrame at once
    df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)

    return df, start_column_index + 3  # Return the updated dataframe and the next start index


def process_person_chats(chat_files):
    dataframes = {}
    last_msg_info_dict = {}  # Dictionary to keep track of the last message time for each sender

    for file in chat_files:
        parts = file.split(os.sep)
        date_folder, person = parts[-4], parts[-2]

        try:
            expected_date = pd.to_datetime(date_folder).date()
        except ValueError:
            print(f"Skipping file due to incorrect date format in folder name: {file}")
            continue

        key = f"{expected_date.strftime('%Y-%m-%d')}_{person}"

        if key not in dataframes:
            dataframes[key] = create_template_dataframe()
            start_column_index = 0
        else:
            # Filter to get only integer columns and find the maximum
            int_columns = [col for col in dataframes[key].columns if isinstance(col, int)]
            start_column_index = max(int_columns, default=-1) + 1

        parsed_data = parse_chat_file(file)
        # Populate the dataframe and update the start column index for the next group
        dataframes[key], start_column_index = populate_dataframe(dataframes[key], parsed_data, start_column_index, last_msg_info_dict)

    return dataframes

# Main script execution
date_directory = "C:\\Users\\mauriceyeng\\Python\\Daily-Reports\\Chat Folder from Drive\\drive-download-20231201T052455Z-001"
chat_files = list_chat_files(date_directory)
person_dataframes = process_person_chats(chat_files)


In [3]:
for key, df in person_dataframes.items():
    print(f"Head of dataframe for {key}:")
    print(df.head(5))
    print("\n")  # This adds an extra line for better readability between dataframes


Head of dataframe for 2023-12-01_Aditi_Edoofa:
          Number of Active Chats  0  1  2  3  4  5  6  7  8  ...  212  213  \
12:00 AM                       0  0  0  0  0  0  0  0  0  0  ...    0    0   
12:01 AM                       0  0  0  0  0  0  0  0  0  0  ...    0    0   
12:02 AM                       0  0  0  0  0  0  0  0  0  0  ...    0    0   
12:03 AM                       0  0  0  0  0  0  0  0  0  0  ...    0    0   
12:04 AM                       0  0  0  0  0  0  0  0  0  0  ...    0    0   

          214  215  216  217  218  219  220  221  
12:00 AM    0    0    0    0    0    0    0    0  
12:01 AM    0    0    0    0    0    0    0    0  
12:02 AM    0    0    0    0    0    0    0    0  
12:03 AM    0    0    0    0    0    0    0    0  
12:04 AM    0    0    0    0    0    0    0    0  

[5 rows x 223 columns]


Head of dataframe for 2023-12-01_Ananya_Edoofa:
          Number of Active Chats  0  1  2  3  4  5  6  7  8  ...  194  195  \
12:00 AM                  