In [1]:
import json
import os
import re
from typing import Dict, Iterable, List, Optional

import pandas as pd

In [2]:
# Words and phrases that flag an utterance as profane / hostile.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (e.g. "crooks" "thief" -> "crooksthief"),
# which drops both intended entries from the list.
# NOTE(review): several entries (e.g. "owe", "language", "serious",
# "negotiate") are ordinary words and will flag non-profane utterances;
# confirm they are intended before relying on the results.
PROFANE_WORDS = [
    "damn", "dammit", "goddamn",
    "assholes",
    "hell",
    "crap", "bullcrap",
    "shit", "bullshit", "holy shit", "shitty",
    "ass", "asshole", "dumbass", "jackass",
    "fuck", "fucking", "fucked", "motherfucker", "fuckface", "fuckoff",
    "bitch", "bitches", "son of a bitch",
    "bastard",
    "jerk",
    "idiot",
    "moron",
    "loser",
    "screw you",
    "piss", "pissed", "piss off",
    "freak", "freaking",
    "frustration",
    "douche", "douchebag",
    "prick",
    "scumbag",
    "retard",
    "slut",
    "trash",
    "dirtbag",
    "shithead",
    "fool",
    "lazy ass",
    "deadbeat", "deadbeats",
    "cheapskate",
    "liar",
    "crook", "crooks",
    "thief",
    "steal",
    "suck",
    "jerkwad",
    "lost",
    "ignore",
    "unprofessionally",
    "charity",
    "don't care",
    "joke",
    "crying",
    "deeper",
    "dime",
    "serious",
    "waste",
    "shove",
    "stupid",
    "leech",
    "annoying",
    "owe",
    "kidding",
    "mommy",
    "insult",
    "digging",
    "chase",
    "language",
    "dodging",
    "negotiate",
    "vanish",
    "pathetic",
    "harassing",
    "sob",
    "excuses",
    "ruthless",
    "buzz",
    "consequences",
    "racking",
    "ridiculous",
    "rocket science",
    "goon",
]

In [3]:
# Function to detect profanity in a text
# Compiled patterns keyed by the word tuple they were built from, so the
# alternation is assembled once per distinct word list instead of on every call.
# A value of None means "no usable words", i.e. nothing can ever match.
_PROFANITY_PATTERN_CACHE: Dict[tuple, "re.Pattern"] = {}


def _compile_profanity_pattern(words: Iterable[str]):
    """Return a cached whole-word, case-insensitive pattern for ``words`` (None if empty)."""
    key = tuple(words)
    if key not in _PROFANITY_PATTERN_CACHE:
        # Drop empty strings and duplicates while keeping the original order.
        unique_words = list(dict.fromkeys(word for word in key if word))
        if unique_words:
            # re.escape keeps entries literal even if they contain regex
            # metacharacters such as "." or "+".
            alternation = "|".join(re.escape(word) for word in unique_words)
            _PROFANITY_PATTERN_CACHE[key] = re.compile(
                r"\b(?:" + alternation + r")\b", re.IGNORECASE
            )
        else:
            # An empty alternation would compile to r"\b()\b", which matches at
            # any word boundary and would flag virtually every text.
            _PROFANITY_PATTERN_CACHE[key] = None
    return _PROFANITY_PATTERN_CACHE[key]


def detect_profanity(text: str, words: Optional[Iterable[str]] = None) -> bool:
    """
    Detects profanity in a given text using a whole-word, case-insensitive regex.
    Args:
        text (str): The text to analyze. Non-string or empty values are
            treated as containing no profanity.
        words (Optional[Iterable[str]]): Words/phrases to look for. Defaults
            to the module-level PROFANE_WORDS list when omitted.
    Returns:
        bool: True if any listed word or phrase occurs in the text, False otherwise.
    """
    if not isinstance(text, str) or not text:
        return False
    pattern = _compile_profanity_pattern(PROFANE_WORDS if words is None else words)
    return pattern is not None and pattern.search(text) is not None

In [4]:
# Function to analyze a conversation for profanity
def analyze_conversation(conversation: List[Dict]) -> Dict[str, List[str]]:
    """
    Analyzes a conversation for profanity by both agents and borrowers.

    Each utterance is expected to be a dict with "speaker" and "text" keys.
    Utterances spoken by "Agent" are collected under "agent_profanity" and
    those spoken by "Customer" under "borrower_profanity"; the speaker label
    is compared case-insensitively with surrounding whitespace ignored.
    Malformed entries (non-dict items, missing or non-string text) are skipped
    instead of raising.

    Args:
        conversation (List[Dict]): The parsed conversation. None or an empty
            list yields empty results.
    Returns:
        Dict[str, List[str]]: A dictionary containing lists of profane utterances by agents and borrowers.
    """
    agent_profanity: List[str] = []
    borrower_profanity: List[str] = []

    # `or []` lets an empty payload (e.g. a file that parsed to None) fall through.
    for utterance in conversation or []:
        if not isinstance(utterance, dict):
            continue

        speaker = utterance.get("speaker")
        text = utterance.get("text")
        # Nothing to scan without a non-empty string.
        if not isinstance(text, str) or not text:
            continue
        if not detect_profanity(text):
            continue

        print(f"Profanity detected in text: {text} (Speaker: {speaker})")

        # Normalize the label so "agent" / "Customer " are not silently dropped.
        role = speaker.strip().lower() if isinstance(speaker, str) else ""
        if role == "agent":
            agent_profanity.append(text)
        elif role == "customer":
            borrower_profanity.append(text)

    return {
        "agent_profanity": agent_profanity,
        "borrower_profanity": borrower_profanity
    }

In [5]:
# Function to process all YAML files in a directory
def _read_yaml_conversation(file_path: str):
    """Parse one YAML file and return its content; raise ValueError if it is not valid YAML."""
    # PyYAML is imported lazily because the notebook's import cell does not
    # load it; the original code referenced `yaml` without any import.
    import yaml

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return yaml.safe_load(file)
    except yaml.YAMLError as e:
        raise ValueError(str(e)) from e


def process_conversations(data_dir: str) -> pd.DataFrame:
    """
    Processes all YAML files in the specified directory and analyzes them for profanity.

    Files ending in ".yaml" or ".yml" are processed in sorted filename order;
    the filename without its extension is used as the call ID. Files that
    cannot be parsed are reported and skipped.

    NOTE: a later cell in this notebook defines a function with the same name
    for JSON files; once that cell runs it replaces this definition.

    Args:
        data_dir (str): Path to the directory containing YAML files.
    Returns:
        pd.DataFrame: A DataFrame containing call IDs and profanity analysis results
            (columns: call_id, agent_profanity, borrower_profanity), also when
            no file was processed.
    """
    columns = ["call_id", "agent_profanity", "borrower_profanity"]
    results = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith((".yaml", ".yml")):
            continue

        file_path = os.path.join(data_dir, filename)
        call_id = os.path.splitext(filename)[0]  # Extract call ID from filename

        # Load and parse the YAML file (ValueError also covers undecodable bytes)
        try:
            conversation = _read_yaml_conversation(file_path)
        except ValueError as e:
            print(f"Error parsing {filename}: {e}")
            continue

        # Analyze the conversation
        analysis_result = analyze_conversation(conversation)

        results.append({
            "call_id": call_id,
            "agent_profanity": analysis_result["agent_profanity"],
            "borrower_profanity": analysis_result["borrower_profanity"]
        })

    # Explicit columns keep the schema stable even when `results` is empty.
    return pd.DataFrame(results, columns=columns)


In [6]:
# Function to process all JSON files in a directory
def process_conversations(data_dir: str) -> pd.DataFrame:
    """
    Processes all JSON files in the specified directory and analyzes them for profanity.

    Files ending in ".json" are processed in sorted filename order; the
    filename without its extension is used as the call ID. Files that cannot
    be decoded or parsed are reported and skipped.

    Args:
        data_dir (str): Path to the directory containing JSON files.
    Returns:
        pd.DataFrame: A DataFrame containing call IDs and profanity analysis results
            (columns: call_id, agent_profanity, borrower_profanity), also when
            no file was processed.
    """
    columns = ["call_id", "agent_profanity", "borrower_profanity"]
    results = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".json"):  # Look for JSON files
            continue

        file_path = os.path.join(data_dir, filename)
        call_id = os.path.splitext(filename)[0]  # Extract call ID from filename

        # Load and parse the JSON file. The encoding is explicit so the result
        # does not depend on the platform's locale default.
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                conversation = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error parsing {filename}: {e}")
            continue
        print(f"Processing call ID: {call_id}")

        # Analyze the conversation
        analysis_result = analyze_conversation(conversation)

        results.append({
            "call_id": call_id,
            "agent_profanity": analysis_result["agent_profanity"],
            "borrower_profanity": analysis_result["borrower_profanity"]
        })

    # Explicit columns keep the schema (and the CSV header) stable even when
    # `results` is empty.
    return pd.DataFrame(results, columns=columns)

In [7]:
if __name__ == "__main__":
    # Directory containing the conversation JSON files. The original local
    # path is kept as the default so existing runs behave the same; set the
    # CONVERSATION_DATA_DIR environment variable to run on another machine.
    data_dir = os.environ.get(
        "CONVERSATION_DATA_DIR",
        "/Users/mohammadadnaan/Downloads/Prodigal_Tech/intelligent-debt-recovery-genai/data",
    )

    # Fail early with a clear message instead of a bare listdir traceback.
    if not os.path.isdir(data_dir):
        raise FileNotFoundError(f"Data directory not found: {data_dir}")

    # Verify directory and files
    print(f"Files in directory: {os.listdir(data_dir)}")

    # Process all conversations
    results_df = process_conversations(data_dir)

    # Save results to a CSV file (written to the current working directory)
    output_file = "profanity_analysis_results.csv"
    results_df.to_csv(output_file, index=False)
    print(f"Profanity analysis results saved to {output_file}")

Files in directory: ['2db2965e-54fa-41fa-823b-ed79b943f0b1.json', '8a9655a7-be88-4921-b0ad-04aa1b9953d1.json', '19169ec6-213f-48e9-8ec2-c24ee2e6eb20.json', 'c86714f0-3aeb-4628-8cfa-ee0a46839508.json', '0c8297aa-ced1-414e-a175-bf29a9763d30.json', '89ab3694-2c42-46bc-a602-c545511bf36e.json', '94ecf497-5959-44ec-a011-92af3709c38c.json', '87e28dae-2c70-4122-9452-6ec82164fab2.json', '80e5fe02-ec03-4ca5-902a-fdf38e6b7b8a.json', '3e6dde01-1a46-42b4-92dd-0a211185e660.json', '22dd8271-02f8-46bf-bd4e-28c0692b57d1.json', 'acec9db1-1209-4f98-881e-6d413e1aba2d.json', '6d0dd32e-cc1e-4624-88f9-2bb70db56f1c.json', 'f2c33efc-17f4-4e42-ab36-8394328d9591.json', 'b3cc69d6-7e4d-46bc-908f-29f15cebac61.json', 'c1cf9a45-0a69-4543-9994-117bdd4fdaaf.json', '89d28c5a-f0da-41e9-a93e-0a749c20189c.json', 'd7bbea61-d739-43fb-a198-ced1b59f9491.json', '23712d70-65df-47f3-8b69-fe2682aebc70.json', '367fd789-cfa1-458b-b005-4a846f6769d9.json', '911b0b97-93ec-49b4-a0ac-7a76dd6a40ce.json', '66783778-028a-45ff-b1b0-f5c0ea759