In [139]:
import pandas as pd
# Load the ticket cache CSV into `df`, the working DataFrame that the later
# cells in this notebook read from and add cleaned columns to.
# NOTE(review): presumably a Jira export, judging by the file name — confirm.
df = pd.read_csv('jira_tickets_cache.csv')
# df.head(1)

In [140]:
import pandas as pd
import re # We'll need this for regex operations later
# Names of the raw text columns in the ticket DataFrame.
summary_col, description_col = 'summary', 'description'
comments_col = 'comments'  # used by later cells

# Seed the working columns from the raw ones so the originals stay untouched.
# Missing values become '' and everything is coerced to str before cleaning.
for raw_name, cleaned_name in (
    (summary_col, 'cleaned_summary'),
    (description_col, 'cleaned_description'),
):
    df[cleaned_name] = df[raw_name].fillna('').astype(str)


In [141]:
import re
import json
import pandas as pd

# --- MODIFIED Step 1 & 2 for 'comments' (Filter by Problem Keywords, Text Only, Sorted) ---

comments_col = 'comments'  # Ensure this matches your DataFrame column name

# Words/phrases whose presence marks a comment as describing a problem.
PROBLEM_KEYWORDS = [
    "error", "fail", "not working", "unable to", "crash", "timeout",
    "unexpected", "broken", "incorrect", "missing", "issue", "bug",
    "problem", "exception", "doesn't work", "can't", "cannot"  # Added a few more common ones
]
# Alternation of the (escaped) keywords wrapped in \b...\b so only whole
# words/phrases match: "error" matches, "terrorist" does not.
# Case-insensitivity is applied at search time via re.IGNORECASE.
problem_keyword_pattern = r'\b(?:' + '|'.join(re.escape(k) for k in PROBLEM_KEYWORDS) + r')\b'


def parse_filter_and_format_comments(comments_json_str):
    """
    Turn a ticket's JSON comment list into newline-joined "problem" comments.

    The JSON is expected to decode to a list of objects. Objects are ordered
    by their 'timestamp' value, and only those whose 'cleaned_body' is a
    non-empty string matching ``problem_keyword_pattern`` (case-insensitive)
    are kept. Only the 'cleaned_body' text is emitted.

    Args:
        comments_json_str: JSON text (str/bytes). NaN, None, blank strings and
            any other non-text value are treated as "no comments".

    Returns:
        str: The kept comment bodies joined with "\\n", or "" when the input
        is missing, is not valid JSON, is not a JSON list, or nothing matches.
        Malformed list entries (non-objects, non-string bodies) are skipped
        individually instead of discarding every comment of the ticket.
    """
    # NaN/None/numbers/already-parsed objects carry no parsable comment text.
    if not isinstance(comments_json_str, (str, bytes, bytearray)):
        return ""
    if not comments_json_str.strip():
        return ""

    try:
        comments_list = json.loads(comments_json_str)
    except (ValueError, RecursionError):
        # json.JSONDecodeError subclasses ValueError; bad payloads mean "no comments".
        return ""

    if not isinstance(comments_list, list):
        return ""

    # Drop entries that are not JSON objects; they cannot carry a body.
    comment_objs = [entry for entry in comments_list if isinstance(entry, dict)]

    try:
        # sorted() builds a new list, so a failed comparison (e.g. a null
        # timestamp next to a string one) leaves the original order intact.
        # An in-place list.sort() may leave the list partially reordered.
        comment_objs = sorted(comment_objs, key=lambda entry: entry.get('timestamp', ''))
    except TypeError:
        pass  # Timestamps are not mutually comparable: keep the original order

    filtered_comment_texts = []
    for comment_obj in comment_objs:
        comment_text = comment_obj.get('cleaned_body', '')
        if not isinstance(comment_text, str) or not comment_text:
            continue
        if re.search(problem_keyword_pattern, comment_text, flags=re.IGNORECASE):
            filtered_comment_texts.append(comment_text)

    return "\n".join(filtered_comment_texts)

# --- Define ALL your other cleaning functions here (or ensure they are in executed cells above) ---
# strip_jira_markup (ensure it removes "Original Author:", "Posted on:")
# normalize_whitespace
# standardize_case
# remove_user_mentions
# remove_urls
# manage_punctuation
# process_code_and_stack_traces
# remove_id_data_blobs (your Step 9.5)
# remove_or_replace_numbers (your Step 9.7)
# ...and any other functions like remove_domain_specific_data if you developed it further.


# --- RE-RUN THE ENTIRE CLEANING PIPELINE for 'cleaned_comments' ---
# Starting from the NEW parsing and filtering.

print("Re-processing 'cleaned_comments' with KEYWORD FILTERING and all subsequent cleaning...")

# Step 1: parse each ticket's raw comment JSON and keep only the problem comments.
raw_comment_series = df[comments_col]
df['cleaned_comments'] = raw_comment_series.apply(parse_filter_and_format_comments)
print("Step 1 (parse_filter_and_format_comments) for comments: Done")

# Preview the raw column next to the freshly built one.
preview_columns = [comments_col, 'cleaned_comments']
print("\nOriginal and cleaned 'comments' (with timestamp sorting):")
print(df[preview_columns].head(2))

# Count nulls left in the new column.
missing_cleaned_comments = df['cleaned_comments'].isnull().sum()
print(f"\nMissing values in 'cleaned_comments': {missing_cleaned_comments}")



Re-processing 'cleaned_comments' with KEYWORD FILTERING and all subsequent cleaning...
Step 1 (parse_filter_and_format_comments) for comments: Done

Original and cleaned 'comments' (with timestamp sorting):
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [142]:
import re

# (Re-define the function if you're in a new cell, or modify the existing cell)
def strip_jira_markup(text):
    """
    Remove Jira wiki markup from ``text`` while keeping the human-readable content.

    Handles block macros ({panel}, {code}, {color}, {noformat}, {quote}),
    "Posted on:" lines, headings (h1.-h6.), inline formatting (*bold*,
    _italic_, +inserted+, -deleted-, ??citation??, {{monospace}}), block
    quotes (bq.), links, embedded images and list markers.

    Inline markers are only stripped when they wrap a token at word
    boundaries, so ordinary text such as ``tier-downgrade-job``,
    ``customer_id`` or ``1+2+3`` is left intact. Jira user mentions
    (``[~user]``) are deliberately left in place so a later mention-removal
    step can still recognise them.

    Args:
        text: The string to clean.

    Returns:
        str: The text with markup removed, or "" if ``text`` is not a str.
        Whitespace is not normalised here.
    """
    if not isinstance(text, str):
        return ""

    # {panel:...}content{panel} -> content
    text = re.sub(r'\{panel:[^}]*}(.*?)\{panel}', r'\1', text, flags=re.DOTALL)

    # {code[:lang]}content{code} -> content
    text = re.sub(r'\{code:[^}]*}(.*?)\{code}', r'\1', text, flags=re.DOTALL)
    text = re.sub(r'\{code}(.*?)\{code}', r'\1', text, flags=re.DOTALL)

    # {color:...}text{color} -> text
    text = re.sub(r'\{color:[^}]*}(.*?)\{color}', r'\1', text, flags=re.DOTALL)

    # Drop whole "Posted on: ..." lines (potentially left over from panel
    # content), whether or not the line ends with a newline.
    text = re.sub(r'^\s*Posted on:.*(?:\n|$)', '', text, flags=re.MULTILINE)

    # Optional: Remove "Original Author: ..." lines if desired
    # text = re.sub(r'^\s*Original Author:.*(?:\n|$)', '', text, flags=re.MULTILINE)

    # h1. to h6. headings are only markup at the start of a line; anchoring
    # avoids eating "h1." inside words such as "sha1.".
    text = re.sub(r'^[ \t]*h[1-6]\.[ \t]*', '', text, flags=re.MULTILINE)

    # *bold*, _italic_, +underline+, -strikethrough-.
    # The opening marker must not follow a word character and must be
    # followed by non-space; the closing marker must follow non-space and
    # must not precede a word character. This keeps hyphenated words,
    # snake_case identifiers, arithmetic and list bullets untouched.
    for marker in ('*', '_', '+', '-'):
        m = re.escape(marker)
        inline_pattern = (
            r'(?<![\w' + m + r'])' + m + r'(?=\S)'
            r'([^\n' + m + r']+?)'
            r'(?<=\S)' + m + r'(?![\w' + m + r'])'
        )
        text = re.sub(inline_pattern, r'\1', text)

    # ??citation??
    text = re.sub(r'\?\?(.*?)\?\?', r'\1', text)
    # {{monospaced}}
    text = re.sub(r'\{\{(.*?)\}\}', r'\1', text)
    # bq. blockquote (line-start only)
    text = re.sub(r'^[ \t]*bq\.[ \t]+', '', text, flags=re.MULTILINE)

    # Links: [label|target] -> label, [target] -> target.
    # [~user] mentions are skipped so the mention remover can handle them.
    text = re.sub(r'\[([^|\]]+)\|[^\]]+\]', r'\1', text)
    text = re.sub(r'\[(?!~)([^\]]+)\]', r'\1', text)

    # Images: !file.png! or !file.png|attrs!. The source may not contain
    # whitespace and the opening "!" may not follow a word character, so
    # sentences like "Failed! Please retry!" are not deleted.
    text = re.sub(r'(?<!\w)![^\s!|]+(?:\|[^!\n]*)?!', '', text)

    # List markers at the start of a line, including nested ones (**, ##, #*).
    text = re.sub(r'^\s*[*#-]+\s+', '', text, flags=re.MULTILINE)

    # noformat, quote
    text = re.sub(r'\{noformat\}(.*?)\{noformat\}', r'\1', text, flags=re.DOTALL)
    text = re.sub(r'\{quote\}(.*?)\{quote\}', r'\1', text, flags=re.DOTALL)

    return text

# --- You would then re-apply this updated function ---
# --- and the subsequent whitespace and case normalization steps ---

# print("Re-applying cleaning starting from Step 3 (Markup Removal)...")

# Step 3: Apply updated strip_jira_markup
# df['cleaned_summary'] = df[summary_col].fillna('').astype(str).apply(strip_jira_markup) # Re-start from original filled NAs if making big changes
# df['cleaned_description'] = df[description_col].fillna('').astype(str).apply(strip_jira_markup)
# df['cleaned_comments'] = df[comments_col].apply(parse_filter_and_format_comments).apply(strip_jira_markup) # Re-parse comments then strip

# For simplicity if previous steps (1 & 2) are solid, you can just apply to the already partially cleaned columns:
# Step 3: strip Jira markup from each partially cleaned text column in place.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(strip_jira_markup)


In [143]:
import re

# --- Step 4: Normalize Whitespace ---

def normalize_whitespace(text):
    """
    Collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        str: The normalised text, or "" when ``text`` is not a str.
    """
    if isinstance(text, str):
        # \s+ covers spaces, tabs and newlines alike.
        single_spaced = re.sub(r'\s+', ' ', text)
        return single_spaced.strip()
    return ""

# Apply to the relevant text columns
# Step 4: normalise whitespace in every cleaned text column.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(normalize_whitespace)

print("\nText after whitespace normalization (first 2 rows):")
for heading, preview_col in (
    ("Cleaned Summary:", 'cleaned_summary'),
    ("\nCleaned Description:", 'cleaned_description'),
):
    print(heading)
    print(df[preview_col].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Smallest index label among rows whose raw comments value is non-empty;
# NaN when no such row exists. Later cells reuse `first_comment_idx`.
has_raw_comments = df[comments_col].fillna('').str.len() > 0
first_comment_idx = df[has_raw_comments].index.min()
if pd.isna(first_comment_idx):
    print("No comments found to display.")
else:
    print(f"(Ticket index {first_comment_idx})")
    # Only the current (post-normalisation) state is shown here.
    print(df.loc[first_comment_idx, 'cleaned_comments'])



Text after whitespace normalization (first 2 rows):
Cleaned Summary:
0                                                                           EMFDEV Add Coupon benefit
1    Tata Digital Prod (1000006): Tier Downgrade Job saves wrong Note for tier renew/downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [144]:
# --- Step 5: Standardize Case ---

def standardize_case(text):
    """
    Lower-case ``text``.

    Args:
        text: The string to convert.

    Returns:
        str: The lower-cased text, or "" when ``text`` is not a str.
    """
    return text.lower() if isinstance(text, str) else ""

# Apply to the relevant text columns
# Step 5: lower-case every cleaned text column.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(standardize_case)

print("\nText after case standardization (first 2 rows):")
for heading, preview_col in (
    ("Cleaned Summary:", 'cleaned_summary'),
    ("\nCleaned Description:", 'cleaned_description'),
):
    print(heading)
    print(df[preview_col].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Relies on `comments_col` from an earlier cell. Recomputes the smallest index
# label among rows whose raw comments value is non-empty.
has_raw_comments = df[comments_col].fillna('').str.len() > 0
first_comment_idx = df[has_raw_comments].index.min()
if pd.isna(first_comment_idx):
    print("No comments found to display.")
else:
    print(f"(Ticket index {first_comment_idx})")
    print(df.loc[first_comment_idx, 'cleaned_comments'])



Text after case standardization (first 2 rows):
Cleaned Summary:
0                                                                           emfdev add coupon benefit
1    tata digital prod (1000006): tier downgrade job saves wrong note for tier renew/downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [145]:
# df[['cleaned_summary', 'cleaned_description', 'cleaned_comments']].head(10)

In [146]:
import re

# --- Step 6: Handle User Mentions and Tags ---

def remove_user_mentions(text):
    """
    Remove user mentions from ``text``.

    Two forms are removed:
      * Jira mentions such as ``[~username]`` or ``[~accountid:xxxx...]``
      * ``@username`` mentions (``@`` followed by word characters)

    An ``@`` that directly follows a word character is treated as part of an
    e-mail address and left alone; otherwise ``john@example.com`` would be
    corrupted into ``john.com``.

    Args:
        text: The string to clean.

    Returns:
        str: The text without mentions, or "" when ``text`` is not a str.
        Surrounding whitespace is left as-is.
    """
    if not isinstance(text, str):
        return ""

    # Jira specific mentions like [~username] or [~accountid:xxxx...]
    text = re.sub(r'\[~[^\]]+\]', '', text)

    # Stand-alone @username mentions; (?<!\w) skips the "@" inside e-mail addresses.
    text = re.sub(r'(?<!\w)@\w+', '', text)

    return text

# Apply to the relevant text columns
# Step 6: strip user mentions from every cleaned text column.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(remove_user_mentions)

print("\nText after removing user mentions (first 2 rows):")
for heading, preview_col in (
    ("Cleaned Summary:", 'cleaned_summary'),
    ("\nCleaned Description:", 'cleaned_description'),
):
    print(heading)
    print(df[preview_col].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Reuse `first_comment_idx` from an earlier cell when it is still valid;
# otherwise look the index up again.
earlier_idx_usable = (
    'first_comment_idx' in locals()
    and pd.notna(first_comment_idx)
    and first_comment_idx in df.index
)
if earlier_idx_usable:
    print(f"(Ticket index {first_comment_idx})")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()
    if pd.isna(temp_idx):
        print("No comments found to display.")
    else:
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])


Text after removing user mentions (first 2 rows):
Cleaned Summary:
0                                                                           emfdev add coupon benefit
1    tata digital prod (1000006): tier downgrade job saves wrong note for tier renew/downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [147]:
import re

def remove_urls_robust(text):
    """
    Delete URL-like substrings from ``text`` (case-insensitive match).

    A match starts with ``http(s)://`` or ``ftp://``, with ``www.`` (optionally
    numbered, e.g. ``www2.``), or with a ``domain.tld/`` prefix. It may contain
    balanced parentheses and never ends on trailing punctuation.

    NOTE(review): the pattern nests quantifiers; worth checking its run time
    on long pathological inputs.

    Args:
        text: The string to clean.

    Returns:
        str: The text with matches removed, or "" when ``text`` is not a str.
    """
    if isinstance(text, str):
        url_regex = r"""\b(?:(?:https?|ftp)://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])"""
        return re.sub(url_regex, '', text, flags=re.IGNORECASE)
    return ""


# Apply to the relevant text columns
# Step 7: remove URLs from every cleaned text column.
# This calls `remove_urls_robust`, the function defined in this cell. The
# previous code called `remove_urls`, which no cell shown in this notebook
# defines, so it failed with NameError on a fresh kernel (or silently ran a
# stale function left in the kernel).
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(remove_urls_robust)

print("\nText after removing URLs (first 2 rows):")
print("Cleaned Summary:")
print(df['cleaned_summary'].head(2).to_string())
print("\nCleaned Description:")
print(df['cleaned_description'].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Reuse `first_comment_idx` from an earlier cell when it is still valid;
# otherwise look the index up again.
if 'first_comment_idx' in locals() and pd.notna(first_comment_idx) and first_comment_idx in df.index:
    print(f"(Ticket index {first_comment_idx})")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()
    if pd.notna(temp_idx):
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])
    else:
        print("No comments found to display.")



Text after removing URLs (first 2 rows):
Cleaned Summary:
0                                                                           emfdev add coupon benefit
1    tata digital prod (1000006): tier downgrade job saves wrong note for tier renew/downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [148]:
import re
import string # To help identify punctuation, though we'll define our keep list

# --- Step 8: Manage Special Characters and Punctuation ---

def manage_punctuation(text, keep_punctuation=".-_"):
    """
    Replace unwanted punctuation and special characters with spaces.

    A character is kept when it is alphanumeric, is whitespace, or appears in
    ``keep_punctuation``. Every other character becomes a single space. The
    result is then whitespace-normalised, because those replacements can
    create runs of spaces.

    Args:
        text: The string to clean.
        keep_punctuation: Characters to preserve in addition to
            alphanumerics and whitespace.

    Returns:
        str: The cleaned text, or "" when ``text`` is not a str.
    """
    if not isinstance(text, str):
        return ""

    def _keep(ch):
        return ch.isalnum() or ch in keep_punctuation or ch.isspace()

    replaced = "".join(ch if _keep(ch) else ' ' for ch in text)
    return re.sub(r'\s+', ' ', replaced).strip()

# Apply to the relevant text columns
# You can change the keep_punctuation argument if needed, e.g., df['cleaned_summary'].apply(manage_punctuation, keep_punctuation=".-_!?")
# Step 8: replace unwanted punctuation in every cleaned text column.
# Uses the default keep list; pass keep_punctuation=... to .apply() to change it.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(manage_punctuation)

print("\nText after managing punctuation (first 2 rows):")
for heading, preview_col in (
    ("Cleaned Summary:", 'cleaned_summary'),
    ("\nCleaned Description:", 'cleaned_description'),
):
    print(heading)
    print(df[preview_col].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Reuse `first_comment_idx` from an earlier cell when it is still valid;
# otherwise look the index up again.
earlier_idx_usable = (
    'first_comment_idx' in locals()
    and pd.notna(first_comment_idx)
    and first_comment_idx in df.index
)
if earlier_idx_usable:
    print(f"(Ticket index {first_comment_idx})")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()
    if pd.isna(temp_idx):
        print("No comments found to display.")
    else:
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])


Text after managing punctuation (first 2 rows):
Cleaned Summary:
0                                                                        emfdev add coupon benefit
1    tata digital prod 1000006 tier downgrade job saves wrong note for tier renew downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [149]:
import re

# --- Step 9: Isolate or Neutralize Code Blocks and Stack Traces ---

def process_code_and_stack_traces(text):
    """
    Replace stack traces and simple SQL prompts with placeholder tokens.

    * Java/C#-style traces (a dotted ``...Exception`` header followed by one
      or more ``at ...`` lines) and Python tracebacks (``Traceback (most
      recent call last):`` up to a ``...Error:`` line) become ``<STACK_TRACE>``.
    * ``mysql> ... ;`` snippets become ``<CODE_SNIPPET>``.

    Both heuristics are deliberately simple. Content that came from Jira
    {code} blocks is only replaced when it happens to match one of them.
    Whitespace is collapsed and trimmed before returning.

    NOTE(review): the trace patterns need literal newlines and the original
    letter case ("Exception", "Traceback"), so they can only fire on text
    that has not yet been whitespace-normalised or lower-cased.

    Args:
        text: The string to process.

    Returns:
        str: The processed text, or "" when ``text`` is not a str.
    """
    if not isinstance(text, str):
        return ""

    trace_patterns = (
        # Java/C#: "pkg.SomeException[: message]" then one or more "at ..." lines.
        r'((?:[a-zA-Z0-9_]+\.)+[a-zA-Z0-9_]+Exception(?:[:\s].*)?\n(?:^\s*at .*(?:\n|$))+)',
        # Python: traceback header, any frame/source lines, then "<Name>Error: ...".
        r'(Traceback \(most recent call last\):\n(?:(?:^\s*File ".*?", line \d+, in .*\n)|(?:^\s*.*\n))*?\w*Error:.*)',
    )
    for trace_pattern in trace_patterns:
        text = re.sub(trace_pattern, ' <STACK_TRACE> ', text, flags=re.MULTILINE)

    # "mysql>" up to the first semicolon (may span lines because of DOTALL).
    text = re.sub(r'mysql>.*?;', ' <CODE_SNIPPET> ', text, flags=re.IGNORECASE | re.DOTALL)

    # The placeholders are padded with spaces, so tidy the whitespace again.
    return re.sub(r'\s+', ' ', text).strip()

# Apply to the relevant text columns
# Step 9: neutralise stack traces / code prompts in every cleaned text column.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(process_code_and_stack_traces)

print("\nText after processing code/stack traces (first 2 rows):")
for heading, preview_col in (
    ("Cleaned Summary:", 'cleaned_summary'),
    ("\nCleaned Description:", 'cleaned_description'),
):
    print(heading)
    print(df[preview_col].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Reuse `first_comment_idx` from an earlier cell when it is still valid;
# otherwise look the index up again.
earlier_idx_usable = (
    'first_comment_idx' in locals()
    and pd.notna(first_comment_idx)
    and first_comment_idx in df.index
)
if earlier_idx_usable:
    print(f"(Ticket index {first_comment_idx})")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()
    if pd.isna(temp_idx):
        print("No comments found to display.")
    else:
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])


Text after processing code/stack traces (first 2 rows):
Cleaned Summary:
0                                                                        emfdev add coupon benefit
1    tata digital prod 1000006 tier downgrade job saves wrong note for tier renew downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [150]:
import re # Just in case it's a new cell and re wasn't imported recently

# --- Final Whitespace Normalization Pass ---

# Ensure the normalize_whitespace function is defined
# (If you're running this in a new cell, you might need to redefine it or ensure the cell where it's defined has been run)
# For completeness, here it is again:
def normalize_whitespace(text):
    """
    Collapse every whitespace run to one space and trim both ends.

    This repeats the definition from the earlier whitespace-normalisation
    cell, so that this cell can run on its own.

    Args:
        text: The string to normalise.

    Returns:
        str: The normalised text, or "" when ``text`` is not a str.
    """
    if isinstance(text, str):
        # Any sequence of spaces, tabs or newlines becomes a single space.
        return re.sub(r'\s+', ' ', text).strip()
    return ""

print("Applying final whitespace normalization pass...")

# Final pass: earlier replacements may have left stray or doubled spaces.
for text_col in ('cleaned_summary', 'cleaned_description', 'cleaned_comments'):
    df[text_col] = df[text_col].apply(normalize_whitespace)

print("\nText after FINAL whitespace normalization (first 2 rows):")
for heading, preview_col in (
    ("Cleaned Summary:", 'cleaned_summary'),
    ("\nCleaned Description:", 'cleaned_description'),
):
    print(heading)
    print(df[preview_col].head(2).to_string())
print("\nCleaned Comments (example from first ticket with comments):")

# Reuse `first_comment_idx` from an earlier cell when it is still valid;
# otherwise look the index up again.
earlier_idx_usable = (
    'first_comment_idx' in locals()
    and pd.notna(first_comment_idx)
    and first_comment_idx in df.index
)
if earlier_idx_usable:
    print(f"(Ticket index {first_comment_idx})")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()
    if pd.isna(temp_idx):
        print("No comments found to display.")
    else:
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])

print("\nPhase 1 and Phase 2 text cleaning complete for summary, description, and comments.")

Applying final whitespace normalization pass...

Text after FINAL whitespace normalization (first 2 rows):
Cleaned Summary:
0                                                                        emfdev add coupon benefit
1    tata digital prod 1000006 tier downgrade job saves wrong note for tier renew downgrade reason

Cleaned Description:
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [151]:
# import pandas as pd

# # Ensure pandas display options are set to show full column width
# # This is good practice if you're also looking at DataFrames,
# # but for printing a single string, it's not strictly necessary.
# pd.set_option('display.max_colwidth', None)

# # Check if the DataFrame is not empty and the column exists
# if not df.empty and 'cleaned_comments' in df.columns:
#     # Access the first item in the 'cleaned_comments' Series
#     first_cleaned_comment = df['cleaned_comments'].iloc[0]
    
#     print("Full content of the first 'cleaned_comments' entry:")
#     print(first_cleaned_comment)
# else:
#     if df.empty:
#         print("The DataFrame 'df' is empty.")
#     else:
#         print("Column 'cleaned_comments' not found in the DataFrame.")


In [152]:
# df.head(3)

In [153]:
import re

# --- Step 9.6 (New): Remove Domain-Specific Data Patterns (Corrected for Warnings & Logic) ---

def remove_domain_specific_data(text):
    """Replace domain-specific data dumps in ``text`` with placeholder tokens.

    All patterns are applied case-insensitively with ``^``/``$`` anchored per
    line (``re.MULTILINE | re.IGNORECASE``). Three kinds of content are replaced:

    * a line made of three or more "data tokens" -> ``<DATA_BLOB>``; a
      ``customerid ...`` header line directly above such a blob is folded into
      the same token;
    * a line starting with a rule keyword (``currentcustomer.``,
      ``currentevent.``, ``ruleN``, ``rulesetN``, ``actiontopN``) that also
      contains ``=``, ``isbefore``, ``isafter`` or ``contains``
      -> ``<RULE_CONFIGURATION>``;
    * a line starting with ``request id`` followed by 20+ hex characters
      -> ``<LOG_IDENTIFIERS>``.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text with all whitespace runs collapsed to single spaces
        and the ends stripped. Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    # Case-insensitivity comes from this flag on every call. The pattern used
    # to embed a global "(?i)" in the middle of data_token, which is a
    # DeprecationWarning on Python 3.6-3.10 and a hard re.error on 3.11+.
    flags = re.MULTILINE | re.IGNORECASE

    # A data token is one of:
    #   - an integer or decimal number,
    #   - any run of 8+ characters from [a-zA-Z0-9_.-] (ordinary long words
    #     qualify too, not only IDs),
    #   - the literal word "null".
    data_token = r'(?:\b\d+(?:\.\d+)?\b|\b[a-zA-Z0-9_.-]{8,}\b|\bnull\b)'

    # At least 3 data tokens in a row. NOTE: \s also matches newlines, so one
    # match can swallow several consecutive data lines.
    data_tokens_on_line = r'(?:' + data_token + r'\s*){3,}'

    # 1. Data blobs, plus a "customerid [issualdate]" header sitting right above one.
    header_pattern_str = r'(?:^customerid\s*(?:issualdate\s*)?.*?$)'
    data_line_pattern_str = r'^\s*' + data_tokens_on_line + r'\s*$'

    # Replace every line (or run of lines) that consists only of data tokens.
    text = re.sub(data_line_pattern_str, ' <DATA_BLOB> ', text, flags=flags)

    # If a header line is immediately followed by a blob token, merge the two.
    # A header that is NOT followed by a blob is intentionally left in place.
    text = re.sub(header_pattern_str + r'\n(\s*<DATA_BLOB>\s*)', ' <DATA_BLOB> ', text, flags=flags)

    # 2. Rule expressions / configuration details: a line that begins with a
    #    rule keyword and looks like an assignment or comparison.
    rule_keywords = r'(?:currentcustomer\.|currentevent\.|rule\d+|ruleset\d+|actiontop\d+)'
    complex_rule_line_str = r'^\s*' + rule_keywords + r'[^=\n]+(?:=|isbefore|isafter|contains)[^\n]+$'
    text = re.sub(complex_rule_line_str, ' <RULE_CONFIGURATION> ', text, flags=flags)

    # 3. Log lines of the form "request id <long hex string> ...".
    text = re.sub(r'^\s*request id\s+[a-f0-9]{20,}.*?$', ' <LOG_IDENTIFIERS> ', text, flags=flags)

    # Consolidate adjacent placeholder tokens. \1 refers to the LAST repetition
    # of the group, so a run of several tokens collapses to its final token.
    text = re.sub(r'(\s*<(?:DATA_BLOB|RULE_CONFIGURATION|LOG_IDENTIFIERS)>\s*)+', r' \1 ', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# --- Run Step 9.6 over the DataFrame ---
print("Applying Step 9.6 (Corrected for Warnings & Logic): Remove Domain-Specific Data Patterns...")

# Only the comments column goes through this step; the summary and
# description variants are kept below for reference but stay disabled.
df['cleaned_comments'] = df['cleaned_comments'].apply(remove_domain_specific_data)
# df['cleaned_summary'] = df['cleaned_summary'].apply(remove_domain_specific_data)
# df['cleaned_description'] = df['cleaned_description'].apply(remove_domain_specific_data)

print("\nText after Step 9.6 (Corrected for Warnings & Logic - Remove Domain-Specific Data Patterns):")
pd.set_option('display.max_colwidth', None)  # show cell text untruncated

# Show one sample row: reuse first_comment_idx when an earlier cell left a
# usable one behind, otherwise fall back to the lowest index label among rows
# whose raw comments field is non-empty.
if 'first_comment_idx' in locals() and pd.notna(first_comment_idx) and first_comment_idx in df.index:
    print(f"\nExample of 'cleaned_comments' for ticket index {first_comment_idx}:")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()  # relies on comments_col from an earlier cell
    if pd.isna(temp_idx):
        print("No comments found to display.")
    else:
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])

# --- Test with your specific example text ---
your_example_text = """fyi verma reddy can you please share more details here like promotion name target event details and impacted users if it is aware. reddy yadav could you please help me baldi i m pulling the list of users promotion ids target ids. will add to ticket by afternoon baldi kailasanathan i updated the ticket with a sheet. of 3 cases only 1 is ready to be picked. the other 2 need config fixed first and i see that it is still not closed. i will check with implementation team and get back with an eta. reddy i don t have access to the sheet looking at the ticket description this looks to be at target loyalty. can you please share the access with everyone kailasanathan done kailasanathan config change is doen for below orgs and can be picked for cleanup.. once the cleanup script is ready and implemented for these 3 orgs we can do the cleanup for rest of them.ready for cleanup orglumen 9000209roche 9000284northside - 9000248cc reddy the case 4 in the sheet mentions update target achieved value for user-target .usertarget table s achieved value stores the sum of tracked targetvalue of usertargeteventlog entries. basically there is one to many relationship between usertarget and usertargeteventlog.please let me know which entries of usertargeteventlog should be excluded from corresponding usertarget. then only i can update usertarget s achievedvalue targetachievedemfmessagesent columns and negate the targetachievedeventlog and unifiedtargeteventlog. case 2 - it mentions that target is achieved but incentives are not earned.i started with org 9000209 and found data discrepancy. the sheet for case2 contains users who already have points from the target completion events. i have mentioned already allocated points for users of org 9000209 in same sheet.please let me know what to do for these users as these users fall in case1. cleanup for case 12 and 16 is done. tiwari - for case 2 most likely user must have been awarded for one cycle only. 
can you verify for other cycle too reddy can you connect with sourabh once on the cleanup hi yadav in the cleanup sheet for case 2 usertarget id of corresponding cycle is also mentioned. hence i tried only the usertarget ids which are mentioned. if there are other cycles of users which needs to be checked then they will have different usertarget ids. can you get the correct usertarget ids added in the sheet reddy - can you confirm if the document has the updated data with correct usertargetid in place yadav yes but i can re-run and validate if any new user targets were added since sunday. will connect with saurabh we are waiting for approval from optum to run the cleanup here. reddy tiwari is there an update on the approval from optum and cleanup being run a timeline would be great for this as it is impacting roughly 20-30 open incidents. cc .sharif morgan - tiwari won t be able to help here. reddy yadav can help. verma cc. morgan there are many tickets linked to the stride config related clean ups. could you please expedite this between nitish and prateek cc baldi peterson verma please reach out to nitish and align with him directly. kailey is on ops team and is getting stuck as an inbetween. updtae optum has approved the clean up on friday. shared the final list for cleanup to sourabh. we should be able to execute this on tuesday once tiwari is back.cc peterson morgan verma baldi cleanup for case 1.1 1.2 and 2.2 are done. case 2.1 is in progress currently. hi nitish cleanup for case 2.1 is also complete.please verify once from your end.cc baldi verma hi reddy after replaying stride target completed events again for benco org for list following are the results following users didn t get the points because they have targetcompleted before promotion was issued to them. hence validation failed warn promotion with id 2095 was earned by the customer but falls outside the valid promotion expiry period. 
customerid issualdate 3504875 20250115 21 07 57 4314072 20250212 15 42 46 3460262 20250220 10 26 23 4704136 20250114 17 27 25 4887108 20250224 09 16 16 three users got points in first run which were present in but they are also present in . overall summary users who got points from 1st and 2nd run including customerid 4251288 4813126 4495592 4441551 4384604 3504875 4368652 4400329 4314072 3812372 4142994 3460262 4421770 4704136 4366648 4103750 4384604 4887108 4005907 users who didn t get points from 1st and 2nd run including. this has some customers who should be excluded as manual allocation was done and 5 customer from point 1. - customerid 4251288 4813126 4495592 4441551 4384604 3504875 4368652 4400329 4314072 3812372 4142994 3460262 4421770 4704136 4366648 4103750 4384604 4887108 4005907 -cc baldi tiwari awesome saurabh. thank you for the update. for the 5 users whose target completed event falls prior to promotion enrollment can we unenroll re-enroll fire bes to give them their reward. we normally would not do this but they also have genuine events that were received post promotion issual that would have achieved their target properly post issual if not for our config issue in jan that counted pre issual events. hi reddy can i update promotion issual date to 1st jan for these users and replay targetcompleted events. it will be quicker and less error prone can we revert the issual date back again after the firing reporting may be impacted if we change the issual date tiwari yeah i have done the cleanup in mentioned way and reverted the issual date back to original. points are awarded to remaining 5 customers of benco.now lumen org is remaining where choice reward was not present in payload. 
hi nitish for lumen 9000209 points are awarded to eligible customers after updating choice reward and promotion issualdate.for global atlantic points are awarded to one customer after updating choice reward and promotion issualdate.other details are mentioned in the sheet where points are not awarded. most of them have hit capping limit.everything seems to be done on this ticket closing it now. please let us know if anything else remains.cc baldi verma"""
# Smoke test: run the sample comment thread defined above through Step 9.6
# and print the cleaned result under a banner so it is easy to spot in the output.
print("\n--- Processing Your Specific Example Text ---")
processed_example = remove_domain_specific_data(your_example_text)
print(processed_example)



Example of 'cleaned_comments' for ticket index 0:


--- Processing Your Specific Example Text ---
fyi verma reddy can you please share more details here like promotion name target event details and impacted users if it is aware. reddy yadav could you please help me baldi i m pulling the list of users promotion ids target ids. will add to ticket by afternoon baldi kailasanathan i updated the ticket with a sheet. of 3 cases only 1 is ready to be picked. the other 2 need config fixed first and i see that it is still not closed. i will check with implementation team and get back with an eta. reddy i don t have access to the sheet looking at the ticket description this looks to be at target loyalty. can you please share the access with everyone kailasanathan done kailasanathan config change is doen for below orgs and can be picked for cleanup.. once the cleanup script is ready and implemented for these 3 orgs we can do the cleanup for rest of them.ready for cleanup orglumen 9000209roche 9

In [154]:
# df.head(3)

In [155]:
import re

# --- Step 9.7: Remove/Replace Numbers (while trying to preserve version-like patterns) ---

def remove_or_replace_numbers(text):
    """Strip standalone numbers from ``text`` while keeping version-like tokens.

    Version-like substrings (``1.2.3``, ``1.2.3.4``, ``v1.2``, ``v1.2.3``,
    ``name-1.2``) are swapped for numbered placeholders, every remaining
    standalone integer/decimal is replaced by a space, and the placeholders
    are then swapped back. Digits glued to letters (``1st``, ``9000209roche``)
    are not standalone and are left alone.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text with whitespace collapsed and stripped; ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""

    token_template = "||VERSION_PLACEHOLDER_{}||"
    shielded = []  # shielded[k] is the original substring behind placeholder k

    version_regexes = (
        r'\b\d+\.\d+\.\d+(?:\.\d+)?\b',         # 1.2.3 or 1.2.3.4
        r'\bv\d+\.\d+(?:\.\d+)?\b',             # v1.2 or v1.2.3
        r'\b[a-zA-Z_][a-zA-Z0-9_]*-\d+\.\d+\b'  # project-1.2
    )

    # Phase 1: shield version strings. Each pattern is searched on the text as
    # left by the previous pattern; hits are consumed right-to-left so the
    # spans of the hits still to be processed remain valid after each splice.
    for regex in version_regexes:
        hits = list(re.finditer(regex, text))
        while hits:
            hit = hits.pop()
            token = token_template.format(len(shielded))
            shielded.append(hit.group(0))
            text = text[:hit.start()] + token + text[hit.end():]

    # Phase 2: blank out whole numbers / decimals. The digit inside a
    # placeholder follows "_" (a word character), so \b never matches there.
    text = re.sub(r'\b\d+(?:\.\d+)?\b', ' ', text)

    # Phase 3: put the shielded strings back, highest placeholder number first.
    for slot in range(len(shielded) - 1, -1, -1):
        text = text.replace(token_template.format(slot), shielded[slot], 1)

    # Collapse the gaps left behind by the removed numbers.
    return re.sub(r'\s+', ' ', text).strip()

# --- Run Step 9.7 over the DataFrame ---
print("Applying Step 9.7: Remove/Replace Numbers...")

# Only the comments column goes through this step; the summary and
# description variants are kept below for reference but stay disabled.
df['cleaned_comments'] = df['cleaned_comments'].apply(remove_or_replace_numbers)
# df['cleaned_summary'] = df['cleaned_summary'].apply(remove_or_replace_numbers)
# df['cleaned_description'] = df['cleaned_description'].apply(remove_or_replace_numbers)


print("\nText after Step 9.7 (Remove/Replace Numbers):")
pd.set_option('display.max_colwidth', None)  # show cell text untruncated

# Show one sample row: reuse first_comment_idx when an earlier cell left a
# usable one behind, otherwise fall back to the lowest index label among rows
# whose raw comments field is non-empty.
if 'first_comment_idx' in locals() and pd.notna(first_comment_idx) and first_comment_idx in df.index:
    print(f"\nExample of 'cleaned_comments' for ticket index {first_comment_idx}:")
    print(df.loc[first_comment_idx, 'cleaned_comments'])
else:
    temp_idx = df[df[comments_col].fillna('').str.len() > 0].index.min()  # relies on comments_col from an earlier cell
    if pd.isna(temp_idx):
        print("No comments found to display.")
    else:
        print(f"(Ticket index {temp_idx})")
        print(df.loc[temp_idx, 'cleaned_comments'])

# --- Test with your specific example text that contained many numbers ---
your_number_heavy_text = """aying stride target completed events again for benco org for list following are the results following users didn t get the points because they have targetcompleted before promotion was issued to them. hence validation failed warn promotion with id 2095 was earned by the customer but falls outside the valid promotion expiry period. customerid issualdate 3504875 20250115 21 07 57 4314072 20250212 15 42 46 3460262 20250220 10 26 23 4704136 20250114 17 27 25 4887108 20250224 09 16 16 three users got points in first run which were present in but they are also present in . overall summary users who got points from 1st and 2nd run including customerid 4251288 4813126 4495592 4441551 4384604 3504875 4368652 4400329 4314072 3812372 4142994 3460262 4421770 4704136 4366648 4103750 4384604 4887108 4005907 users who didn t get points from 1st and 2nd run including. this has some customers who should be excluded as manual allocation was done and 5 customer from point 1. - customerid 4251288 4813126 4495592 4441551 4384604 3504875 4368652 4400329 4314072 3812372 4142994 3460262 4421770 4704136 4366648 4103750 4384604 4887108 4005907 -cc baldi tiwari awesome saurabh. thank you for the update. for the 5 users whose target completed event falls prior to promotion enrollment can we unenroll re-enroll fire bes to give them their reward. we normally would not do this but they also have genuine events that were received post promotion issual that would have achieved their target properly post issual if not for our config issue in jan that counted pre issual events. hi reddy can i update promotion issual date to 1st jan for these users and replay targetcompleted events. it will be quicker and less error prone can we revert the issual date back again after the firing reporting may be impacted if we change the issual date tiwari yeah i have done the cleanup in mentioned way and reverted the issual date back to original. 
points are awarded to remaining 5 customers of benco.now lumen org is remaining where choice reward was not present in payload. hi nitish for lumen 9000209 points are awarded to eligible customers after updating choice reward and promotion issualdate.for global atlantic points are awarded to one customer after updating choice reward and promotion issualdate.other details are mentioned in the sheet where points are not awarded. most of them have hit capping limit.everything seems to be done on this ticket closing it now. please let us know if anything else remains.cc baldi verma"""
# Smoke test: run the number-heavy sample defined above through Step 9.7 only.
print("\n--- Processing Your Number-Heavy Example Text (After Number Removal) ---")
processed_number_example = remove_or_replace_numbers(your_number_heavy_text)
# NOTE: the sample is fed to this single step in isolation, so the printed text
# is not what the full pipeline would produce. To see the real end-to-end
# effect, run your_number_heavy_text through all earlier cleaning steps first
# and only then through remove_or_replace_numbers.
print(processed_number_example)

Applying Step 9.7: Remove/Replace Numbers...

Text after Step 9.7 (Remove/Replace Numbers):

Example of 'cleaned_comments' for ticket index 0:


--- Processing Your Number-Heavy Example Text (After Number Removal) ---
aying stride target completed events again for benco org for list following are the results following users didn t get the points because they have targetcompleted before promotion was issued to them. hence validation failed warn promotion with id was earned by the customer but falls outside the valid promotion expiry period. customerid issualdate three users got points in first run which were present in but they are also present in . overall summary users who got points from 1st and 2nd run including customerid users who didn t get points from 1st and 2nd run including. this has some customers who should be excluded as manual allocation was done and customer from point . - customerid -cc baldi tiwari awesome saurabh. thank you for the update. for the users whose target c

In [156]:
# Sanity check: list every column currently on df, including the cleaned_*
# columns added by the cleaning cells.
print("Column names in your DataFrame:")
print(df.columns)

Column names in your DataFrame:
Index(['ticket_id', 'summary', 'description', 'status', 'priority', 'reporter',
       'assignee', 'created_at', 'updated_at', 'labels', 'components',
       'owned_by_team', 'brand', 'product', 'geo_region', 'environment',
       'root_cause', 'sprint', 'comments', 'url', 'issue_type',
       'cleaned_summary', 'cleaned_description', 'cleaned_comments'],
      dtype='object')


In [157]:
# Sanity check: df.shape is a (row_count, column_count) tuple.
print(f"\nCurrent shape of the DataFrame (rows, columns): {df.shape}")


Current shape of the DataFrame (rows, columns): (10, 24)


In [160]:
import pandas as pd

# Assuming 'df' is your DataFrame and 'cleaned_comments' is the column with the cleaned text.
# If 'cleaned_comments' might have NaN values (though our pipeline tries to make them empty strings),
# fillna('') ensures the .split() method doesn't error out.
# .str.split() splits the string in each cell by whitespace by default, resulting in a list of words.
# .str.len() then gets the length of each of these lists (i.e., the word count for that cell).
# .max() finds the maximum value in the resulting Series of word counts.

# Word-count cut-off above which a cleaned comment is reported as "long".
# The same constant drives both the filter and the printed message, so the
# label can no longer disagree with the number actually used (the message
# used to say 200 while the filter compared against 400).
LONG_COMMENT_WORD_THRESHOLD = 400

if 'cleaned_comments' in df.columns:
    # Words per row: split on whitespace, then take the length of each list.
    word_counts = df['cleaned_comments'].fillna('').str.split().str.len()

    # Number of rows whose word count is strictly above the threshold.
    long_comments_count = int((word_counts > LONG_COMMENT_WORD_THRESHOLD).sum())
    print(f"Number of comments with more than {LONG_COMMENT_WORD_THRESHOLD} words: {long_comments_count}")

    # Summary statistics over the per-row word counts.
    print("\nWord count statistics:")
    print(f"Mean word count: {word_counts.mean():.1f}")
    print(f"Median word count: {word_counts.median():.1f}")
    print(f"Maximum word count: {word_counts.max()}")
else:
    print("Column 'cleaned_comments' not found in the DataFrame.")


Number of comments with more than 200 words: 1

Word count statistics:
Mean word count: 116.4
Median word count: 23.5
Maximum word count: 442
