# Maintenance Report NLP Analysis

This notebook implements an NLP pipeline for analyzing maintenance reports using BERT and NLTK.

In [1]:
import os
import platform
import re
import torch
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification, pipeline, logging
from spellchecker import SpellChecker
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Suppress warnings from the transformers library
logging.set_verbosity_error()

# Download required NLTK data.
# Each entry pairs the path probed with ``nltk.data.find`` with the package
# name handed to ``nltk.download``. Newer NLTK releases look up ``punkt_tab``
# and ``averaged_perceptron_tagger_eng`` instead of the older resources, so
# both generations are requested to keep word_tokenize/pos_tag working
# regardless of the installed NLTK version.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng'),
)

for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Only hit the network when the resource is not already installed
        nltk.download(_package_name, quiet=True)

In [2]:
# # Check if GPU is available
# if platform.system() == 'Darwin':  # macOS
#     if torch.backends.mps.is_available() and torch.backends.mps.is_built():
#         device = torch.device("mps")
#         print("MPS backend is available. PyTorch is using the GPU.")
#     else:
#         device = torch.device("cpu")
#         print("MPS backend is not available. PyTorch is using the CPU.")
# else:  # Windows or other platforms
#     if torch.cuda.is_available():
#         device = torch.device("cuda")
#         print("CUDA backend is available. PyTorch is using the GPU.")
#     else:
#         device = torch.device("cpu")
#         print("CUDA backend is not available. PyTorch is using the CPU.")

In [3]:
import pyodbc
import pandas as pd

In [4]:
import os
import pandas as pd
import pyodbc
from contextlib import contextmanager
import logging
from typing import Optional, List, Tuple
import time

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
CSV_FILE_PATH = 'data/ice_makers.csv'

# Connection settings for the SQL Server source. Every value can be overridden
# through a COOLSYS_DB_* environment variable so credentials do not have to
# live in the notebook; the literals are only fallbacks that keep existing
# setups working.
# NOTE(review): the fallback password is a credential committed in plain
# text -- rotate it and remove the default once COOLSYS_DB_PWD is provisioned
# everywhere this notebook runs.
DB_CONFIG = {
    'driver': os.environ.get('COOLSYS_DB_DRIVER', 'ODBC Driver 17 for SQL Server'),
    'server': os.environ.get('COOLSYS_DB_SERVER', '35.184.99.218'),
    'database': os.environ.get('COOLSYS_DB_NAME', 'coolsys'),
    'uid': os.environ.get('COOLSYS_DB_UID', 'sqlserver'),
    # Raw string: the value contains a backslash, and '\H' in a regular
    # literal is an invalid escape sequence that newer Pythons warn about.
    'pwd': os.environ.get('COOLSYS_DB_PWD', r'Ybz8Vq+|>\H/<2py'),
}

# Selects the requested/performed work text from the work order header and
# its equipment rows, plus the inventory description, keeping every joined row
# where any of those five columns contains 'ICE MACHINE'.
# NOTE(review): both child tables are inner-joined on wrkordr_rn only, so a
# work order with several equipment rows and several inventory rows yields one
# output row per combination -- confirm this duplication is intended.
SQL_QUERY = """
SELECT
    w.wrkordr_wrk_rqstd,
    w.wrkordr_wrk_prfrmd,
    w2.wrkordreqpmnt_wrk_rqstd,
    w2.wrkordreqpmnt_wrk_prfrmd,
    w3.wrkordrinvntry_dscrptn
FROM
    coolsys.dbo.wrkordr w
    INNER JOIN coolsys.dbo.wrkordrinvntry w3 ON w.wrkordr_rn = w3.wrkordr_rn
    INNER JOIN coolsys.dbo.wrkordreqpmnt w2 ON w.wrkordr_rn = w2.wrkordr_rn
WHERE
    w.wrkordr_wrk_rqstd LIKE '%ICE MACHINE%'
    OR w.wrkordr_wrk_prfrmd LIKE '%ICE MACHINE%'
    OR w2.wrkordreqpmnt_wrk_rqstd LIKE '%ICE MACHINE%'
    OR w2.wrkordreqpmnt_wrk_prfrmd LIKE '%ICE MACHINE%'
    OR w3.wrkordrinvntry_dscrptn LIKE '%ICE MACHINE%';
"""

@contextmanager
def database_connection():
    """Yield an open pyodbc connection built from ``DB_CONFIG``.

    The connection is opened with a 30 second timeout. A ``pyodbc.Error``
    raised while connecting, or while the caller is using the connection, is
    logged and re-raised. The connection is closed on exit in every case.
    """
    connection = None
    try:
        # The driver name is wrapped in braces, as in DRIVER={...}
        fields = (
            ('DRIVER', f'{{{DB_CONFIG["driver"]}}}'),
            ('SERVER', DB_CONFIG["server"]),
            ('DATABASE', DB_CONFIG["database"]),
            ('UID', DB_CONFIG["uid"]),
            ('PWD', DB_CONFIG["pwd"]),
        )
        connection_string = ';'.join(f'{key}={value}' for key, value in fields)
        connection = pyodbc.connect(connection_string, timeout=30)
        logger.info("Database connection established")
        yield connection
    except pyodbc.Error as exc:
        logger.error(f"Database connection error: {str(exc)}", exc_info=True)
        raise
    finally:
        if connection:
            connection.close()
            logger.info("Database connection closed")

def fetch_data_from_db() -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """Run ``SQL_QUERY`` against the database and return the result.

    Returns:
        ``(DataFrame, None)`` when the query succeeds, or ``(None, message)``
        when any exception occurs; the failure is logged, not raised.
    """
    try:
        with database_connection() as connection:
            # Use UTF-8 for both decoding and encoding on this connection
            for sql_type in (pyodbc.SQL_CHAR, pyodbc.SQL_WCHAR):
                connection.setdecoding(sql_type, encoding='utf-8')
            connection.setencoding(encoding='utf-8')

            # Time the query so slow fetches show up in the log
            started_at = time.time()
            frame = pd.read_sql(SQL_QUERY, connection)
            logger.info(f"Query executed in {time.time() - started_at:.2f} seconds")

            return frame, None
    except Exception as exc:
        failure = f"Error fetching data: {str(exc)}"
        logger.error(failure, exc_info=True)
        return None, failure

def save_to_csv(df: pd.DataFrame, filepath: str) -> None:
    """Write *df* to *filepath* as CSV (no index), creating parent directories.

    Args:
        df: DataFrame to persist.
        filepath: Destination path. It may be a bare file name; compression is
            inferred from the extension.

    Raises:
        Exception: any error from directory creation or writing is logged and
            re-raised unchanged.
    """
    try:
        # dirname() is '' for a bare file name, and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when one is named.
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)

        df.to_csv(filepath, index=False, compression='infer')
        logger.info(f"Data saved to {filepath}")
    except Exception as e:
        logger.error(f"Error saving to CSV: {str(e)}", exc_info=True)
        raise

def load_data() -> pd.DataFrame:
    """Return the work-order data, preferring the CSV cache over the database.

    If ``CSV_FILE_PATH`` exists it is read and returned. Otherwise the data is
    fetched from the database, written to the cache and returned.

    Raises:
        RuntimeError: the database fetch reported an error.
        ValueError: the database returned no rows.
        Exception: anything else is logged and re-raised.
    """
    try:
        # Cache hit: skip the database entirely
        if os.path.exists(CSV_FILE_PATH):
            logger.info(f"Loading data from cache: {CSV_FILE_PATH}")
            return pd.read_csv(CSV_FILE_PATH)

        logger.info("Cache not found, fetching from database")
        frame, failure = fetch_data_from_db()

        if failure:
            raise RuntimeError(failure)

        if frame is None or frame.empty:
            raise ValueError("No data retrieved from database")

        # Populate the cache for the next run
        save_to_csv(frame, CSV_FILE_PATH)
        return frame

    except Exception as exc:
        logger.error(f"Error in load_data: {str(exc)}", exc_info=True)
        raise

# Main execution: load the work-order data into `df` and log its shape.
# Any failure is logged with its traceback and then re-raised.
try:
    df = load_data()
    logger.info(f"DataFrame shape: {df.shape}")
except Exception:
    logger.error("Failed to load data", exc_info=True)
    raise

INFO:__main__:Loading data from cache: data/ice_makers.csv
INFO:__main__:DataFrame shape: (296547, 5)


In [5]:
# Sanity check: print the (rows, columns) shape and preview the first rows
print(df.shape)
df.head()

(296547, 5)


Unnamed: 0,wrkordr_wrk_rqstd,wrkordr_wrk_prfrmd,wrkordreqpmnt_wrk_rqstd,wrkordreqpmnt_wrk_prfrmd,wrkordrinvntry_dscrptn
0,PREVENTIVE MAINT -- 2012 JAN DEEP DIVE\rFILL...,,PREVENTIVE MAINT -- 2012 JAN DEEP DIVE\rFILL...,ST713886-PERFORMED PER SCOPE-NO PROBLEMS NOTED,ICE MACHINE CLEANER NICKEL SAFE
1,PREVENTIVE MAINT -- 2012 JAN DEEP DIVE\rFILL...,,PREVENTIVE MAINT -- 2012 JAN DEEP DIVE\rFILL...,ST713890 - PERFORMED PM PER SCOPE,ICE MACHINE CLEANER NICKEL SAFE
2,SERVICE CALL -- 2012 JAN DEEP DIVE\rFILL OUT...,,SERVICE CALL -- 2012 JAN DEEP DIVE\rFILL OUT...,ST713888-PERFORMED PER SCOPE-NO PROBLEMS NOTED,ICE MACHINE CLEANER NICKEL SAFE
3,PREVENTIVE MAINT -- 2012 JAN DEEP DIVE\rFILL...,,PREVENTIVE MAINT -- 2012 JAN DEEP DIVE\rFILL...,COMPLETED PER SCOPE - NO PROBLEMS NOTED - CMP ...,ICE MACHINE CLEANER NICKEL SAFE
4,ICE MACHINE NOT WORKING - MAKING LOUD NOISE WH...,I/M R/R COND FAN MOTOR,ICE MACHINE NOT WORKING - MAKING LOUD NOISE WH...,ST 212897 - REMOVED AND REPLACED COND FAN MOTO...,FAN MTR 240V 606/806/1006 3/4 MS X MS


In [6]:
import re
import pandas as pd
import logging

# Configure logging if not already configured
logger = logging.getLogger(__name__)

# Define columns to combine
columns_to_combine = ['wrkordr_wrk_prfrmd', 'wrkordreqpmnt_wrk_prfrmd', 'wrkordrinvntry_dscrptn']

# Pre-compile the regex pattern for better performance.
# Everything that is not a letter, digit or whitespace (and any carriage
# return) is replaced by a space.
pattern = re.compile(r'[^a-zA-Z0-9\s]|\r')

try:
    # Keep only the source columns; `df` is rebound to this reduced copy
    df = df[columns_to_combine].copy()

    # Build one cleaned text column from the three source columns
    df['combined_column'] = (
        df[columns_to_combine]
        .fillna('')  # Replace NaN with empty string
        .astype(str)  # Convert to string type (more compatible than string[pyarrow])
        .agg(' '.join, axis=1)  # Join columns with space
        .str.replace(pattern, ' ', regex=True)  # Clean text using pre-compiled pattern
    )

    # Clean up extra whitespace. regex=True must be explicit: pandas >= 2.0
    # defaults to a literal replacement, which would search for the text
    # "\s+" and leave runs of whitespace untouched.
    df['combined_column'] = (
        df['combined_column']
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
    )

    # Free memory: the source columns are no longer needed
    df.drop(columns=columns_to_combine, inplace=True)

except Exception as e:
    logger.error(f"Error processing DataFrame: {str(e)}")
    raise



In [7]:
# Select a reproducible random sample of up to 100,000 records from the DataFrame.
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# without replacement), so the sample size is capped at the available row count.
df_sample = df.sample(n=min(100000, len(df)), random_state=42)

In [8]:
from functools import lru_cache
import torch
from transformers import (
    BertTokenizer, 
    BertForTokenClassification, 
    logging as transformers_logging,
    AutoTokenizer,
    AutoModelForTokenClassification
)
import logging
from typing import Tuple, Optional
import os
import sys
from transformers import __version__ as transformers_version

# Configure detailed logging.
# force=True removes handlers installed by an earlier basicConfig call before
# applying this configuration; without it, basicConfig does nothing once the
# root logger has handlers, so the DEBUG level and the format below would
# never take effect.
logging.basicConfig(
    level=logging.DEBUG,  # Changed to DEBUG for more detailed logs
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)
transformers_logging.set_verbosity_info()

def check_environment():
    """Log the Python, PyTorch and Transformers versions and CUDA availability.

    When CUDA is available, the CUDA version and the name of GPU 0 are logged
    as well.
    """
    has_cuda = torch.cuda.is_available()
    for line in (
        f"Python version: {sys.version}",
        f"PyTorch version: {torch.__version__}",
        f"Transformers version: {transformers_version}",
        f"CUDA available: {has_cuda}",
    ):
        logger.info(line)

    if not has_cuda:
        return

    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"GPU device: {torch.cuda.get_device_name(0)}")

@lru_cache(maxsize=1)
def get_device() -> torch.device:
    """Pick the compute device, preferring CUDA, then MPS, then CPU.

    The result is cached, so detection and its log line happen only once.
    Any error during detection is logged as a warning and CPU is returned.
    """
    try:
        if torch.cuda.is_available():
            backend, message = "cuda", "CUDA device detected"
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            backend, message = "mps", "MPS device detected"
        else:
            backend, message = "cpu", "Using CPU device"
        logger.info(message)
        return torch.device(backend)
    except Exception as exc:
        logger.warning(f"Error detecting device, defaulting to CPU: {str(exc)}")
        return torch.device("cpu")

def _load_tokenizer(model_name: str, cache_dir: str):
    """Load the tokenizer with AutoTokenizer, falling back to BertTokenizer."""
    options = dict(cache_dir=cache_dir, use_fast=True, model_max_length=512)
    try:
        return AutoTokenizer.from_pretrained(model_name, **options)
    except Exception as exc:
        logger.warning(f"AutoTokenizer failed, trying BertTokenizer: {str(exc)}")
        return BertTokenizer.from_pretrained(model_name, **options)


def _load_token_classifier(model_name: str, cache_dir: str):
    """Load the model with the Auto class, falling back to BertForTokenClassification."""
    options = dict(cache_dir=cache_dir, return_dict=True, low_cpu_mem_usage=True)
    try:
        return AutoModelForTokenClassification.from_pretrained(model_name, **options)
    except Exception as exc:
        logger.warning(f"AutoModel failed, trying BertForTokenClassification: {str(exc)}")
        return BertForTokenClassification.from_pretrained(model_name, **options)


def load_model_components(model_name: str, device: torch.device) -> Tuple[Optional[BertTokenizer], Optional[BertForTokenClassification]]:
    """Load the tokenizer and token-classification model for *model_name*.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        device: Device the loaded model is moved to.

    Returns:
        ``(tokenizer, model)`` with the model on *device* and in eval mode, or
        ``(None, None)`` if any step fails (the error is logged, not raised).
    """
    try:
        # Set up caching directory under the user's home
        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
        os.makedirs(cache_dir, exist_ok=True)
        logger.info(f"Using cache directory: {cache_dir}")

        logger.info("Attempting to load tokenizer...")
        tokenizer = _load_tokenizer(model_name, cache_dir)
        logger.info("Tokenizer loaded successfully")

        logger.info("Attempting to load model...")
        model = _load_token_classifier(model_name, cache_dir)
        logger.info("Model loaded successfully")

        # Move model to device and switch to inference mode
        logger.info(f"Moving model to device: {device}")
        model = model.to(device)
        model.eval()

        # Inference only: this turns gradient tracking off for the whole
        # process, not just for this model.
        torch.set_grad_enabled(False)

        return tokenizer, model

    except Exception as exc:
        logger.error(f"Error loading model components: {str(exc)}", exc_info=True)
        return None, None

def initialize_model() -> Tuple[Optional[BertTokenizer], Optional[BertForTokenClassification], Optional[torch.device]]:
    """Log the environment, pick a device and load the first model that works.

    Candidate models are tried in order until one loads.

    Returns:
        ``(tokenizer, model, device)`` on success, or ``(None, None, None)``
        when every candidate fails or any other error occurs (logged).
    """
    candidate_models = (
        'vblagoje/bert-english-uncased-finetuned-pos',
        'bert-base-uncased',  # Fallback model
    )

    try:
        # Check environment first
        check_environment()

        # Get device (cached)
        device = get_device()
        logger.info(f"Using device: {device}")

        for candidate in candidate_models:
            logger.info(f"Attempting to load model: {candidate}")
            tokenizer, model = load_model_components(candidate, device)

            if tokenizer is None or model is None:
                logger.warning(f"Failed to load model: {candidate}, trying next option...")
                continue

            logger.info(f"Successfully loaded model: {candidate}")
            return tokenizer, model, device

        # Raised inside the try so it is logged by the handler below
        raise RuntimeError("All model loading attempts failed")

    except Exception as exc:
        logger.error(f"Error initializing model: {str(exc)}", exc_info=True)
        return None, None, None

# Initialize the model; any failure is logged with its traceback and re-raised
try:
    # Release cached CUDA memory before loading
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    tokenizer, model, device = initialize_model()
    # initialize_model signals failure by returning None for its components
    if tokenizer is None or model is None or device is None:
        raise RuntimeError("Model initialization failed")
    logger.info("Model initialized successfully")
except Exception:
    logger.error("Failed to initialize model", exc_info=True)
    raise

INFO:__main__:Python version: 3.11.10 (main, Sep  7 2024, 01:03:31) [Clang 16.0.0 (clang-1600.0.26.3)]
INFO:__main__:PyTorch version: 2.5.1
INFO:__main__:Transformers version: 4.46.2
INFO:__main__:CUDA available: False
INFO:__main__:MPS device detected
INFO:__main__:Using device: mps
INFO:__main__:Attempting to load model: vblagoje/bert-english-uncased-finetuned-pos
INFO:__main__:Using cache directory: /Users/nikolay_tishchenko/.cache/huggingface
INFO:__main__:Attempting to load tokenizer...
loading configuration file config.json from cache at /Users/nikolay_tishchenko/.cache/huggingface/models--vblagoje--bert-english-uncased-finetuned-pos/snapshots/46ec120264b121e8d92bef19b45c107d06d2cb99/config.json
Model config BertConfig {
  "_name_or_path": "vblagoje/bert-english-uncased-finetuned-pos",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_

In [9]:
from functools import lru_cache
import re
from typing import List, Tuple, Optional
from dataclasses import dataclass

@dataclass
class ProcessingContext:
    """Mutable parsing state: the verb being collected and its object words."""

    current_verb: Optional[str] = None
    current_objects: List[str] = None

    def __post_init__(self):
        # Give every instance its own list when none was supplied
        self.current_objects = [] if self.current_objects is None else self.current_objects

    def reset(self):
        """Forget the current verb and empty the object list in place."""
        self.current_verb = None
        self.current_objects.clear()

@lru_cache(maxsize=1024)
def is_potential_verb(word: str, tag: str) -> bool:
    """Return True if the tagged word should be treated as a verb (cached).

    Any tag starting with ``VB`` counts, as does an adjective (exactly
    ``JJ``) whose word ends in ``ed`` or ``en``.
    """
    if tag.startswith('VB'):
        return True
    return tag == 'JJ' and word.endswith(('ed', 'en'))

@lru_cache(maxsize=1024)
def is_object_component(tag: str, word: str) -> bool:
    """Return True if the tagged word can be part of an object phrase (cached).

    Accepted: nouns (``NN*``), gerunds (``VBG*``), the tag ``IN``, and
    adjectives (``JJ*``) whose word does not end in ``ed``.
    """
    if tag.startswith('JJ'):
        # A JJ* tag can match none of the other rules, so only the suffix decides
        return not word.endswith('ed')
    return tag.startswith(('NN', 'VBG')) or tag == 'IN'

# Compiled once at import time so every call reuses the same pattern objects
_AS_NEEDED_RE = re.compile(r'\bas needed\b')
_COMMA_AND_RE = re.compile(r'\s*,\s*and\s+')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text: str) -> str:
    """Normalize a maintenance note before tokenization.

    Steps, in order: lowercase the text, remove the phrase "as needed",
    rewrite ", and " as ", ", collapse whitespace runs to one space, and strip
    leading/trailing whitespace.

    Args:
        text: Raw note text.

    Returns:
        The normalized text.
    """
    text = text.lower()
    text = _AS_NEEDED_RE.sub('', text)
    text = _COMMA_AND_RE.sub(', ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

def process_compound_objects(tagged: List[Tuple[str, str]], start_idx: int) -> Tuple[List[str], int]:
    """Collect the run of object words that starts at *start_idx*.

    The word at *start_idx* is always included; following words are added
    while they satisfy ``is_object_component``.

    Returns:
        ``(words, last_index)`` where *last_index* is the index of the last
        word that was consumed.
    """
    collected = [tagged[start_idx][0]]
    last_idx = start_idx

    for entry in tagged[start_idx + 1:]:
        if not is_object_component(entry[1], entry[0]):
            break
        collected.append(entry[0])
        last_idx += 1

    return collected, last_idx

@lru_cache(maxsize=1)
def _get_spell_checker():
    """Return one shared SpellChecker instead of building a new one per record."""
    return SpellChecker()


@lru_cache(maxsize=1)
def _get_lemmatizer():
    """Return one shared WordNetLemmatizer instead of building a new one per record."""
    return WordNetLemmatizer()


@lru_cache(maxsize=4096)
def _correct_spelling(word: str) -> str:
    """Return the spell-corrected *word*, or *word* itself when no correction is found."""
    # correction() may return None when it has no candidate; fall back to the
    # original word so the verb (and its objects) are not silently discarded.
    return _get_spell_checker().correction(word) or word


def process_text(text: str, tokenizer, model, device) -> List[Tuple[str, str]]:
    """Extract ``(verb, object phrase)`` pairs from one maintenance note.

    The text is normalized with ``preprocess_text`` and tokenized and
    POS-tagged with NLTK. Each potential verb starts a new pair and the noun
    phrases that follow it become its object. A comma closes the current pair.
    Verbs are spell-corrected when read and lemmatized when the pair is stored.

    Args:
        text: Raw note text.
        tokenizer, model, device: Accepted to keep the call signature stable;
            the body does not use them.

    Returns:
        List of ``(lemmatized_verb, "object words")`` tuples. If any exception
        occurs, the error and traceback are printed and ``[]`` is returned.
    """
    try:
        lemmatizer = _get_lemmatizer()

        # Preprocess, tokenize and tag
        text = preprocess_text(text)
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)

        verb_object_pairs = []
        context = ProcessingContext()

        def add_pair():
            # Only complete pairs (a verb with at least one object word) are stored
            if context.current_verb and context.current_objects:
                lemmatized_verb = lemmatizer.lemmatize(context.current_verb, 'v')
                verb_object_pairs.append((lemmatized_verb, ' '.join(context.current_objects)))

        i = 0
        while i < len(tagged):
            word, tag = tagged[i]

            if is_potential_verb(word, tag):
                # Save previous pair if exists, then start a new one
                add_pair()
                context.current_verb = _correct_spelling(word)
                context.current_objects.clear()

            elif is_object_component(tag, word):
                if context.current_verb:
                    temp_objects, new_idx = process_compound_objects(tagged, i)
                    # Keep the run only if it contains at least one noun
                    if any(tagged[j][1].startswith('NN') for j in range(i, new_idx + 1)):
                        context.current_objects.extend(temp_objects)
                    # Skip past the words consumed by the run
                    i = new_idx

            elif word == ',':
                add_pair()
                context.reset()

            i += 1

        # Add final pair if exists
        add_pair()

        return verb_object_pairs

    except Exception as e:
        import traceback
        print(f"Error processing text: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        return []

In [10]:
# Lookup table from lowercase verb to task category
_TASK_CATEGORY_BY_VERB = {
    **dict.fromkeys(
        ('inspect', 'check', 'examine', 'monitor', 'observe', 'verify', 'test', 'diagnose', 'assess'),
        "Inspection",
    ),
    **dict.fromkeys(
        ('clean', 'brush', 'wash', 'wipe', 'descale', 'sanitize', 'flush', 'clear'),
        "Cleaning",
    ),
    **dict.fromkeys(
        ('lubricate', 'adjust', 'replace', 'repair', 'calibrate', 'install', 'fix', 'tighten', 'service'),
        "Maintenance",
    ),
}


def categorize_task(verb,obj):
    """Map a task's verb to "Inspection", "Cleaning", "Maintenance" or "Other".

    The verb is matched case-insensitively; *obj* is accepted but does not
    influence the result.
    """
    return _TASK_CATEGORY_BY_VERB.get(verb.lower(), "Other")

In [None]:
# Process all records in df_sample['combined_column']
from collections import Counter

print("Starting to process maintenance records...")

# Initialize counters for tasks and categories
all_verb_object_pairs = []  # Keep as list to track frequencies
task_frequencies = Counter()

# Drop missing texts once and reuse the result for counting and iterating
records = df_sample['combined_column'].dropna()
total_records = len(records)
print(f"Total records to process: {total_records}")

# Print progress roughly every 5%. max(1, ...) keeps the modulus non-zero
# when there are fewer than 20 records (total_records // 20 would be 0).
progress_interval = max(1, total_records // 20)

for idx, text in enumerate(records, 1):
    if idx % progress_interval == 0:
        print(f"Processing record {idx}/{total_records} ({(idx/total_records*100):.1f}%)")

    pairs = process_text(text, tokenizer, model, device)
    for pair in pairs:
        task_frequencies[tuple(pair)] += 1
        all_verb_object_pairs.append(tuple(pair))

print(f"\nTotal task occurrences: {len(all_verb_object_pairs)}")
print(f"Unique tasks: {len(task_frequencies)}")

# Group tasks by category with frequencies
tasks_by_category = {}
category_totals = {}

for (verb, obj), freq in task_frequencies.most_common():
    category = categorize_task(verb, obj)
    tasks_by_category.setdefault(category, []).append((verb, obj, freq))
    category_totals[category] = category_totals.get(category, 0) + freq

# Print tasks by category with frequencies
print("\nTask Categories Analysis")
print("=" * 50)

total_tasks = sum(category_totals.values())

for category in ["Other", "Maintenance", "Inspection", "Cleaning"]:
    # A category only appears here if it has at least one task, so
    # total_tasks and category_totals[category] are non-zero below.
    if category in tasks_by_category:
        print(f"\n{category} Tasks ({category_totals[category]} occurrences, {(category_totals[category]/total_tasks*100):.1f}%)")
        print("-" * 40)

        # Sort tasks by frequency (descending), then verb, within category
        sorted_tasks = sorted(tasks_by_category[category], key=lambda x: (-x[2], x[0]))

        for verb, obj, freq in sorted_tasks:
            percentage = (freq / category_totals[category]) * 100
            print(f"• {verb.capitalize()}: {obj} ({freq} times, {percentage:.1f}% of {category})")

# Print overall statistics
print("\nTask Distribution Statistics")
print("=" * 50)
for category, total in category_totals.items():
    percentage = (total / total_tasks) * 100
    unique_tasks = len(tasks_by_category[category])
    print(f"{category}: {total} occurrences ({percentage:.1f}%), {unique_tasks} unique tasks")

# Create a summary DataFrame with one row per category
summary_data = {
    'Category': [],
    'Total_Occurrences': [],
    'Unique_Tasks': [],
    'Percentage': []
}

for category in tasks_by_category:
    summary_data['Category'].append(category)
    summary_data['Total_Occurrences'].append(category_totals[category])
    summary_data['Unique_Tasks'].append(len(tasks_by_category[category]))
    summary_data['Percentage'].append((category_totals[category] / total_tasks) * 100)

summary_df = pd.DataFrame(summary_data)
print("\nSummary DataFrame:")
print("=" * 50)
print(summary_df.to_string(index=False, float_format=lambda x: '{:.1f}'.format(x)))

Attempting to convert .bin model on the fly to safetensors.


Starting to process maintenance records...
Total records to process: 100000


In [None]:
# Display the per-category summary table
summary_df

In [None]:
import os
import pandas as pd

# Create data directory if it doesn't exist
# NOTE(review): the CSV files below are written to the current working
# directory, not to 'data/' -- confirm which location is intended.
os.makedirs('data', exist_ok=True)

# Set pandas display options to prevent wrapping and ensure consistent formatting
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: f'{x:,}')

# Bucket every task by category in a single pass over the frequency table
category_names = ['Maintenance', 'Inspection', 'Cleaning', 'Other']
tasks_per_category = {name: [] for name in category_names}
for (verb, obj), freq in task_frequencies.items():
    bucket = tasks_per_category.get(categorize_task(verb, obj))
    if bucket is None:
        # Categories outside the four listed above are not reported
        continue
    bucket.append({
        'Verb': verb,
        'Object': obj,
        'Frequency': freq,
        'Task Description': f"{verb} {obj}".strip()  # Ensure no extra spaces
    })

# Create DataFrames for each category
category_dfs = {}
for category in category_names:
    category_tasks = tasks_per_category[category]

    if category_tasks:  # Only create DataFrame if there are tasks
        category_dfs[category] = pd.DataFrame(category_tasks)
        category_dfs[category] = category_dfs[category].sort_values('Frequency', ascending=False)

        print(f"\n{category} Tasks:")
        print("=" * 50)

        # Show the ten most frequent tasks, numbered from 1
        df_display = category_dfs[category].head(10).copy()
        df_display.index = range(1, len(df_display) + 1)  # Start index from 1
        print(df_display.to_string(
            justify='left',
            col_space={
                'Verb': 20,
                'Object': 40,
                'Frequency': 15,
                'Task Description': 45
            }
        ))

# Save DataFrames to CSV files (optional).
# The loop variable is deliberately not named `df`: that would overwrite the
# notebook-level DataFrame of the same name.
for category, category_df in category_dfs.items():
    if not category_df.empty:
        filename = f"{category.lower()}_tasks.csv"
        category_df.to_csv(filename, index=False)
        print(f"\nSaved {category} tasks to {filename}")

In [None]:
# Display summary of all tasks
print(f"\nMaintenance Tasks Analysis Summary ({pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')})")
print("=" * 120)

# Create a combined DataFrame of all tasks, tagging each row with its category.
# The loop variable is deliberately not named `df`: that would overwrite the
# notebook-level DataFrame of the same name.
all_tasks = [
    category_df.assign(Category=category)
    for category, category_df in category_dfs.items()
]
all_tasks_df = pd.concat(all_tasks, ignore_index=True)

# Create summary DataFrame: unique task count and total occurrences per category
summary_df = all_tasks_df.groupby('Category').agg({
    'Task Description': 'count',
    'Frequency': 'sum'
}).rename(columns={
    'Task Description': 'Unique Tasks',
    'Frequency': 'Total Occurrences'
})

# Add percentage columns (computed before the counts are turned into strings)
total_tasks = summary_df['Unique Tasks'].sum()
total_occurrences = summary_df['Total Occurrences'].sum()

summary_df['% of Tasks'] = (summary_df['Unique Tasks'] / total_tasks * 100).round(1)
summary_df['% of Occurrences'] = (summary_df['Total Occurrences'] / total_occurrences * 100).round(1)

# Format numbers with thousands separator
summary_df['Unique Tasks'] = summary_df['Unique Tasks'].apply(lambda x: f"{x:,}")
summary_df['Total Occurrences'] = summary_df['Total Occurrences'].apply(lambda x: f"{x:,}")


# Sort categories in logical order. Only categories that are actually present
# are kept; reindexing with an absent label would add an all-NaN row.
category_order = ['Maintenance', 'Inspection', 'Cleaning', 'Other']
summary_df = summary_df.reindex([name for name in category_order if name in summary_df.index])

# Set display options for better formatting
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: f'{x:,.0f}' if x.is_integer() else f'{x:.1f}%')

# Print formatted DataFrame. The percentage columns get explicit formatters:
# the global float_format above prints whole-number percentages (e.g. 50.0)
# as "50", without the decimal or the % sign.
print(summary_df.to_string(formatters={
    '% of Tasks': '{:.1f}%'.format,
    '% of Occurrences': '{:.1f}%'.format,
}))
print("=" * 120)
print(f"Total Unique Tasks: {total_tasks:,.0f}")
print(f"Total Task Occurrences: {total_occurrences:,.0f}")
print("=" * 120)

In [None]:
# Preview the first rows of the combined per-task table
all_tasks_df.head()