In [3]:
# --- One-time setup 
import sys, subprocess

def pip_install(pkg):
    try:
        __import__(pkg.split("[")[0].replace("-", "_"))
    except ImportError:
        print(f"Installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

# (pip install)
pip_install("sentence-transformers")
pip_install("fuzzywuzzy[speedup]")
pip_install("groq")        
pip_install("openai")      


Installing sentence-transformers ...
Installing fuzzywuzzy[speedup] ...
Installing groq ...
Installing openai ...


In [41]:

# ===============================
# Core
# ===============================
!pip install umap-learn
import os, re, warnings
import numpy as np
!pip install pandas
import pandas as pd
import joblib
from tqdm import tqdm
warnings.filterwarnings("ignore")

# ===============================
# NLP / Vectorization
# ===============================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# embedding baseline (semantic retrieval)
from sentence_transformers import SentenceTransformer

# fuzzy matching utility 
from fuzzywuzzy import fuzz

# ===============================
# Modeling (classification scope)
# ===============================
from sklearn.model_selection import (
    train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# ===============================
# Metrics
# ===============================
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score
)

# ===============================
# Visualization
# ===============================
import matplotlib.pyplot as plt
import seaborn as sns

# Dimensionality-reduction for visuals 
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap  # This will now work after installing umap-learn

# ===============================
# LLM Clients (for narrative layer)
# ===============================
from groq import Groq        
from openai import OpenAI



In [7]:
!python -m spacy download en_core_web_sm
!pip install python-Levenshtein


C:\Users\John\anaconda3\python.exe: No module named spacy




In [43]:



import os
df = pd.read_csv(os.path.join(os.path.expanduser("~"), "Downloads/cleaned_work_orders.csv"))

df = df.dropna(how='all')

# Drop noise in Data
df = df[~df['Text'].str.contains("completed", case=False, na=False)]
df = df[~df['Text'].str.contains("complete", case=False, na=False)]
df = df[~df['Text'].str.contains("mike", case=False, na=False)]
df = df[~df['Text'].str.contains("mike's", case=False, na=False)]
df = df[~df['Text'].str.contains("odd", case=False, na=False)]

df = df.dropna(subset=['Description', 'Text']).drop_duplicates()

# Keep only rows where WO No. is numeric
df = df[df['WO No.'].astype(str).str.match(r'^\d+$')]

# Strip spaces and standardize text fields
df['WO No.'] = df['WO No.'].astype(str).str.strip()
df['Description'] = df['Description'].astype(str).str.strip()
df['Text'] = df['Text'].astype(str).str.strip()

# Clean Description
df['Description_cleaned'] = (
    df['Description']
    .fillna("")
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
    .str.replace(r'\d+', '', regex=True)      # Remove digits
    .str.strip()
)

# Clean Technician Text Notes
df['Text_cleaned'] = (
    df['Text']
    .fillna("")
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)

# Keep rows where both sides exist and are not trivial
df = df[['Description_cleaned', 'Text_cleaned']].dropna()
df = df[df['Description_cleaned'].str.strip() != ""]
df = df[df['Text_cleaned'].str.strip() != ""]

# Filter out rows with completely blank descriptions and notes
df = df[~((df['Description_cleaned'].str.strip() == "") & (df['Text_cleaned'].str.strip() == ""))]

# Ensure valid technician notes
df = df[df['Text_cleaned'].notna() & (df['Text_cleaned'].str.strip() != "") & (df['Text_cleaned'].str.lower().str.strip() != "nan")]

# Show data sample for verification
display(df[['Description_cleaned', 'Text_cleaned']].head())

Unnamed: 0,Description_cleaned,Text_cleaned
9,cushion,checking on a new controller ordered controller
11,take out section of conveyor on vertical oven ...,took out about ft of chain
12,stop time,press was in a wierd position causing the disc...
13,remove gas meter from solder pour machine,removed gas meter and installed replacement pipe
23,check bearings making noise,problem stopped


In [45]:
# Map free-text notes to a small, useful label set.

# --- 1) Primary rule-based mapping ---
ACTION_PATTERNS = [
    ("Replace Part",   r"\b(replace|replaced|swap|swapped|install(ed)?)\b.*\b(bearing|motor|belt|gear|fuse|sensor|valve|hose|coupling|chain|switch|roller|pulley|seal)\b"),
    ("Tighten/Adjust", r"\b(tighten|tightened|adjust|adjusted|align|aligned|re-seat|reseat|calibrate|calibrated|reposition|realign(ed)?)\b"),
    ("Clean/Clear",    r"\b(clean|cleaned|clear|cleared|remove|removed)\b.*\b(debris|dust|jam|blockage|clog)\b|\b(cleaned|cleared)\b"),
    ("Refill/Top Off", r"\b(add|added|refill|refilled|top\s?off)\b.*\b(oil|fluid|grease|lub(e|ricant)|coolant)\b"),
    ("Electrical Fix", r"\b(replace|replaced|reset|rewire|wire(d)?|reconnect|connector|contactor|breaker|fuse|vfd|plc|relay)\b"),
    ("Hydraulic/Pneumatic Fix", r"\b(hose|cylinder|solenoid|regulator|air line|hydraulic|pneumatic)\b.*\b(repair|replace|fixed|leak|leaking)\b"),
    ("Reset/Power Cycle", r"\b(reset|power.?cycle|cycled|restart|reboot|restarted)\b"),
    ("Inspection/Test Only", r"\b(inspect|inspected|tested|verify|verified|checked)\b(?!.*replace|.*repair|.*fix)"),
    ("Other", r".*")
]

import re

def to_response_label(text: str) -> str:
    t = " " + str(text).lower() + " "
    for label, pat in ACTION_PATTERNS:
        if re.search(pat, t):
            return label
    return "Other"

df['Response_Label'] = df['Text_cleaned'].apply(to_response_label)


# --- 2) Secondary re-mapping for rows still tagged "Other" ---
# Uses precise bigrams first, then high-signal unigrams, mapped into EXISTING labels only.

BIGRAM_MAP = [
    # Installation / replacement
    (["installed new", "fabricated new", "changed torch", "removed broken"], "Replace Part"),

    # Pneumatic / hydraulic
    (["air line", "air pressure", "air leak", "solenoid valve", "foot pedal"], "Hydraulic/Pneumatic Fix"),

    # Electrical
    (["limit switch", "power supply", "light curtain", "repaired wiring"], "Electrical Fix"),

    # Inspection / test outcomes
    (["started working", "went away", "working ok", "worked fine", "ran fine"], "Inspection/Test Only"),

    # Mechanical tighten/adjust
    (["came loose", "took apart"], "Tighten/Adjust"),
]

UNIGRAM_MAP = [
    # Installation / replacement
    (["installed", "install", "changed", "removed", "new"], "Replace Part"),

    # Pneumatic / hydraulic
    (["air", "line", "hose", "pump", "cylinder", "solenoid", "regulator"], "Hydraulic/Pneumatic Fix"),

    # Electrical
    (["switch", "wiring", "controller", "power", "fuse"], "Electrical Fix"),

    # Inspection / test / ambiguous OK
    (["found", "checked", "ok", "not working", "problem"], "Inspection/Test Only"),

    # Mechanical adjust
    (["loose", "aligned", "adjust", "tighten", "tightened"], "Tighten/Adjust"),
]

def remap_other_label(note: str) -> str:
    t = str(note).lower()

    # 1) Bigram priority (exact substring search for speed/clarity)
    for phrases, mapped in BIGRAM_MAP:
        if any(p in t for p in phrases):
            return mapped

    # 2) Unigram fallbacks
    for terms, mapped in UNIGRAM_MAP:
        if any(w in t for w in terms):
            return mapped

    return "Other"

mask_other = (df["Response_Label"] == "Other")
df.loc[mask_other, "Response_Label"] = df.loc[mask_other, "Text_cleaned"].apply(remap_other_label)


# --- 3) Collapse tiny labels again (keeps classes trainable) ---
min_count = 15
vc = df['Response_Label'].value_counts()
valid = vc[vc >= min_count].index
df.loc[~df['Response_Label'].isin(valid), 'Response_Label'] = "Other"

# new distribution to confirm "Other" 
print("Label distribution AFTER re-map:\n", df['Response_Label'].value_counts())



Label distribution AFTER re-map:
 Response_Label
Electrical Fix             14487
Other                       9822
Replace Part                8775
Tighten/Adjust              5862
Hydraulic/Pneumatic Fix     3718
Inspection/Test Only        3388
Clean/Clear                 3170
Reset/Power Cycle            226
Refill/Top Off               211
Name: count, dtype: int64


# **6. Narrative Conversion**

# Prediction - Logic Loop

In [25]:
def get_best_cosine_match(input_text, reference_df, tfidf_vectorizer):
    input_vec = tfidf_vectorizer.transform([input_text])
    ref_vecs = tfidf_vectorizer.transform(reference_df['Description_cleaned'])
    similarities = cosine_similarity(input_vec, ref_vecs).flatten()

    best_idx = similarities.argmax()
    best_text = reference_df.iloc[best_idx]['Text_cleaned']
    best_score = similarities[best_idx]

    return best_text, best_score

# Narrative Conversion

In [26]:
import os
client = OpenAI(api_key="",)
from groq import Groq

# Sample data to convert into narrative
data_dict = {
    'data': df[['Description_cleaned', 'Response_Label']].sample(5).to_string(index=False)
}


# OpenAI API Key
client  = Groq(
    api_key=("GROQ_API_KEY"),
)

# Generate narrative summary using a language model
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "user", "content": f"""You are a smart diagnostic assistant.
Analyze the following maintenance data and return a readable summary of what’s happening, what actions were taken, and what patterns you notice. Be brief, insightful, and professional.

Maintenance Data:
{data_dict['data']}
"""}
    ],
    model="llama3-8b-8192",
)

narrative = chat_completion.choices[0].message.content
print(narrative.replace('\n', ' '))

**Summary:**  Based on the provided maintenance data, it appears that the equipment is experiencing a series of electrical and mechanical issues. A total of six faults have been reported, with four of them involving electrical issues and two related to mechanical problems.  **Actions Taken:**  * Electrical fault lights were addressed through repairs * Press noise issue was resolved through a repair * Faults with air leaks, oiler low air cylinder, and tie rod sticking were also repaired  **Patterns Noted:**  * A significant number of electrical faults were observed, indicating potential issues with wiring, circuits, or electrical components. * The occurrence of multiple mechanical problems, such as air leaks and sticking tie rods, suggests possible wear and tear or maintenance neglect. * The fact that all reported issues were resolved through repairs implies that the maintenance team is proactive in addressing equipment malfunctions.  **Recommendations:**  * Conduct a thorough inspectio

In [67]:
import joblib
from pathlib import Path

# Define the missing functions and model pipeline
def _llm_narrative(prompt):
    # This is a simple placeholder function - replace with actual LLM implementation
    return "Based on the symptoms, this appears to be a mechanical issue with the conveyor belt system. \
The belt squeal and hot motor near the gearbox suggest belt slippage or misalignment. \
First, check if the belt is properly tensioned and aligned to prevent further damage. \l
Inspect the gearbox for proper lubrication and signs of wear. \
Plan to clean and lubricate the system, adjust belt tension, and if problems persist, consider replacing worn components in the drive system."

def retrieve_neighbors(description, top_label, k=5, restrict=True):
    # Placeholder function to simulate retrieving similar cases
    return [
        {"label": "Belt Adjustment", "desc": "Conveyor belt slipping and making noise", "notes": "Adjusted tension and realigned belt", "score": 0.92},
        {"label": "Gearbox Repair", "desc": "Motor running hot with grinding noise", "notes": "Replaced worn gears and added lubricant", "score": 0.85},
        {"label": "Motor Cooling", "desc": "Overheating motor causing trips", "notes": "Cleaned vents and improved airflow", "score": 0.78}
    ]

# Define a function to get top actions 
def get_top_actions(description, k=3):
    # Placeholder function to simulate model predictions
    return [
        ("Belt Adjustment", 0.75),
        ("Gearbox Inspection", 0.65),
        ("Motor Cooling", 0.45)
    ]

def generate_narrative(description, topk_actions=3, k_neighbors=5, restrict_neighbors=True):
    # Get top actions if not provided
    if isinstance(topk_actions, int):
        top_actions = get_top_actions(description, k=topk_actions)
    else:
        top_actions = topk_actions
    
    # Get top label for neighbor retrieval
    top_label = top_actions[0][0] if top_actions else ""

    # Nearest historical cases
    neighbors = retrieve_neighbors(description, top_label, k=k_neighbors, restrict=restrict_neighbors)

    # Build a compact, model-agnostic prompt
    examples = []
    for n in neighbors:
        ex = f"- Label: {n.get('label','')}\n  Desc: {n.get('desc','')[:200]}\n  Notes: {n.get('notes','')[:220]}"
        examples.append(ex)
    examples_text = "\n".join(examples) if examples else "No close historical cases were retrieved."

    preds_text = "\n".join([f"{i+1}. {lbl}  ({prob:.1%})" for i,(lbl,prob) in enumerate(top_actions)])

    prompt = f"""You are a maintenance triage helper. 
Given a short problem description, provide a brief, practical narrative that suggests next steps.
Be concise and professional—no fluff. Avoid stating you are an AI.

Problem Description:
{description}

Top candidate actions (model):
{preds_text}

Nearest historical cases:
{examples_text}

Write 3–6 sentences covering:
1) What's most likely going on.
2) What to check first (quick wins / safety).
3) A concrete action plan.
4) If uncertain, suggest 1–2 alternate paths.
"""

    narrative = _llm_narrative(prompt)
    return {
        "description": description,
        "top_actions": top_actions,
        "neighbors": neighbors,
        "narrative": narrative
    }

# ---------- 5) Example usage ----------
example_text = "Conveyor stopped, belt squeal, motor running hot near gearbox; intermittent trip on startup."
out = generate_narrative(example_text, topk_actions=3, k_neighbors=5, restrict_neighbors=True)

print("— Problem —")
print(out["description"])
print("\n— Top actions —")
for a,p in out["top_actions"]:
    print(f"  • {a}: {p:.1%}")
print("\n— Narrative —")
print(out["narrative"])
print("\n— Similar cases —")
for i, n in enumerate(out["neighbors"], 1):
    print(f"[{i}] ({n.get('label','')}, sim={n.get('score',0):.2f})")
    print("    Desc :", (n.get('desc','') or "")[:140])
    print("    Notes:", (n.get('notes','') or "")[:140])

— Problem —
Conveyor stopped, belt squeal, motor running hot near gearbox; intermittent trip on startup.

— Top actions —
  • Belt Adjustment: 75.0%
  • Gearbox Inspection: 65.0%
  • Motor Cooling: 45.0%

— Narrative —
Based on the symptoms, this appears to be a mechanical issue with the conveyor belt system. The belt squeal and hot motor near the gearbox suggest belt slippage or misalignment. First, check if the belt is properly tensioned and aligned to prevent further damage. Inspect the gearbox for proper lubrication and signs of wear. Plan to clean and lubricate the system, adjust belt tension, and if problems persist, consider replacing worn components in the drive system.

— Similar cases —
[1] (Belt Adjustment, sim=0.92)
    Desc : Conveyor belt slipping and making noise
    Notes: Adjusted tension and realigned belt
[2] (Gearbox Repair, sim=0.85)
    Desc : Motor running hot with grinding noise
    Notes: Replaced worn gears and added lubricant
[3] (Motor Cooling, sim=0.78)
   