In [22]:
import sys
import os

# Get the parent directory (project root)
project_root = os.path.abspath('..')
sys.path.append(project_root)

from dataEngineer.modeling.MLmodel2 import *

In [23]:
import pandas as pd
import numpy as np
import shutil

In [24]:
data = {
    'text': [
        "the kitchen sink is leaking water everywhere",
        "my breaker keeps tripping and the lights are out",
        "i was overcharged on my last bill",
        "the toilet won't stop running",
        "the power outlet is sparking and smells like smoke",
        "my bill is wrong, you charged me twice",
        "the faucet is dripping",
        "the main fuse box is hot"
    ],
    'problem_type': [
        'Leaking', 'Outage', 'Billing Error', 'Leaking', 
        'Hazard', 'Billing Error', 'Leaking', 'Hazard'
    ],
    'category': [
        'Plumbing', 'Electric', 'Billing', 'Plumbing',
        'Electric', 'Billing', 'Plumbing', 'Electric'
    ]
}
df = pd.DataFrame(data)

In [25]:
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models'
if os.path.exists(MODEL_DIR):
    shutil.rmtree(MODEL_DIR) # Clean up previous runs

# --- 3. Initialize and Train ---
# We pass *both* label columns to the constructor
tasks = ['problem_type', 'category']
classifier = MultiTaskTextClassifier(
    label_columns=tasks,
    model_dir=MODEL_DIR,
    model_type='logreg',
    use_hyperparameter_tuning=False # Set to False for a quick demo
)

In [26]:
print("========== INITIAL TRAINING ==========")
classifier.train(df, text_column='text')


--- No models found. Training new LOGREG models from scratch. ---

--- Training Initial Model for Task: 'problem_type' ---
Registering classes for 'problem_type': ['Billing Error' 'Hazard' 'Leaking' 'Outage']

Initial model training complete for 'problem_type'.

--- Initial 'problem_type' Model Performance on Test Set ---
Model Accuracy: 0.5000
Classification Report:
               precision    recall  f1-score   support

Billing Error       1.00      1.00      1.00         1
       Hazard       0.00      0.00      0.00         0
      Leaking       0.00      0.00      0.00         0
       Outage       0.00      0.00      0.00         1

     accuracy                           0.50         2
    macro avg       0.25      0.25      0.25         2
 weighted avg       0.50      0.50      0.50         2


--- Training Initial Model for Task: 'category' ---
Registering classes for 'category': ['Billing' 'Electric' 'Plumbing']

Initial model training complete for 'category'.

--- Initial '

In [27]:
print("\n========== MAKING PREDICTIONS ==========")
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill"
]

predictions = classifier.predict(new_texts)
print(f"Predictions for: {new_texts[0]}")
print(f"  Problem Type: {predictions['problem_type'][0]}")
print(f"  Category:     {predictions['category'][0]}")

print(f"\nPredictions for: {new_texts[1]}")
print(f"  Problem Type: {predictions['problem_type'][1]}")
print(f"  Category:     {predictions['category'][1]}")




Predictions for: the light switch is broken
  Problem Type: Hazard
  Category:     Electric

Predictions for: my sink is clogged and i have a high bill
  Problem Type: Billing Error
  Category:     Billing


In [28]:
new_data = {
    'text': [
        "my internet is down",
        "i paid but you say i didn't",
        "the whole building has no power"
    ],
    'problem_type': [
        'Outage', 'Billing Error', 'Outage'
    ],
    'category': [
        'Network', 'Billing', 'Electric'
    ]
}
new_df = pd.DataFrame(new_data)

In [29]:
print("\n========== INCREMENTAL TRAINING ==========")
classifier.train(new_df, text_column='text', split_new_data_for_eval=False)



--- Found existing vectorizer. Performing incremental update for all tasks. ---
All models and vectorizer loaded successfully from /home/jax/NHA-112/models/my_multi_task_models/

--- Checking for new classes in update batch ---
  New class examples: ['Network']
  These rows will be SKIPPED for this task's update.
  Skipping 1 rows containing new classes.
  Proceeding with the remaining 2 valid rows for training.

Training on full filtered batch without evaluation...
Updating model for task: 'problem_type'
Updating model for task: 'category'

All model updates complete.
Shared vectorizer saved to /home/jax/NHA-112/models/my_multi_task_models/multi_task_sgd_vectorizer.pkl
Model for task 'problem_type' saved to /home/jax/NHA-112/models/my_multi_task_models/problem_type_logreg_sgd_model.pkl
Model for task 'category' saved to /home/jax/NHA-112/models/my_multi_task_models/category_logreg_sgd_model.pkl


In [30]:
print("\n========== MAKING PREDICTIONS (AFTER UPDATE) ==========")
final_texts = [
    "the internet is slow", # The model has *not* learned "Network"
    "my bill is too high"   # The model *has* learned "Billing"
]
final_predictions = classifier.predict(final_texts)

print(f"\nPredictions for: {final_texts[0]} ('internet')")
# The model will predict the *closest* class it knows.
# Since it never learned 'Network', it might guess 'Electric' or 'Billing'.
print(f"  Problem Type: {final_predictions['problem_type'][0]}")
print(f"  Category:     {final_predictions['category'][0]}") 

print(f"\nPredictions for: {final_texts[1]} ('bill')")
# This prediction will be correct.
print(f"  Problem Type: {final_predictions['problem_type'][0]}")
print(f"  Category:     {final_predictions['category'][1]}")



Predictions for: the internet is slow ('internet')
  Problem Type: Leaking
  Category:     Electric

Predictions for: my bill is too high ('bill')
  Problem Type: Leaking
  Category:     Billing


In [31]:
# shutil.rmtree(MODEL_DIR)