In [1]:
import sys
import os

# Resolve the parent of the current working directory and expose it on the
# import path so the `dataEngineer` import that follows can resolve.
# NOTE(review): this assumes the notebook is launched from a direct
# subdirectory of the project root — confirm.
project_root = os.path.abspath('..')
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from dataEngineer.modeling.MLmodel2 import *


[32m2025-11-30 19:24:16.845[0m | [1mINFO    [0m | [36mdataEngineer.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/jax/NHA-112[0m


In [2]:
import pandas as pd
import numpy as np
import shutil

In [3]:
# Read the interim Reddit complaints CSV from disk.
complaints_csv_path = '/home/jax/NHA-112/data/interim/reddit_complaints_dataset.csv'
data = pd.read_csv(complaints_csv_path)

# `df` is the working frame that the following cells filter and train on;
# `data` stays bound to the object returned by read_csv.
df = pd.DataFrame(data=data)

In [4]:
# Let pandas pick the best nullable dtype for the text column.
# convert_dtypes() does not accept a target dtype: its first parameter is the
# boolean `infer_objects`, so it is called with its defaults here rather than
# with a dtype-looking string that would only be interpreted as a truthy flag.
df['text'] = df['text'].convert_dtypes()

In [5]:
# Display the dtype the text column ended up with after conversion.
df.dtypes['text']

string[python]

In [6]:
# Keep only rows whose text is non-empty once surrounding whitespace is
# stripped, then renumber the surviving rows from 0.
# NOTE(review): rows with missing text compare as <NA> under the nullable
# string dtype and are presumably excluded by the boolean indexing — confirm.
has_content = df['text'].str.strip() != ""
df = df[has_content].reset_index(drop=True)

In [7]:
# Within each category, drop every problem_type that occurs at most once.
categories = df['category'].unique()

for category in categories:
    in_category = df['category'] == category
    # Occurrences of each problem_type among this category's rows only.
    type_counts = df.loc[in_category, 'problem_type'].value_counts()
    singleton_types = type_counts[type_counts <= 1].index
    # Remove this category's rows whose problem_type is one of the singletons.
    df = df[~(in_category & df['problem_type'].isin(singleton_types))]
        

In [8]:
# Directory handed to the logistic-regression classifier as `model_dir`.
# NOTE: the name MODEL_DIR is reassigned further down for the SVM run, so
# re-running cells out of order changes which directory the classifier
# constructed from it will use.
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models_beforeCleaning_logostic'

In [9]:
# Label columns the classifier is asked to predict.
tasks = ['category', 'problem_type']

# Settings for the logistic-regression variant, with hyperparameter tuning on.
logreg_options = {
    'label_columns': tasks,
    'model_dir': MODEL_DIR,
    'model_type': 'logreg',
    'use_hyperparameter_tuning': True,
}
logostice = MultiTaskTextClassifier(**logreg_options)

In [10]:
# Train the logistic-regression classifier on the filtered frame, taking the
# input text from the 'text' column. Per the recorded log output, models are
# trained from scratch when none are found for this classifier.
print("========== INITIAL TRAINING ==========")
logostice.train(df, text_column='text')


--- No models found. Training new LOGREG models from scratch. ---

--- Training Initial Model for Task: 'category' ---
Registering classes for 'category': ['banking' 'education' 'government' 'health' 'housing' 'insurance'
 'shopping' 'technology' 'transport']

--- Performing Hyperparameter Tuning with RandomizedSearchCV ---
Fitting 3 folds for each of 15 candidates, totalling 45 fits

Best parameters for 'category': {'alpha': np.float64(4.207053950287933e-05), 'penalty': 'elasticnet'}

Initial model training complete for 'category'.

--- Initial 'category' Model Performance on Test Set ---
Model Accuracy: 0.8173
Classification Report:
              precision    recall  f1-score   support

     banking       0.80      0.81      0.80       144
   education       0.81      0.82      0.81       106
  government       0.75      0.80      0.77       127
      health       0.88      0.86      0.87        78
     housing       0.92      0.89      0.91       108
   insurance       0.69      0.




Best parameters for 'problem_type': {'alpha': np.float64(4.207053950287933e-05), 'penalty': 'elasticnet'}

Initial model training complete for 'problem_type'.

--- Initial 'problem_type' Model Performance on Test Set ---
Model Accuracy: 0.5821
Classification Report:
                        precision    recall  f1-score   support

        account locked       0.00      0.00      0.00         9
             atm error       1.00      0.06      0.11        18
  bad customer service       0.50      0.17      0.25        12
          claim denied       0.00      0.00      0.00         4
            corruption       0.00      0.00      0.00         1
   course registration       0.00      0.00      0.00         1
    credit card issues       0.48      0.71      0.57        35
             data loss       0.00      0.00      0.00         3
           delayed bus       0.00      0.00      0.00         3
   document processing       0.43      0.16      0.23        38
    emergency response     

In [11]:
print("\n========== MAKING PREDICTIONS ==========")
# Ad-hoc sample complaints to run through the logistic-regression classifier.
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill"
]

predictions = logostice.predict(new_texts)

# Print both predicted labels for every sample, with a blank line between
# consecutive samples.
for position, text in enumerate(new_texts):
    if position > 0:
        print()
    print(f"Predictions for: {text}")
    print(f"  Problem Type: {predictions['problem_type'][position]}")
    print(f"  Category:     {predictions['category'][position]}")




Predictions for: the light switch is broken
  Problem Type: other
  Category:     housing

Predictions for: my sink is clogged and i have a high bill
  Problem Type: other
  Category:     health


In [12]:
# Probe phrases, each paired with the keyword echoed beside it in the printout.
# No expected label is asserted for either phrase; read the printed
# predictions to judge them.
final_cases = [
    ("the internet is slow", "internet"),
    ("my bill is too high", "bill"),
]
final_texts = [case[0] for case in final_cases]
final_predictions = logostice.predict(final_texts)

# One blank-line-separated report per probe phrase.
for position, (phrase, keyword) in enumerate(final_cases):
    print(f"\nPredictions for: {phrase} ('{keyword}')")
    print(f"  Problem Type: {final_predictions['problem_type'][position]}")
    print(f"  Category:     {final_predictions['category'][position]}")


Predictions for: the internet is slow ('internet')
  Problem Type: network problem
  Category:     technology

Predictions for: my bill is too high ('bill')
  Problem Type: other
  Category:     health


In [14]:
# Directory handed to the SVM classifier as `model_dir`.
# NOTE: this rebinds the MODEL_DIR name that was used earlier for the
# logistic-regression run, so the earlier value is no longer available after
# this cell executes.
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models_beforeCleaning_svm'

In [15]:
# Label columns the classifier is asked to predict.
tasks = ['category', 'problem_type']

# Settings for the SVM variant, with hyperparameter tuning on.
svm_options = {
    'label_columns': tasks,
    'model_dir': MODEL_DIR,
    'model_type': 'svm',
    'use_hyperparameter_tuning': True,
}
svm_ = MultiTaskTextClassifier(**svm_options)

In [16]:
# Train the SVM classifier on the same filtered frame, taking the input text
# from the 'text' column.
svm_.train(df, text_column='text')


--- No models found. Training new SVM models from scratch. ---

--- Training Initial Model for Task: 'category' ---
Registering classes for 'category': ['banking' 'education' 'government' 'health' 'housing' 'insurance'
 'shopping' 'technology' 'transport']

--- Performing Hyperparameter Tuning with RandomizedSearchCV ---
Fitting 3 folds for each of 15 candidates, totalling 45 fits

Best parameters for 'category': {'alpha': np.float64(0.00031489116479568613), 'loss': 'hinge', 'penalty': 'elasticnet'}

Initial model training complete for 'category'.

--- Initial 'category' Model Performance on Test Set ---
Model Accuracy: 0.8252
Classification Report:
              precision    recall  f1-score   support

     banking       0.86      0.79      0.83       144
   education       0.78      0.92      0.84       106
  government       0.80      0.78      0.79       127
      health       0.84      0.90      0.87        78
     housing       0.87      0.92      0.89       108
   insurance    




Best parameters for 'problem_type': {'alpha': np.float64(0.00031489116479568613), 'loss': 'hinge', 'penalty': 'elasticnet'}

Initial model training complete for 'problem_type'.

--- Initial 'problem_type' Model Performance on Test Set ---
Model Accuracy: 0.5769
Classification Report:
                        precision    recall  f1-score   support

        account locked       0.00      0.00      0.00         9
             atm error       0.20      0.06      0.09        18
  bad customer service       0.29      0.17      0.21        12
          claim denied       0.00      0.00      0.00         4
            corruption       0.00      0.00      0.00         1
   course registration       0.00      0.00      0.00         1
    credit card issues       0.56      0.71      0.62        35
             data loss       0.00      0.00      0.00         3
           delayed bus       0.12      0.33      0.18         3
   document processing       0.50      0.11      0.17        38
    emerg

In [17]:
print("\n========== MAKING PREDICTIONS ==========")
# Ad-hoc sample complaints to run through the SVM classifier.
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill"
]

predictions = svm_.predict(new_texts)

# Print both predicted labels for every sample, with a blank line between
# consecutive samples.
for position, text in enumerate(new_texts):
    if position > 0:
        print()
    print(f"Predictions for: {text}")
    print(f"  Problem Type: {predictions['problem_type'][position]}")
    print(f"  Category:     {predictions['category'][position]}")


Predictions for: the light switch is broken
  Problem Type: other
  Category:     technology

Predictions for: my sink is clogged and i have a high bill
  Problem Type: other
  Category:     education


In [18]:
# Probe phrases, each paired with the keyword echoed beside it in the printout.
# No expected label is asserted for either phrase; read the printed
# predictions to judge them.
final_cases = [
    ("the internet is slow", "internet"),
    ("my bill is too high", "bill"),
]
final_texts = [case[0] for case in final_cases]
final_predictions = svm_.predict(final_texts)

# One blank-line-separated report per probe phrase.
for position, (phrase, keyword) in enumerate(final_cases):
    print(f"\nPredictions for: {phrase} ('{keyword}')")
    print(f"  Problem Type: {final_predictions['problem_type'][position]}")
    print(f"  Category:     {final_predictions['category'][position]}")


Predictions for: the internet is slow ('internet')
  Problem Type: network problem
  Category:     technology

Predictions for: my bill is too high ('bill')
  Problem Type: other
  Category:     education
