In [40]:
import sys
import os

# Resolve the notebook's parent directory (treated as the project root) so the
# local `dataEngineer` package imported below can be found.
project_root = os.path.abspath('..')
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from dataEngineer.modeling.MLmodel2 import *
from dataEngineer.pipeLine import *

In [41]:
import pandas as pd
import numpy as np
import shutil

In [42]:
# Load the interim Reddit complaints dataset. `data` keeps the frame exactly as
# read from disk; `df` is the working frame that later cells filter and extend.
data = pd.read_csv('/home/jax/NHA-112/data/interim/reddit_complaints_dataset.csv')
# Wrapping an existing DataFrame in pd.DataFrame(...) does not copy its data by
# default, so take an explicit copy to keep `data` independent of `df`.
df = data.copy()

In [43]:
# Cast the text column to pandas' nullable string dtype. astype is used rather
# than convert_dtypes because the latter's first positional parameter is the
# boolean `infer_objects`, not a target dtype.
df['text'] = df['text'].astype('string')

In [44]:
# Display the dtype of the text column to verify the conversion above.
df['text'].dtype

string[python]

In [45]:
# Category values present before filtering (retained in case a later cell
# refers to this name).
categories = df['category'].unique()

# Drop every (category, problem_type) pair that occurs exactly once, using a
# single vectorised mask instead of re-filtering the whole frame for each
# rare pair.
label_cols = ['category', 'problem_type']
# keep=False marks all rows whose label pair appears more than once.
pair_repeats = df.duplicated(subset=label_cols, keep=False)
# Rows with a missing label are left in place: the per-category value_counts()
# approach ignored NaN by default and therefore never removed them either.
labels_present = df[label_cols].notna().all(axis=1)
df = df[pair_repeats | ~labels_present]

In [46]:
# Keep only rows with real text. Missing values are mapped to "" first so they
# are removed by the same comparison, rather than depending on how a boolean
# mask containing NA is interpreted during indexing.
has_text = df['text'].str.strip().fillna("") != ""
# Re-number rows 0..n-1 (returning a new frame instead of mutating in place) so
# the positionally indexed pipeline output lines up when assigned as a column.
df = df[has_text].reset_index(drop=True)

# Single-step cleaning pipeline. Pipeline and NltkTextPreprocessor are
# presumably provided by the star imports at the top of the notebook.
cleaning_pipeline = Pipeline([("text_preprocessor", NltkTextPreprocessor())])

print("Applying sklearn pipeline for text cleaning and lemmatization...")
processed_text_series = pd.Series(
    cleaning_pipeline.fit_transform(df['text']), name="processed_text"
)
df["processed_text"] = processed_text_series

Applying sklearn pipeline for text cleaning and lemmatization...


In [None]:
# Directory handed to the logistic-regression classifier as `model_dir`
# (absolute, machine-specific path).
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models_afterCleaning_logostic'

# Label columns the multi-task classifier is configured with.
tasks = ['category', 'problem_type']

# Constructor options gathered in one mapping, then unpacked into the call.
logreg_options = dict(
    model_type='logreg',
    use_hyperparameter_tuning=True,
    label_columns=tasks,
    model_dir=MODEL_DIR,
)
logostice = MultiTaskTextClassifier(**logreg_options)

In [48]:
# logostice.load()
# X_test_tfidf = logostice.vectorizer.transform(df['processed_text'])
# for col in logostice.label_columns:
#     print(f"\nEvaluating task: {col}")
#     logostice.evaluate(
#         logostice.models[col],
#         X_test_tfidf,
#         df[col],      
#         title=f"--- Evaluation for '{col}' ---"
#     )


In [49]:
# Train the logistic-regression classifier on the cleaned text column; the
# per-task reports shown in this cell's output are printed by this call.
logostice.train(df, text_column='processed_text')


--- No models found. Training new LOGREG models from scratch. ---

--- Training Initial Model for Task: 'category' ---
Registering classes for 'category': ['banking' 'education' 'government' 'health' 'housing' 'insurance'
 'shopping' 'technology' 'transport']

--- Performing Hyperparameter Tuning with RandomizedSearchCV ---
Fitting 3 folds for each of 15 candidates, totalling 45 fits

Best parameters for 'category': {'alpha': np.float64(7.068974950624602e-05), 'penalty': 'l2'}

Initial model training complete for 'category'.

--- Initial 'category' Model Performance on Test Set ---
Model Accuracy: 0.8305
Classification Report:
              precision    recall  f1-score   support

     banking       0.78      0.85      0.81       144
   education       0.85      0.86      0.85       106
  government       0.77      0.78      0.78       127
      health       0.87      0.92      0.89        78
     housing       0.95      0.88      0.91       108
   insurance       0.83      0.78      




Best parameters for 'problem_type': {'alpha': np.float64(4.207053950287933e-05), 'penalty': 'elasticnet'}

Initial model training complete for 'problem_type'.

--- Initial 'problem_type' Model Performance on Test Set ---
Model Accuracy: 0.5834
Classification Report:
                        precision    recall  f1-score   support

        account locked       0.00      0.00      0.00         9
             atm error       0.00      0.00      0.00        18
  bad customer service       0.67      0.33      0.44        12
          claim denied       0.00      0.00      0.00         4
            corruption       0.00      0.00      0.00         1
   course registration       0.00      0.00      0.00         1
    credit card issues       0.53      0.66      0.59        35
             data loss       0.00      0.00      0.00         3
           delayed bus       0.00      0.00      0.00         3
   document processing       0.47      0.18      0.26        38
    emergency response     

In [50]:
print("\n========== MAKING PREDICTIONS ==========")
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill",
]

predictions = logostice.predict(new_texts)

# Report both tasks for each probe sentence; every report after the first is
# preceded by a blank line.
for position, sentence in enumerate(new_texts):
    leading = "\n" if position else ""
    print(f"{leading}Predictions for: {sentence}")
    print(f"  Problem Type: {predictions['problem_type'][position]}")
    print(f"  Category:     {predictions['category'][position]}")




Predictions for: the light switch is broken
  Problem Type: other
  Category:     technology

Predictions for: my sink is clogged and i have a high bill
  Problem Type: other
  Category:     health


In [51]:
# Two short probes: a slow-internet complaint and a high-bill complaint.
final_texts = [
    "the internet is slow",
    "my bill is too high",
]
final_predictions = logostice.predict(final_texts)

# The quoted tag printed after each sentence names the keyword being probed.
for slot, (sentence, tag) in enumerate(zip(final_texts, ("internet", "bill"))):
    print(f"\nPredictions for: {sentence} ('{tag}')")
    print(f"  Problem Type: {final_predictions['problem_type'][slot]}")
    print(f"  Category:     {final_predictions['category'][slot]}")


Predictions for: the internet is slow ('internet')
  Problem Type: network problem
  Category:     technology

Predictions for: my bill is too high ('bill')
  Problem Type: other
  Category:     health


In [None]:
# Point MODEL_DIR at the SVM run's model directory. This rebinds the same name
# used for the logistic-regression run, so this cell has to be executed before
# the SVM classifier is constructed.
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models_afterCleaning_svm'

In [53]:
# Label columns the multi-task classifier is configured with.
tasks = ['category', 'problem_type']

# Constructor options gathered in one mapping, then unpacked into the call.
svm_options = dict(
    model_type='svm',
    use_hyperparameter_tuning=True,
    label_columns=tasks,
    model_dir=MODEL_DIR,
)
svm_ = MultiTaskTextClassifier(**svm_options)

In [54]:
# Train the SVM classifier on the cleaned text column; the per-task reports
# shown in this cell's output are printed by this call.
svm_.train(df, text_column='processed_text')


--- No models found. Training new SVM models from scratch. ---

--- Training Initial Model for Task: 'category' ---
Registering classes for 'category': ['banking' 'education' 'government' 'health' 'housing' 'insurance'
 'shopping' 'technology' 'transport']

--- Performing Hyperparameter Tuning with RandomizedSearchCV ---
Fitting 3 folds for each of 15 candidates, totalling 45 fits

Best parameters for 'category': {'alpha': np.float64(0.00031489116479568613), 'loss': 'hinge', 'penalty': 'elasticnet'}

Initial model training complete for 'category'.

--- Initial 'category' Model Performance on Test Set ---
Model Accuracy: 0.8279
Classification Report:
              precision    recall  f1-score   support

     banking       0.85      0.82      0.83       144
   education       0.80      0.87      0.83       106
  government       0.86      0.74      0.80       127
      health       0.82      0.91      0.86        78
     housing       0.88      0.91      0.89       108
   insurance    




Best parameters for 'problem_type': {'alpha': np.float64(0.00031489116479568613), 'loss': 'hinge', 'penalty': 'elasticnet'}

Initial model training complete for 'problem_type'.

--- Initial 'problem_type' Model Performance on Test Set ---
Model Accuracy: 0.5992
Classification Report:
                        precision    recall  f1-score   support

        account locked       0.00      0.00      0.00         9
             atm error       0.50      0.11      0.18        18
  bad customer service       0.33      0.08      0.13        12
          claim denied       0.00      0.00      0.00         4
            corruption       0.00      0.00      0.00         1
   course registration       0.00      0.00      0.00         1
    credit card issues       0.60      0.60      0.60        35
             data loss       0.00      0.00      0.00         3
           delayed bus       0.00      0.00      0.00         3
   document processing       0.47      0.21      0.29        38
    emerg

In [55]:
print("\n========== MAKING PREDICTIONS ==========")
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill",
]

predictions = svm_.predict(new_texts)

# Report both tasks for each probe sentence; every report after the first is
# preceded by a blank line.
for position, sentence in enumerate(new_texts):
    leading = "\n" if position else ""
    print(f"{leading}Predictions for: {sentence}")
    print(f"  Problem Type: {predictions['problem_type'][position]}")
    print(f"  Category:     {predictions['category'][position]}")


Predictions for: the light switch is broken
  Problem Type: other
  Category:     technology

Predictions for: my sink is clogged and i have a high bill
  Problem Type: other
  Category:     health


In [56]:
# Two short probes: a slow-internet complaint and a high-bill complaint.
final_texts = [
    "the internet is slow",
    "my bill is too high",
]
final_predictions = svm_.predict(final_texts)

# The quoted tag printed after each sentence names the keyword being probed.
for slot, (sentence, tag) in enumerate(zip(final_texts, ("internet", "bill"))):
    print(f"\nPredictions for: {sentence} ('{tag}')")
    print(f"  Problem Type: {final_predictions['problem_type'][slot]}")
    print(f"  Category:     {final_predictions['category'][slot]}")


Predictions for: the internet is slow ('internet')
  Problem Type: network problem
  Category:     technology

Predictions for: my bill is too high ('bill')
  Problem Type: other
  Category:     health


In [57]:
# Arabic sample complaints. Each one is translated to English and kept as a
# one-element list, the shape later handed to predict().
text1 = 'ابني جاء باكيا من المدرسه'
text2 = 'النت بطئ'
trasform1, trasform2 = (
    [translate_arabic_to_english(arabic_text)] for arabic_text in (text1, text2)
)

# Each list is wrapped once more for display, so the printed value is doubly
# nested.
for translated in (trasform1, trasform2):
    print([translated])

[['My son came from school crying']]
[['The internet is slow']]


In [58]:
# Classify the first translated sentence with both trained models
# (logistic regression first, then SVM).
predict1_log, pridict1_svm = (
    classifier.predict(trasform1) for classifier in (logostice, svm_)
)


In [59]:
# Logistic-regression result for the first translated sentence; headings are
# left-padded to a common width so the values line up.
for heading, task in (("Problem Type:", "problem_type"), ("Category:", "category")):
    print(f"  {heading:<13} {predict1_log[task][0]}")

  Problem Type: other
  Category:     government


In [60]:
# SVM result for the first translated sentence; headings are left-padded to a
# common width so the values line up.
for heading, task in (("Problem Type:", "problem_type"), ("Category:", "category")):
    print(f"  {heading:<13} {pridict1_svm[task][0]}")

  Problem Type: other
  Category:     government


In [61]:
# Display the second translated sentence (a one-element list) before it is
# classified.
trasform2

['The internet is slow']

In [62]:
# Classify the second translated sentence with both trained models
# (logistic regression first, then SVM).
predict2_log, pridict2_svm = (
    classifier.predict(trasform2) for classifier in (logostice, svm_)
)


In [63]:
# Logistic-regression result for the second translated sentence. Index [0] so
# the label itself is printed rather than its one-element container, matching
# how the other result cells report their predictions.
print(f"  Problem Type: {predict2_log['problem_type'][0]}")
print(f"  Category:     {predict2_log['category'][0]}")

  Problem Type: ['network problem']
  Category:     ['technology']


In [64]:
# SVM result for the second translated sentence; headings are left-padded to a
# common width so the values line up.
for heading, task in (("Problem Type:", "problem_type"), ("Category:", "category")):
    print(f"  {heading:<13} {pridict2_svm[task][0]}")

  Problem Type: network problem
  Category:     technology
