In [82]:
import sys
import os

# Resolve the parent of the current working directory (treated as the project
# root) so that the project-local imports that follow can be found.
project_root = os.path.abspath('..')
# Guard the append: without it, every re-run of this cell adds the same entry
# to sys.path again.
if project_root not in sys.path:
    sys.path.append(project_root)

from dataEngineer.modeling.MLmodel2 import *
from dataEngineer.pipeLine import *

In [83]:
import pandas as pd
import numpy as np
import shutil

In [84]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists.
data = pd.read_csv('/home/jax/NHA-112/data/interim/reddit_complaints_dataset.csv')
# read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame(...) is
# redundant and may share the underlying data with `data`. An explicit copy keeps
# the raw frame intact while `df` is filtered and extended in later cells.
df = data.copy()

In [85]:
# Cast the text column to pandas' string dtype explicitly.
# The previous call, convert_dtypes('object'), did not request an object dtype:
# the first positional parameter of convert_dtypes is the boolean `infer_objects`,
# so 'object' was only used as a truthy flag.
df['text'] = df['text'].astype('string')

In [86]:
# Display the dtype of the text column (the recorded run shows string[python]).
df['text'].dtype

string[python]

In [87]:
# Distinct category values before filtering (kept as a notebook-level name).
categories = df['category'].unique()

# Drop every (category, problem_type) combination that occurs at most once.
# A single groupby replaces the nested Python loops, which rebuilt a full-frame
# boolean mask for each rare pair.
label_cols = ['category', 'problem_type']
pair_sizes = df.groupby(label_cols).size()
rare_pairs = pair_sizes[pair_sizes <= 1].index

# groupby skips rows with a missing label, so such rows never appear in
# rare_pairs and are kept, exactly as the loop-based version kept them.
is_rare = pd.MultiIndex.from_frame(df[label_cols]).isin(rare_pairs)
df = df[~is_rare]
        

In [88]:
# Keep only rows whose text is non-empty after trimming whitespace.
has_text = df['text'].str.strip() != ""
# Chain reset_index instead of calling it with inplace=True on the filtered
# slice: the chained call returns a fresh frame, so the column assignment at the
# end of this cell cannot trigger SettingWithCopyWarning.
df = df[has_text].reset_index(drop=True)

# Single-step pipeline wrapping the project's NLTK-based text preprocessor.
cleaning_pipeline = Pipeline([("text_preprocessor", NltkTextPreprocessor())])

print("Applying sklearn pipeline for text cleaning and lemmatization...")
processed_text_series = pd.Series(
    cleaning_pipeline.fit_transform(df['text']), name="processed_text"
)
# The index was reset above, so the new Series lines up with df row by row.
df["processed_text"] = processed_text_series

Applying sklearn pipeline for text cleaning and lemmatization...


In [89]:
# Directory holding the logistic-regression models and vectorizer.
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models_afterCleaning_logostic'

# One label column per prediction task.
tasks = ['problem_type', 'category']

# Multi-task classifier backed by logistic regression, with tuning enabled.
logostice = MultiTaskTextClassifier(
    model_type='logreg',
    model_dir=MODEL_DIR,
    label_columns=tasks,
    use_hyperparameter_tuning=True,
)

In [90]:
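# Training is disabled in this run: the predict() call in the next cell reports
# loading the saved models and vectorizer from MODEL_DIR. Uncomment to retrain:
# logostice.train(df, text_column='processed_text')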

In [91]:
print("\n========== MAKING PREDICTIONS ==========")
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill"
]

# predict() returns a mapping keyed by task name; each value is indexable by the
# position of the text in the input list.
predictions = logostice.predict(new_texts)

# Report every input rather than hard-coding positions 0 and 1, so adding a
# sentence to new_texts needs no extra print statements.
for position, complaint in enumerate(new_texts):
    # A blank line separates consecutive reports (none before the first one).
    separator = "" if position == 0 else "\n"
    print(f"{separator}Predictions for: {complaint}")
    print(f"  Problem Type: {predictions['problem_type'][position]}")
    print(f"  Category:     {predictions['category'][position]}")




All models and vectorizer loaded successfully from /home/jax/NHA-112/models/my_multi_task_models_afterCleaning_logostic/
Predictions for: the light switch is broken
  Problem Type: other
  Category:     housing

Predictions for: my sink is clogged and i have a high bill
  Problem Type: other
  Category:     education


In [92]:
# Two probe sentences: one about connectivity, one about billing.
final_texts = [
    "the internet is slow", # Recorded run: predicted 'network problem' / 'technology'
    "my bill is too high"   # Recorded run: predicted 'other' / 'education'
]
final_predictions = logostice.predict(final_texts)

print(f"\nPredictions for: {final_texts[0]} ('internet')")
print(f"  Problem Type: {final_predictions['problem_type'][0]}")
print(f"  Category:     {final_predictions['category'][0]}") 

print(f"\nPredictions for: {final_texts[1]} ('bill')")
# NOTE(review): in the recorded run this sentence did NOT get a billing-related
# label ('other' / 'education'), so do not assume this prediction is correct.
print(f"  Problem Type: {final_predictions['problem_type'][1]}")
print(f"  Category:     {final_predictions['category'][1]}")


Predictions for: the internet is slow ('internet')
  Problem Type: network problem
  Category:     technology

Predictions for: my bill is too high ('bill')
  Problem Type: other
  Category:     education


In [93]:
# Rebinds MODEL_DIR to the SVM model directory. Any cell executed after this one
# that reads MODEL_DIR (including a re-run of the logistic-regression setup cell)
# will see this path instead of the logistic-regression one.
MODEL_DIR = '/home/jax/NHA-112/models/my_multi_task_models_afterCleaning_svm'

In [94]:
# One label column per prediction task.
tasks = ['problem_type', 'category']

# Multi-task classifier backed by an SVM, reading/writing under MODEL_DIR.
svm_ = MultiTaskTextClassifier(
    model_type='svm',
    model_dir=MODEL_DIR,
    label_columns=tasks,
    use_hyperparameter_tuning=True,
)

In [95]:
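# Training is disabled in this run: the predict() call in the next cell reports
# loading the saved models and vectorizer from MODEL_DIR. Uncomment to retrain:
# svm_.train(df, text_column='processed_text')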

In [96]:
print("\n========== MAKING PREDICTIONS ==========")
new_texts = [
    "the light switch is broken",
    "my sink is clogged and i have a high bill"
]

# predict() returns a mapping keyed by task name; each value is indexable by the
# position of the text in the input list.
predictions = svm_.predict(new_texts)

# Report every input rather than hard-coding positions 0 and 1, so adding a
# sentence to new_texts needs no extra print statements.
for position, complaint in enumerate(new_texts):
    # A blank line separates consecutive reports (none before the first one).
    separator = "" if position == 0 else "\n"
    print(f"{separator}Predictions for: {complaint}")
    print(f"  Problem Type: {predictions['problem_type'][position]}")
    print(f"  Category:     {predictions['category'][position]}")


All models and vectorizer loaded successfully from /home/jax/NHA-112/models/my_multi_task_models_afterCleaning_svm/
Predictions for: the light switch is broken
  Problem Type: other
  Category:     housing

Predictions for: my sink is clogged and i have a high bill
  Problem Type: other
  Category:     education


In [97]:
# Two probe sentences: one about connectivity, one about billing.
final_texts = [
    "the internet is slow", # Recorded run: predicted 'network problem' / 'technology'
    "my bill is too high"   # Recorded run: predicted 'other' / 'education'
]
final_predictions = svm_.predict(final_texts)

print(f"\nPredictions for: {final_texts[0]} ('internet')")
print(f"  Problem Type: {final_predictions['problem_type'][0]}")
print(f"  Category:     {final_predictions['category'][0]}") 

print(f"\nPredictions for: {final_texts[1]} ('bill')")
# NOTE(review): in the recorded run this sentence did NOT get a billing-related
# label ('other' / 'education'), so do not assume this prediction is correct.
print(f"  Problem Type: {final_predictions['problem_type'][1]}")
print(f"  Category:     {final_predictions['category'][1]}")


Predictions for: the internet is slow ('internet')
  Problem Type: network problem
  Category:     technology

Predictions for: my bill is too high ('bill')
  Problem Type: other
  Category:     education


In [106]:
# Arabic sample complaints (recorded translations: "My son came from school
# crying" and "The internet is slow").
text1 = 'ابني جاء باكيا من المدرسه'
text2 = 'النت بطئ'

# Each translation is wrapped in a one-element list, the same shape the
# predict() calls in this notebook receive.
trasform1, trasform2 = [
    [translate_arabic_to_english(arabic_text)] for arabic_text in (text1, text2)
]

# The extra brackets are kept, so each batch prints as a nested list.
for translated_batch in (trasform1, trasform2):
    print([translated_batch])

[['My son came from school crying']]
[['The internet is slow']]


In [102]:
# Run the first translated sentence through both classifiers
# (logistic regression and SVM) for a side-by-side comparison.
predict1_log = logostice.predict(trasform1)
pridict1_svm = svm_.predict(trasform1)


In [103]:
# Logistic-regression result for the first translated sentence.
print(f"  Problem Type: {predict1_log['problem_type'][0]}")
print(f"  Category:     {predict1_log['category'][0]}") 

  Problem Type: document processing
  Category:     government


In [104]:
# SVM result for the first translated sentence.
print(f"  Problem Type: {pridict1_svm['problem_type'][0]}")
print(f"  Category:     {pridict1_svm['category'][0]}") 

  Problem Type: other
  Category:     education


In [107]:
# Display the second translated one-element batch before it is classified.
trasform2

['The internet is slow']

In [109]:
# Run the second translated sentence through both classifiers
# (logistic regression and SVM) for a side-by-side comparison.
predict2_log = logostice.predict(trasform2)
pridict2_svm = svm_.predict(trasform2)


In [110]:
# Logistic-regression result for the second translated sentence.
# Index with [0] like the sibling print cells, so the label itself is shown
# rather than the repr of a one-element list (e.g. ['network problem']).
print(f"  Problem Type: {predict2_log['problem_type'][0]}")
print(f"  Category:     {predict2_log['category'][0]}")

  Problem Type: ['network problem']
  Category:     ['technology']


In [111]:
# SVM result for the second translated sentence.
print(f"  Problem Type: {pridict2_svm['problem_type'][0]}")
print(f"  Category:     {pridict2_svm['category'][0]}") 

  Problem Type: network problem
  Category:     technology
