In [1]:
!pip install loguru



In [2]:
# @title ## 1. Setup: Mount Google Drive & Install Libraries
# Purpose: This cell connects your Colab environment to your Google Drive
# and installs the required 'transformers' library for BERT.

from google.colab import drive
import os

# Mount your Google Drive. You'll be prompted to authorize this.
drive.mount('/content/drive')

# Install the Hugging Face transformers library quietly
!pip install -q transformers

print("✅ Google Drive mounted and libraries installed.")


# @title ## 2. Clone Your GitHub Repository
# Purpose: This cell downloads your code and data from GitHub into the Colab environment.

# Define the repository URL
repo_url = "https://github.com/nhahub/NHA-112.git"
repo_name = "NHA-112"

# Clone the repository
if not os.path.exists(repo_name):
    !git clone {repo_url}
else:
    print("Repository already cloned.")

# Change the working directory into your repository
os.chdir(repo_name)
print(f"✅ Repository cloned. Current working directory: {os.getcwd()}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted and libraries installed.
Repository already cloned.
✅ Repository cloned. Current working directory: /content/NHA-112


In [3]:
# @title ## 3. Import Your MultiOutputClassificationModel Class
# Purpose: This cell adds your repository's code to Python's path and imports
# the necessary libraries and your custom classifier.
import sys
import pandas as pd
from sklearn.model_selection import train_test_split

# Add the current directory to the Python path to find your module.
# Guarded so that re-running the cell does not pile up duplicate entries.
if '.' not in sys.path:
    sys.path.append('.')

# Now you can import your class
from dataEngineer.modeling.Deeplearning2 import MultiOutputClassificationModel

# Report the name of the class that was actually imported, so the message
# cannot drift out of sync with the import above.
print(f"✅ {MultiOutputClassificationModel.__name__} class imported successfully.")

2025-11-14 17:29:34.565 | INFO     | dataEngineer.config:<module>:11 - PROJ_ROOT path is: /content/NHA-112


✅ SentimentAnalysisModel class imported successfully.


In [4]:
# 1. Create Sample Data
# Each record is (complaint text, main category, sub category); the records are
# pivoted into the column-oriented dict that the DataFrame is built from.
sample_rows = [
    ("My power was out all day and the food in my fridge is spoiled", "electric", "outage"),
    ("I was overcharged on my recent electricity bill, this is unacceptable", "electric", "bill issue"),
    ("My internet is so slow I can't even watch a video", "internet", "speed issue"),
    ("The wifi connection keeps dropping every 5 minutes", "internet", "connection drop"),
    ("My bill is wrong, I already paid it last week!", "electric", "bill issue"),
    ("Why is my internet bill so high this month?", "internet", "bill issue"),
    ("A tree fell on the power line, we have no electricity", "electric", "outage"),
    ("Your technician never showed up for the internet install", "internet", "service issue"),
]
data = {
    column: [row[position] for row in sample_rows]
    for position, column in enumerate(('text', 'main_category', 'sub_category'))
}
df = pd.DataFrame(data)




In [5]:
# Absolute location of the interim Reddit complaints CSV inside the cloned repo.
data_path = (
    "/content/NHA-112/data/interim/reddit_complaints_dataset.csv"
)

In [6]:
# Load the complaints dataset from data_path. This rebinds `df`, replacing any
# frame assigned to that name earlier in the notebook.
df = pd.read_csv(data_path)

# Fail fast if the file lacks the columns the training cell passes to
# model_handler.train (text_column / category_column / subcategory_column).
required_columns = {'text', 'category', 'problem_type'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"{data_path} is missing required columns: {sorted(missing_columns)}")

In [7]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['category', 'subreddit', 'problem_type', 'title', 'text'], dtype='object')

In [11]:
# Make sure Google Drive is mounted before pointing the model file at it.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully!")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    # Do not continue: without the mount, the makedirs call below would create
    # the target directory on the local runtime disk instead of on Drive.
    raise


# File path handed to the model handler as its model_path.
drive_model_path = '/content/drive/MyDrive/bert_product_review_model200/sentiment_classifier.pth'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(drive_model_path), exist_ok=True)

print(f"\nModel will be saved to: {drive_model_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!

Model will be saved to: /content/drive/MyDrive/bert_product_review_model200/sentiment_classifier.pth


In [12]:
# 2. Initialize the Model Handler
# A smaller checkpoint (e.g. 'prajjwal1/bert-tiny' or 'distilbert-base-uncased')
# keeps testing fast; 'bert-base-uncased' is fine if you have the resources.
handler_options = {
    'model_name': 'distilbert-base-uncased',
    'model_path': drive_model_path,
}
model_handler = MultiOutputClassificationModel(**handler_options)

Using device: cuda


In [13]:
# 3. Train the Model
# Trains on the current `df`: 'text' is the input column, while 'category' and
# 'problem_type' are passed as the category and sub-category label columns.
training_options = dict(
    text_column='text',
    category_column='category',
    subcategory_column='problem_type',
    num_epochs=5,
    batch_size=4,
)
model_handler.train(df=df, **training_options)


--- No model found. Starting training from scratch. ---
Categories: ['banking', 'education', 'government', 'health', 'housing', 'insurance', 'shopping', 'technology', 'transport']
Sub-Categories: ['account locked', 'atm error', 'bad customer service', 'claim denied', 'corruption', 'course registration', 'credit card issues', 'data loss', 'delayed bus', 'document processing', 'emergency response', 'eviction', 'exam stress', 'fake product', 'flight cancellation', 'fraud issues', 'fraud transaction', 'fuel price', 'hardware failure', 'hospital delay', 'insurance claim', 'landlord problem', 'late delivery', 'legal confusion', 'loan problems', 'maintenance issue', 'medicine shortage', 'neighbor issue', 'network problem', 'online learning issues', 'other', 'policy confusion', 'poor quality', 'poor teaching', 'public service delay', 'refund issues', 'rent increase', 'road accident', 'slow performance', 'slow response', 'software bug', 'tax issue', 'traffic jam', 'tuition fees', 'wrong diagno

In [14]:
# 4. Predict on new text
def report_prediction(text):
    """Predict on ``text`` with ``model_handler``, print a summary, and return the result.

    Args:
        text: The raw text to classify.

    Returns:
        The object returned by ``model_handler.predict(text)``. This helper reads
        its 'category' and 'sub_category' entries, each of which is indexed by
        'prediction', 'confidence' and 'top_predictions'.
    """
    prediction = model_handler.predict(text)
    category = prediction['category']
    sub_category = prediction['sub_category']

    print(f"\nText: '{text}'")
    print(f"Predicted Category: {category['prediction']} (Conf: {category['confidence']:.2f})")
    print(f"Predicted Sub-Category: {sub_category['prediction']} (Conf: {sub_category['confidence']:.2f})")
    print("Top Category Predictions:", category['top_predictions'])
    print("Top Sub-Category Predictions:", sub_category['top_predictions'])
    return prediction


print("\n--- Making Predictions ---")

# The inputs and results keep their original names so later cells can still use them.
test_text_1 = "My power is out again, this is the third time this week."
prediction_1 = report_prediction(test_text_1)

test_text_2 = "I am being charged for services I did not ask for on my internet bill."
prediction_2 = report_prediction(test_text_2)


--- Making Predictions ---

Text: 'My power is out again, this is the third time this week.'
Predicted Category: technology (Conf: 0.57)
Predicted Sub-Category: other (Conf: 0.99)
Top Category Predictions: {'technology': 0.5743008255958557, 'education': 0.17742657661437988, 'banking': 0.12447728216648102}
Top Sub-Category Predictions: {'other': 0.9945005178451538, 'hardware failure': 0.000789194367825985, 'document processing': 0.0005087854224257171}

Text: 'I am being charged for services I did not ask for on my internet bill.'
Predicted Category: technology (Conf: 0.95)
Predicted Sub-Category: other (Conf: 0.37)
Top Category Predictions: {'technology': 0.9542701244354248, 'shopping': 0.01935923658311367, 'banking': 0.01373559981584549}
Top Sub-Category Predictions: {'other': 0.36710110306739807, 'hardware failure': 0.16038013994693756, 'network problem': 0.13688674569129944}
