In [3]:
!pip install loguru



In [4]:
# @title ## 1. Setup: Mount Google Drive & Install Libraries
# Purpose: This cell connects your Colab environment to your Google Drive
# and installs the required 'transformers' library for BERT.

from google.colab import drive
import os

# Mount your Google Drive. You'll be prompted to authorize this.
drive.mount('/content/drive')

# Install the Hugging Face transformers library quietly
!pip install -q transformers

print("✅ Google Drive mounted and libraries installed.")


# @title ## 2. Clone Your GitHub Repository
# Purpose: This cell downloads your code and data from GitHub into the Colab environment.

# Define the repository URL
repo_url = "https://github.com/nhahub/NHA-112.git"
repo_name = "NHA-112"

# Clone the repository
if not os.path.exists(repo_name):
    !git clone {repo_url}
else:
    print("Repository already cloned.")

# Change the working directory into your repository
os.chdir(repo_name)
print(f"✅ Repository cloned. Current working directory: {os.getcwd()}")



Mounted at /content/drive
✅ Google Drive mounted and libraries installed.
Cloning into 'NHA-112'...
remote: Enumerating objects: 112, done.
remote: Counting objects: 100% (94/94), done.
remote: Compressing objects: 100% (73/73), done.
remote: Total 112 (delta 39), reused 72 (delta 17), pack-reused 18 (from 1)
Receiving objects: 100% (112/112), 83.00 MiB | 44.13 MiB/s, done.
Resolving deltas: 100% (39/39), done.
✅ Repository cloned. Current working directory: /content/NHA-112


In [5]:
# @title ## 3. Import the SentimentAnalysisModel Class
# Purpose: Put the repository checkout on Python's import path, then import
# the third-party libraries and the project's SentimentAnalysisModel class.
import sys
import pandas as pd
# NOTE(review): train_test_split is not used by any cell visible in this notebook.
from sklearn.model_selection import train_test_split

# '.' is resolved against the current working directory, so this relies on an
# earlier cell having changed into the repository root.
sys.path.append('.')

# Project-local model wrapper used by the training and prediction cells
from dataEngineer.modeling.Deeplearning import SentimentAnalysisModel

print("✅ SentimentAnalysisModel class imported successfully.")

2025-10-17 13:25:50.474 | INFO     | dataEngineer.config:<module>:11 - PROJ_ROOT path is: /content/NHA-112


✅ SentimentAnalysisModel class imported successfully.


In [6]:
!unzip /content/NHA-112/data/raw/Clean_ready_data.zip

Archive:  /content/NHA-112/data/raw/Clean_ready_data.zip
  inflating: Clean_ready_data.csv    


In [7]:
# @title ## 4. Load and Preprocess the Data
# Purpose: Read the review CSV, discard incomplete rows, and prepare the
# categorical labels ('positive', 'negative', 'neutral') the classifier uses.

# Location of the extracted CSV inside the cloned repository
data_path = "/content/NHA-112/Clean_ready_data.csv"

print("Loading data...")

# --- Data Cleaning and Preprocessing ---
# The file is read with read_csv's default (comma) delimiter; rows lacking
# either the review text or its rating are discarded straight away.
required_columns = ['Text', 'Rating']
df = pd.read_csv(data_path).dropna(subset=required_columns)

# Function to map numerical Ratings to sentiment categories
def to_sentiment(Rating):
    """Map a numeric rating onto a coarse sentiment label.

    The rating is converted with int() before it is compared, so any
    fractional part is truncated.

    Args:
        Rating: A value accepted by int(), e.g. 4, 4.0 or "4".

    Returns:
        'negative' for ratings of 2 or lower, 'neutral' for a rating of 3,
        and 'positive' for anything higher.
    """
    stars = int(Rating)
    if stars > 3:
        return 'positive'
    if stars > 2:
        return 'neutral'
    return 'negative'

# Derive the categorical 'sentiment' label for every row from its rating
df['sentiment'] = df['Rating'].apply(to_sentiment)

# --- Create a smaller sample for faster training ---
# The full dataset is large. For a quick demo, draw a class-balanced sample:
# up to SAMPLES_PER_CLASS rows per sentiment (6,000 rows in total when every
# class is large enough). The whole dataset remains available in `df`.
SAMPLES_PER_CLASS = 2000
SAMPLE_SEED = 42

print("Creating a smaller, balanced sample for demonstration...")
# Cap the draw at the smallest class so sampling cannot fail on a rare class
# and the classes stay balanced.
n_per_class = min(SAMPLES_PER_CLASS, int(df['sentiment'].value_counts().min()))
# Sampling each group explicitly (instead of DataFrameGroupBy.apply) avoids
# the pandas deprecation warning about apply operating on the grouping
# column, keeps the 'sentiment' column, and draws the same rows per class.
sample_df = pd.concat(
    [
        group.sample(n_per_class, random_state=SAMPLE_SEED)
        for _, group in df.groupby('sentiment')
    ],
    ignore_index=True,
)

print("\nData loading and preprocessing complete.")
print(f"Sample size: {len(sample_df)} Texts.")
print("\nValue counts in the sample:")
print(sample_df['sentiment'].value_counts())
print("\nSample data preview:")
print(sample_df.head())


Loading data...
Creating a smaller, balanced sample for demonstration...

Data loading and preprocessing complete.
Sample size: 6000 Texts.

Value counts in the sample:
sentiment
negative    2000
neutral     2000
positive    2000
Name: count, dtype: int64

Sample data pText:
   Unnamed: 0  Rating                              Summary  \
0      228933       2                     erroneous  image   
1      425026       2                          not gourmet   
2      367473       1  showed more interest in a dust ball   
3      503623       2                real candy big pieces   
4      223821       1         worst of all timothys k cups   

                                                Text sentiment  
0  dont be fooled by the carton image showing a r...  negative  
1  not gourmet by any stretch of the imagination ...  negative  
2  my cats love catnip they have always reacted t...  negative  
3  i bought it and i was happy to see big pieces ...  negative  
4  i have tried all of tim

  sample_df = df.groupby('sentiment').apply(lambda x: x.sample(2000, random_state=42)).reset_index(drop=True)


In [8]:
# List the contents of a saved-model folder on Google Drive; `|| true` keeps
# the command's exit status zero when the folder does not exist.
# NOTE(review): this inspects bert_product_review_model3, whereas the training
# cell writes to bert_product_review_model20 — confirm which folder is intended.
!ls -la /content/drive/MyDrive/bert_product_review_model3 || true

total 486994
-rw------- 1 root root 498681185 Oct 17 12:39 sentiment_classifier.pth


In [None]:
# Step 1: Make sure Google Drive is mounted.
# An authorization prompt may appear if it is not mounted yet.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully!")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    # Do not carry on without Drive: the makedirs call below would otherwise
    # create the target folder on the runtime's local disk, and the cell would
    # still report the model as saved to Google Drive.
    raise


# Step 2: Choose where the model file lives on Drive
drive_model_path = '/content/drive/MyDrive/bert_product_review_model20/sentiment_classifier.pth'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(drive_model_path), exist_ok=True)

print(f"\nModel will be saved to: {drive_model_path}")

# Step 3: Initialize the classifier, pointing it to the Google Drive path.
# 'distilbert-base-uncased' is used for a faster training session.
sentiment_analyzer = SentimentAnalysisModel(
    model_name='distilbert-base-uncased',
    model_path=drive_model_path
)

# Step 4: Train the model.
# NOTE(review): this trains on the full `df`; the balanced `sample_df` built in
# the preprocessing cell is not used by any visible cell, while the batch-size
# remark below mentions a small dataset — confirm which frame is intended.
sentiment_analyzer.train(
    df=df,
    text_column='Text',
    label_column='sentiment',
    num_epochs=3,
    batch_size=8  # Reduced batch size for small dataset
)

print("\n🎉 Model training is complete and the model has been saved to your Google Drive!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!

Model will be saved to: /content/drive/MyDrive/bert_product_review_model20/sentiment_classifier.pth


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Using device: cuda

--- No model found. Starting training from scratch. ---


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Categories: ['negative', 'neutral', 'positive']
Starting training for 3 epochs with learning rate 2e-05...


In [None]:
# Build a separate model wrapper pointing at the model file on Drive.
predictor = SentimentAnalysisModel(
    model_name='distilbert-base-uncased',
    model_path=drive_model_path
)

# Per the original author's note, .predict() loads the saved model on demand
# when it is not in memory yet (not verifiable from this notebook — see the
# SentimentAnalysisModel source).

new_text_positive = "This was a fantastic experience, I would recommend it to all my friends."
new_text_negative = "I am so disappointed with this. It broke after just one use."

# Run the same predict-and-report steps for each example text.
for polarity, example_text in (
    ("Positive", new_text_positive),
    ("Negative", new_text_negative),
):
    print(f"\n--- Predicting for {polarity} Text ---")
    print(f"Text: '{example_text}'")
    sentiment, confidence, top_preds = predictor.predict(example_text)
    # Nothing is reported when predict() yields a falsy sentiment
    if sentiment:
        print(f"Predicted Sentiment: {sentiment}")
        print(f"Confidence: {confidence:.2%}")
        print(f"Top Predictions: {top_preds}")