1. Setup and Imports

In [None]:
# Cell 1: Setup and Imports
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import time
import nltk

# Put the parent of the current working directory on sys.path (once),
# presumably so the `src` package imported below can be resolved.
project_root = str(Path.cwd().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Project modules
from src.data.data_processing import DataProcessor
from src.models.model_training import ModelTrainer

# Load YAML configuration
import yaml
# Read the file as UTF-8 explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open('configs/config.yml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Initialize data_processor from the same config file
# (we will re-use it, but in is_prediction mode).
data_processor = DataProcessor('configs/config.yml')

# Initialize NLTK resources quietly.
# NOTE: nltk.download() normally reports a failed download by returning False
# (raise_on_error defaults to False) instead of raising, so the return value is
# checked here; the except clause only covers unexpected errors. Each resource
# is attempted independently so one failure does not skip the others.
for _resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    try:
        if not nltk.download(_resource, quiet=True):
            print(f"[WARN] NLTK download failed: {_resource}")
    except Exception as e:
        print(f"[WARN] NLTK download failed: {e}")

print("[INFO] Setup complete.")


2. Load Saved Model

In [None]:
# Cell 2: Load the Saved Model
import joblib
from src.models.model_factory import ModelFactory, LearningType

def load_saved_model(model_path: str, vectorizer_path: str, config: Dict) -> Tuple[ModelTrainer, DataProcessor]:
    """
    Load a saved model (including training history and features) together
    with a fitted vectorizer.

    Args:
        model_path: Path of the joblib file holding the saved model dictionary.
        vectorizer_path: Path of the joblib file holding the fitted vectorizer.
        config: Configuration handed to ModelTrainer and ModelFactory.

    Returns:
        A ``(trainer, data_processor)`` tuple: the ModelTrainer populated from
        the saved file, and a new DataProcessor whose ``.vectorizer`` is the
        loaded vectorizer.

    Raises:
        KeyError: If the saved dictionary lacks one of the required keys
            ('model', 'config', 'training_history', 'is_fitted').
        ValueError: If the saved 'model_type' is neither "classifier" nor
            "regressor".
    """
    # SECURITY NOTE: joblib.load is pickle-based and can execute arbitrary code
    # while loading. Only point this at files you created or otherwise trust.
    model_data = joblib.load(model_path)
    print("[INFO] Loaded model_data keys:", list(model_data.keys()))

    # Fail early, with one clear message, if the file is not a complete bundle
    required_keys = ('model', 'config', 'training_history', 'is_fitted')
    missing_keys = [key for key in required_keys if key not in model_data]
    if missing_keys:
        raise KeyError(f"Saved model is missing required keys: {missing_keys}")

    # Create a ModelTrainer
    trainer = ModelTrainer(config)

    # Identify the learning type / model type from the saved object
    model_type = model_data.get('model_type', 'classifier')
    learning_type = model_data.get('learning_type', 'supervised')
    print(f"[INFO] Model type from file: {model_type}")
    print(f"[INFO] Learning type from file: {learning_type}")

    # Map the saved model type onto the factory's learning type
    if model_type == "classifier":
        resolved_learning_type = LearningType.SUPERVISED_CLASSIFICATION
    elif model_type == "regressor":
        resolved_learning_type = LearningType.SUPERVISED_REGRESSION
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Construct a new model wrapper of the matching kind
    trainer.learning_type = resolved_learning_type
    trainer.model = ModelFactory.create_model(resolved_learning_type, config)

    # Replace the wrapper's state with what was saved
    # (the 'model' entry is presumably the fitted sklearn estimator)
    trainer.model.model = model_data['model']
    trainer.model.config = model_data['config']
    trainer.model.training_history = model_data['training_history']
    trainer.model.is_fitted = model_data['is_fitted']

    # Training features are optional in the saved file; load them when present
    if 'training_features' in model_data:
        trainer.training_features = model_data['training_features']
        print(f"[INFO] Loaded {len(trainer.training_features)} training features.")

    # Load the fitted vectorizer (same trust caveat as the model file above)
    fitted_vectorizer = joblib.load(vectorizer_path)
    print("[INFO] Loaded fitted vectorizer from:", vectorizer_path)

    # Build a fresh DataProcessor and hand it the fitted vectorizer
    processor = DataProcessor('configs/config.yml')
    processor.vectorizer = fitted_vectorizer
    print("[INFO] DataProcessor is now holding the fitted vectorizer in .vectorizer")

    # Return both the trainer and the data processor
    return trainer, processor

# Example usage of load_saved_model.
# Adapt both paths to wherever the artifacts were actually saved.
MODEL_PATH = "models/example_model/0.1.0/Deep Trees Model_20250116_172241_20250116_172241"

# Fitted vectorizer saved as its own file. (If it was saved inside the same
# file as the model, it could be read from model_data['vectorizer'] instead.)
VECTORIZER_PATH = "models/tfidf_vectorizer.joblib"

try:
    trainer, data_processor = load_saved_model(MODEL_PATH, VECTORIZER_PATH, config)
    print("[INFO] Model loaded. Trainer learning_type =", trainer.learning_type)
except Exception as load_error:
    # Best effort: report the problem and let the notebook continue
    print("[ERROR] Model loading failed:", load_error)


3. Prediction Function

In [None]:
# Cell 3: Predicting from a list of dictionaries
def predict_sentiments(
    data: List[Dict],
    data_processor: DataProcessor,
    trainer: ModelTrainer
) -> List[Dict]:
    """
    Process an incoming data payload and predict with the loaded model.

    No CSV file is involved: the feature matrix is built from the payload.

    Args:
        data: Payload records, one dict per item to predict.
        data_processor: Processor whose ``process_data`` is called with
            ``is_prediction=True``.
        trainer: Trainer whose ``predict`` is called on the selected features.
            If it has a non-empty ``training_features`` attribute, only those
            columns (in that order) are passed to the model.

    Returns:
        A new list with a shallow copy of each input dict plus a
        "predicted_sentiment" key. An empty payload yields an empty list.

    Raises:
        ValueError: If none of the trainer's training features appear in the
            processed data, or if the number of predictions differs from the
            number of payload items.
    """
    # Nothing to predict: skip processing and the model call entirely
    if not data:
        return []

    # 1) Convert the input payload into a DataFrame
    df = pd.DataFrame(data)

    # 2) Process data in prediction mode (per the flag name, the vectorizer is
    #    expected to transform rather than re-fit)
    processed_df = data_processor.process_data(
        df,
        data_file_name='NO_SAVE.csv',
        is_prediction=True
    )

    # 3) Identify the set of columns that the model expects
    training_features = getattr(trainer, 'training_features', None)
    if training_features:
        # Set lookup keeps the intersection linear in the number of features
        available_columns = set(processed_df.columns)
        common_features = [col for col in training_features if col in available_columns]
        if not common_features:
            raise ValueError("No overlapping features between processed DF and trainer's training_features.")

        # Make dropped features visible instead of omitting them silently
        missing_count = len(training_features) - len(common_features)
        if missing_count:
            print(f"[WARN] {missing_count} training feature(s) are absent from the processed data and were omitted.")

        # Select the shared columns in training order
        X = processed_df[common_features]
    else:
        # No stored training features: pass every processed column through
        X = processed_df

    print("\n[INFO] Using features for prediction:", list(X.columns))

    # 4) Predict
    predictions = trainer.predict(X)

    # Guard against misalignment, e.g. if processing dropped or added rows
    if len(predictions) != len(data):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(data)} input items; "
            "cannot align predictions with the payload."
        )

    # 5) Attach predictions to a copy of each original item, pairing by position
    results = []
    for row, prediction in zip(data, predictions):
        # Unwrap numpy scalars so results hold plain Python values
        if isinstance(prediction, np.generic):
            prediction = prediction.item()
        results.append({**row, "predicted_sentiment": prediction})

    return results


4. Make Predictions

In [None]:
# Cell 4: Simulated Test Payload & Predictions

# Stand-in for the records a client might send in an API call
test_payload = [
    {
        "tweet_id": 123456789,
        "from_platform": "Nvidia",
        "tweet": "NVIDIA still the big boss of hardware AI technologies"
    },
    {
        "tweet_id": 987654321,
        "from_platform": "Nvidia",
        "tweet": "Fuck ! What's going wrong with this firm? They are producing a boring product"
    }
]

try:
    # Run the payload through the prediction function
    prediction_results = predict_sentiments(test_payload, data_processor, trainer)

    # Print every record followed by a separator line
    print("\n[INFO] Prediction Results:")
    for item in prediction_results:
        print("Tweet ID:", item["tweet_id"])
        print("Platform:", item["from_platform"])
        print("Tweet:", item["tweet"])
        print("Predicted Sentiment:", item["predicted_sentiment"])
        print("-" * 70)
except Exception as prediction_error:
    # Best effort: report the failure and let the notebook continue
    print("[ERROR] Prediction error:", prediction_error)


5. Utility Functions for Different Input Types

In [None]:
# Cell 5: Additional Utilities

def load_json_from_file(file_path: str) -> List[Dict]:
    """
    Load incoming data from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (annotated as a list of dicts; the content is
        not validated here).
    """
    import json
    # Decode as UTF-8 explicitly rather than relying on the platform's default
    # locale encoding, which may mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def predict_from_json_file(file_path: str) -> List[Dict]:
    """
    Read a JSON payload from ``file_path`` and return its predictions.

    Uses the module-level ``data_processor`` and ``trainer``.
    """
    return predict_sentiments(
        load_json_from_file(file_path),
        data_processor,
        trainer,
    )

def predict_single_tweet(
    tweet_text: str, 
    platform: str = "Unknown", 
    tweet_id: int = None
) -> Dict:
    """
    Predict sentiment for a single tweet string.

    The text is wrapped in a one-item payload and sent through
    ``predict_sentiments`` with the module-level ``data_processor`` and
    ``trainer``; the single resulting record is returned.
    """
    # No ID supplied: fall back to the current Unix time in whole seconds
    effective_id = int(time.time()) if tweet_id is None else tweet_id
    record = {
        "tweet_id": effective_id,
        "from_platform": platform,
        "tweet": tweet_text
    }
    return predict_sentiments([record], data_processor, trainer)[0]

# Example usage: classify one tweet and print the resulting record
try:
    single_tweet_text = "I love the new GPU performance from Nvidia!"
    single_pred = predict_single_tweet(single_tweet_text, "Twitter")
    print("\nSingle tweet prediction ->", single_pred)
except Exception as single_error:
    # Best effort: report the failure and let the notebook continue
    print("[ERROR] Single tweet prediction error:", single_error)


6. Example Usage

In [None]:
# Cell 6: Example Usage
# The call to predict_single_tweet was previously only half commented out,
# which left the cell unparseable and `single_result` undefined. Both
# prediction calls are active again so the results printed below exist.
try:
    # 1. Predict from JSON string
    json_str = '''
    [
        {
            "tweet_id": 123456789,
            "from_platform": "Nvidia",
            "tweet": "NVIDIA still the big boss of hardware AI technologies"
        },
        {
            "tweet_id": 987654321,
            "from_platform": "Nvidia",
            "tweet": "What's going wrong with this firm. They are producing a bullshit"
        }
    ]
    '''
    test_data = json.loads(json_str)
    results = predict_sentiments(test_data, data_processor, trainer)

    # 2. Predict single tweet
    single_result = predict_single_tweet(
        "NVIDIA's new GPU is amazing!",
        platform="Twitter"
    )

    # Display results
    print("\nBatch Predictions:")
    for item in results:
        print(f"Tweet: {item['tweet']}")
        print(f"Predicted Sentiment: {item['predicted_sentiment']}")

    print("\nSingle Tweet Prediction:")
    print(f"Tweet: {single_result['tweet']}")
    print(f"Predicted Sentiment: {single_result['predicted_sentiment']}")
except Exception as e:
    # Best effort: report the failure and let the notebook continue
    print(f"Example usage error: {str(e)}")