1. Setup and Imports

In [14]:
# Cell 1: Setup and Imports
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import time

# Add project root to Python path
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import your modules
from src.data.data_processing import DataProcessor
from src.models.model_training import ModelTrainer
import yaml

# Load configuration
with open('configs/config.yml', 'r') as f:
    config = yaml.safe_load(f)

# Global variables
trainer = None
data_processor = None

# Initialize NLTK resources
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
except Exception as e:
    print(f"Warning during NLTK resources download: {e}")

# Initialize data processor with TF-IDF configuration
data_processor = DataProcessor('configs/config.yml')

# Configure vectorizer from config
text_features_config = config.get('features', {}).get('text_features', {})
data_processor.vectorizer = TfidfVectorizer(
    max_features=text_features_config.get('max_features', 50),
    min_df=text_features_config.get('min_df', 0.02),
    max_df=text_features_config.get('max_df', 0.90),
    ngram_range=(1, 2)
)

2. Load Saved Model

In [15]:
# Cell 2: Load Saved Model
def load_saved_model(model_path: str, config: Dict) -> ModelTrainer:
    """
    Load a saved model and its associated vectorizer state.
    
    Args:
        model_path: Path to the saved model
        config: Configuration dictionary
    
    Returns:
        ModelTrainer: Initialized and loaded model trainer
    """
    try:
        # Load model data
        import joblib
        model_data = joblib.load(model_path)
        
        print("Available keys in model_data:", list(model_data.keys()))
        
        # Initialize trainer
        trainer = ModelTrainer(config)
        
        # Initialize model type
        from src.models.model_factory import ModelFactory, LearningType
        
        print(f"Loading model type: {model_data['model_type']}")
        print(f"Loading learning type: {model_data['learning_type']}")
        
        if model_data['model_type'] == "classifier":
            trainer.learning_type = LearningType.SUPERVISED_CLASSIFICATION
            trainer.model = ModelFactory.create_model(
                LearningType.SUPERVISED_CLASSIFICATION, 
                config
            )
        else:
            raise ValueError(f"Unsupported model type: {model_data['model_type']}")
        
        # Load model state
        trainer.model.model = model_data['model']
        trainer.model.config = model_data['config']
        trainer.model.training_history = model_data['training_history']
        trainer.model.is_fitted = model_data['is_fitted']
        
        # Store training features if available
        if 'training_features' in model_data:
            trainer.training_features = model_data['training_features']
            print(f"Loaded {len(trainer.training_features)} training features")
        
        print(f"Model loaded successfully from: {model_path}")
        return trainer
        
    except Exception as e:
        print(f"Error loading model details:")
        print(f"Model path: {model_path}")
        if 'model_data' in locals():
            print(f"Model data keys: {model_data.keys()}")
        raise

# Load model
try:
    MODEL_PATH = "models/example_model/0.1.0/Deep Trees Model_20250113_230521_20250113_230521"
    trainer = load_saved_model(MODEL_PATH, config)
    print("Model type:", trainer.model.model_type)
    print("Learning type:", trainer.model.learning_type)
except Exception as e:
    print(f"Error loading model: {str(e)}")

Loading model type: classifier
Loading learning type: supervised

 MODEL FACTORY: Random Forest Classifier will be used for training with the following parameters: 
{'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
Model and vectorizer loaded successfully from: models/example_model/0.1.0/Deep Trees Model_20250113_210714
Model type: classifier
Learning type: supervised


3. Prediction Function

In [10]:
# Cell 3: Prediction Function
def predict_sentiments(data: List[Dict], data_processor: DataProcessor, trainer: ModelTrainer) -> List[Dict]:
    """
    Predict sentiments for new tweets using consistent feature processing.
    
    Args:
        data: List of dictionaries containing tweet data
        data_processor: Initialized DataProcessor
        trainer: Trained ModelTrainer with loaded model
    
    Returns:
        List of dictionaries with predictions added
    """
    try:
        # Convert input to DataFrame
        df = pd.DataFrame(data)
        
        # Process data ensuring consistent features
        processed_df = data_processor.process_data(
            df, 
            data_file_name='NO_TO_SAVE', 
            is_prediction=True
        )
        
        # Get numeric and categorical encoded columns
        feature_columns = processed_df.columns[
            (processed_df.dtypes != 'object') | 
            (processed_df.columns.str.endswith('_categorical_encoded'))
        ].tolist()
        
        # Remove target column if present
        if 'sentiment_categorical_encoded' in feature_columns:
            feature_columns.remove('sentiment_categorical_encoded')
        
        # Organize features in the same order as training
        if hasattr(trainer, 'training_features'):
            common_features = [f for f in trainer.training_features if f in feature_columns]
            if not common_features:
                raise ValueError("No common features found between training and prediction data")
            X = processed_df[common_features]
        else:
            X = processed_df[feature_columns]
        
        print("Using features for prediction:", X.columns.tolist())
        
        # Make predictions
        predictions = trainer.predict(X)
        
        # Format results
        results = []
        for i, item in enumerate(data):
            result = item.copy()
            result['predicted_sentiment'] = predictions[i]
            results.append(result)
        
        return results
        
    except Exception as e:
        print("Error during prediction processing:")
        print(f"Available features: {processed_df.columns.tolist()}")
        raise

# Test data
test_payload = [
    {
        "tweet_id": 123456789,
        "from_platform": "Nvidia",
        "tweet": "NVIDIA still the big boss of hardware AI technologies"
    },
    {
        "tweet_id": 987654321,
        "from_platform": "Nvidia",
        "tweet": "What's going wrong with this firm. They are producing a bullshit"
    }
]

4. Make Predictions

In [11]:
# Cell 4: Make Predictions
try:
    results = predict_sentiments(test_payload, data_processor, trainer)
    
    # Display results
    print("\nPrediction Results:")
    print("-" * 80)
    for result in results:
        print(f"\nTweet ID: {result['tweet_id']}")
        print(f"Platform: {result['from_platform']}")
        print(f"Tweet: {result['tweet']}")
        print(f"Predicted Sentiment: {result['predicted_sentiment']}")
        print("-" * 80)
except Exception as e:
    print(f"Prediction error: {str(e)}")

[nltk_data] Downloading package punkt to /Users/mdabo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mdabo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mdabo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mdabo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Error during prediction processing:
Available features: ['tweet_id', 'tweet_feature_ai', 'tweet_feature_ai technology', 'tweet_feature_big', 'tweet_feature_big bos', 'tweet_feature_bos', 'tweet_feature_bos hardware', 'tweet_feature_bullshit', 'tweet_feature_firm', 'tweet_feature_firm producing', 'tweet_feature_going', 'tweet_feature_going wrong', 'tweet_feature_hardware', 'tweet_feature_hardware ai', 'tweet_feature_nvidia', 'tweet_feature_nvidia still', 'tweet_feature_producing', 'tweet_feature_producing bullshit', 'tweet_feature_still', 'tweet_feature_still big', 'tweet_feature_technology', 'tweet_feature_wrong', 'tweet_feature_wrong firm', 'from_platform_categorical_encoded']
Prediction error: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- tweet_feature_ai
- tweet_feature_ai technology
- tweet_feature_big
- tweet_feature_big bos
- tweet_feature_bos
- ...
Feature names seen at fit time, yet now missing:
- tweet_feature_amazon
- tw

5. Utility Functions for Different Input Types

In [None]:
# Cell 5: Utility Functions
def load_json_from_file(file_path: str) -> List[Dict]:
    """Load test data from JSON file"""
    with open(file_path, 'r') as f:
        return json.load(f)

def predict_from_json_file(file_path: str) -> List[Dict]:
    """Make predictions from JSON file"""
    data = load_json_from_file(file_path)
    return predict_sentiments(data, data_processor, trainer)

def predict_single_tweet(tweet_text: str, platform: str = "Unknown") -> Dict:
    """Make prediction for a single tweet"""
    data = [{
        "tweet_id": int(time.time()),
        "from_platform": platform,
        "tweet": tweet_text
    }]
    results = predict_sentiments(data, data_processor, trainer)
    return results[0]

6. Example Usage

In [None]:
# Cell 6: Example Usage
try:
    # 1. Predict from JSON string
    json_str = '''
    [
        {
            "tweet_id": 123456789,
            "from_platform": "Nvidia",
            "tweet": "NVIDIA still the big boss of hardware AI technologies"
        },
        {
            "tweet_id": 987654321,
            "from_platform": "Nvidia",
            "tweet": "What's going wrong with this firm. They are producing a bullshit"
        }
    ]
    '''
    test_data = json.loads(json_str)
    results = predict_sentiments(test_data, data_processor, trainer)

    # 2. Predict single tweet
    single_result = predict_single_tweet(
        "NVIDIA's new GPU is amazing!",
        platform="Twitter"
    )

    # Display results
    print("\nSingle Tweet Prediction:")
    print(f"Tweet: {single_result['tweet']}")
    print(f"Predicted Sentiment: {single_result['predicted_sentiment']}")
except Exception as e:
    print(f"Example usage error: {str(e)}")