In [29]:
import pandas as pd
import numpy as np
import json
import time
import logging
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [30]:
# Configure root logging once for the notebook: INFO and above, timestamped
# records, emitted through a single stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger shared by the functions defined in the cells below.
logger = logging.getLogger(__name__)

In [31]:
# Load data
def load_data(jsonl_file_path):
    """Read a JSON Lines file and return its records as a list.

    Whitespace-only lines are skipped. A line that is not valid JSON is
    reported with a warning (showing its first 50 characters) and skipped; it
    does not abort the load. Any other failure, such as the file not being
    openable, is logged as an error and re-raised.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        list: One decoded JSON value per successfully parsed line, in file
        order.

    Raises:
        Exception: Whatever error interrupted the load, after it was logged.
    """
    logger.info(f"Starting to load JSONL data: {jsonl_file_path}")
    started_at = time.time()

    records = []
    try:
        with open(jsonl_file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                # Only lines with visible content can hold a record.
                if raw_line.strip():
                    try:
                        parsed = json.loads(raw_line)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse line: {raw_line[:50]}... Error: {str(e)}")
                    else:
                        records.append(parsed)

        logger.info(f"Data loaded successfully! Loaded {len(records)} movie records")
        logger.info(f"Data loading time: {time.time() - started_at:.2f} seconds")
        return records
    except Exception as e:
        # Log for the notebook reader, then let the caller decide what to do.
        logger.error(f"Data loading failed: {str(e)}")
        raise

In [32]:
# Learn relationship between embeddings and box office revenue
def learn_embedding_box_office_relationship(embeddings, box_office_values):
    """Fit an MLP regressor from embedding vectors to standardized box office.

    Args:
        embeddings: 2-D array-like passed to the regressor as its inputs, one
            row per sample.
        box_office_values: Array exposing ``reshape``, holding one box office
            value per embedding row.

    Returns:
        tuple: ``(model, scaler)`` where ``model`` is the fitted
        ``MLPRegressor`` and ``scaler`` is the ``StandardScaler`` fitted on the
        box office values (needed to map predictions back to the original
        scale).
    """
    # Standardize the target as a single column so the network trains on
    # zero-mean / unit-variance values.
    target_scaler = StandardScaler()
    target_column = box_office_values.reshape(-1, 1)
    standardized_target = target_scaler.fit_transform(target_column)

    mlp_settings = dict(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42,
    )
    regressor = MLPRegressor(**mlp_settings)

    # The regressor expects a flat target vector, not a column.
    regressor.fit(embeddings, standardized_target.ravel())
    return regressor, target_scaler

In [33]:
def _parse_number(value, cast=float, default=0):
    """Convert a raw metadata value to a number, falling back to ``default``.

    Values that ``cast`` accepts directly are converted as-is. Strings are
    otherwise stripped of double quotes, dollar signs, thousands separators and
    surrounding whitespace and parsed via ``float``. ``None`` and anything that
    still cannot be parsed (for example a ``\\N`` placeholder) yield
    ``cast(default)``, the same value a missing key gets.
    """
    try:
        return cast(value)
    except (TypeError, ValueError, OverflowError):
        pass
    if isinstance(value, str):
        value = value.replace('"', '').replace('$', '').replace(',', '').strip()
    try:
        return cast(float(value))
    except (TypeError, ValueError, OverflowError):
        return cast(default)


def _count_listed(value):
    """Count the entries of a comma-separated string (or sized container); 0 if empty."""
    if not value:
        return 0
    if isinstance(value, str):
        return len(value.split(','))
    try:
        return len(value)
    except TypeError:
        return 0


def _metadata_features(movie):
    """Build the metadata part of a movie's feature dict from its raw record."""
    genres = movie.get('genres') or ''
    writers = movie.get('writers')
    directors = movie.get('directors')
    return {
        'movie_id': movie.get('movie_id', ''),
        'averageRating': _parse_number(movie.get('averageRating'), float),
        'numVotes': _parse_number(movie.get('numVotes'), int),
        'runtimeMinutes': _parse_number(movie.get('runtimeMinutes'), int),
        'startYear': _parse_number(movie.get('startYear'), float),
        'isAdult': _parse_number(movie.get('isAdult'), int),
        'genres': genres,
        'genre_count': _count_listed(genres),
        'has_director': 1 if directors else 0,
        'has_writer': 1 if writers else 0,
        'writer_count': _count_listed(writers),
        'director_count': _count_listed(directors),
        'box_office': _parse_number(movie.get('worldwide_box_office'), float),
    }


def _zero_embedding_features(movie_features, embedding_dimension, include_embed=True):
    """Write zero placeholders for the per-dimension embedding columns (in place)."""
    for j in range(embedding_dimension):
        if include_embed:
            movie_features[f'embed_{j}'] = 0
        movie_features[f'positive_embed_{j}'] = 0
        movie_features[f'negative_embed_{j}'] = 0
        movie_features[f'scaled_embed_{j}'] = 0


def _fallback_review_embedding(reviews, embedding_dimension):
    """Bag-of-words stand-in used when no sentence model is available.

    Slot ``j`` holds the fraction of reviews that contain (as a substring) the
    ``j``-th most common lower-cased word across all reviews.
    """
    from collections import Counter

    lowered = [review.lower() for review in reviews]
    word_counts = Counter(' '.join(lowered).split())
    reduced = np.zeros(embedding_dimension)
    for j, (word, _) in enumerate(word_counts.most_common(embedding_dimension)):
        reduced[j] = sum(1 for text in lowered if word in text) / len(reviews)
    return reduced


def process_data(movies_data, model_name=None):
    """Turn raw movie records into a feature DataFrame.

    For every movie this extracts metadata features, review-embedding features
    (SentenceTransformer when it loads, a word-frequency fallback otherwise),
    VADER sentiment statistics over at most the first 20 reviews, and an
    ``embedding_predicted_trend`` column produced by a small regressor fitted
    on review embeddings vs. box office.

    NOTE(review): ``scaled_embed_*`` is computed from ``box_office`` itself and
    ``embedding_predicted_trend`` is an in-sample prediction of ``box_office``.
    Both encode the value that ``prepare_features`` later uses (log-transformed)
    as the regression target, so using them as model inputs leaks the target.
    They are still emitted here to keep the output schema unchanged.

    Args:
        movies_data: Iterable of dict records (as returned by ``load_data``).
        model_name: SentenceTransformer model to load. ``None`` (the default)
            or an empty string selects ``'all-MiniLM-L6-v2'``.

    Returns:
        pandas.DataFrame: One row per movie, one column per feature.
    """
    sbert_name = model_name or 'all-MiniLM-L6-v2'
    try:
        sentence_model = SentenceTransformer(sbert_name)
        logger.info(f"Successfully loaded SentenceBERT model: {sbert_name}")
    except Exception as e:
        # Best effort: continue with the word-frequency fallback below.
        logger.error(f"Failed to load model {sbert_name}: {str(e)}")
        sentence_model = None

    # Initialize vader sentiment analyzer
    logger.info("Initializing VADER sentiment analyzer")
    vader_analyzer = SentimentIntensityAnalyzer()

    processed_data = []
    all_movie_embeddings = []    # one entry per encoded review
    all_box_office_values = []   # box office repeated per encoded review
    movie_avg_embeddings = []    # parallel to processed_data; None when unavailable

    logger.info(f"Starting to process {len(movies_data)} movie records...")

    embedding_dimension = 50  # Number of leading embedding dimensions kept as columns
    max_reviews = 20          # Only the first reviews of each movie are analysed

    for movie in movies_data:
        movie_features = _metadata_features(movie)
        box_office = movie_features['box_office']
        avg_embedding = None

        reviews = movie.get('review_bodies') or []

        if reviews:
            reviews = reviews[:max_reviews]

            if sentence_model is not None:
                try:
                    logger.debug(f"Generating text embeddings for movie {movie_features['movie_id']}")
                    embeddings = np.asarray(sentence_model.encode(reviews))

                    avg_embedding = np.mean(embeddings, axis=0)
                    embedding_std = np.mean(np.std(embeddings, axis=0))
                    embedding_range = np.mean(np.max(embeddings, axis=0) - np.min(embeddings, axis=0))

                    # NOTE(review): multiplies by a function of the target
                    # (see the leakage note in the docstring).
                    scaled_embedding = avg_embedding * np.log1p(box_office)
                except Exception as e:
                    logger.error(f"Failed to generate embeddings: {str(e)}")
                    avg_embedding = None
                    # Set default values if embedding generation fails
                    movie_features['embedding_std'] = 0
                    movie_features['embedding_range'] = 0
                    _zero_embedding_features(movie_features, embedding_dimension)
                else:
                    # Only record training pairs once every statistic succeeded,
                    # so a failed movie never contributes partial data.
                    all_movie_embeddings.extend(embeddings)
                    all_box_office_values.extend([box_office] * len(embeddings))

                    movie_features['embedding_std'] = embedding_std
                    movie_features['embedding_range'] = embedding_range
                    for j, val in enumerate(avg_embedding[:embedding_dimension]):
                        movie_features[f'embed_{j}'] = val
                    for j, val in enumerate(scaled_embedding[:embedding_dimension]):
                        movie_features[f'scaled_embed_{j}'] = val
            else:
                logger.warning("Using fallback embedding method for reviews")
                fallback = _fallback_review_embedding(reviews, embedding_dimension)
                for j, val in enumerate(fallback):
                    movie_features[f'embed_{j}'] = val

                # Set default values for enhanced features
                movie_features['embedding_std'] = 0
                movie_features['embedding_range'] = 0
                _zero_embedding_features(movie_features, embedding_dimension, include_embed=False)

            # VADER sentiment analysis
            compound_scores = []
            try:
                for review in reviews:
                    compound_scores.append(vader_analyzer.polarity_scores(review)['compound'])
            except Exception as e:
                logger.error(f"VADER sentiment analysis failed: {str(e)}")
                compound_scores = [0] * len(reviews)

            # compound_scores always has len(reviews) >= 1 entries here.
            score_count = len(compound_scores)
            movie_features['vader_avg_compound'] = np.mean(compound_scores)
            movie_features['vader_std_compound'] = np.std(compound_scores)
            movie_features['vader_positive_ratio'] = sum(1 for s in compound_scores if s > 0.05) / score_count
            movie_features['vader_negative_ratio'] = sum(1 for s in compound_scores if s < -0.05) / score_count
            movie_features['vader_neutral_ratio'] = sum(1 for s in compound_scores if -0.05 <= s <= 0.05) / score_count
            movie_features['review_count'] = len(reviews)
            movie_features['avg_review_length'] = np.mean([len(r) for r in reviews])
        else:
            # Set default values if no reviews
            movie_features['vader_avg_compound'] = 0
            movie_features['vader_std_compound'] = 0
            movie_features['vader_positive_ratio'] = 0
            movie_features['vader_negative_ratio'] = 0
            movie_features['vader_neutral_ratio'] = 0
            movie_features['review_count'] = 0
            movie_features['avg_review_length'] = 0
            movie_features['embedding_std'] = 0
            movie_features['embedding_range'] = 0
            _zero_embedding_features(movie_features, embedding_dimension)

        processed_data.append(movie_features)
        movie_avg_embeddings.append(avg_embedding)

    # Default for every movie; overwritten below where a prediction is available.
    for movie_features in processed_data:
        movie_features['embedding_predicted_trend'] = 0

    # Learn embedding-box office relationship if we have data
    if all_movie_embeddings and all_box_office_values:
        logger.info("Learning embedding-box office relationship...")
        try:
            relationship_model, box_office_scaler = learn_embedding_box_office_relationship(
                np.array(all_movie_embeddings),
                np.array(all_box_office_values)
            )

            # Query the model with each movie's full-width average embedding,
            # i.e. the same vector space it was trained on. Feeding it only the
            # first `embedding_dimension` values padded with zeros would give
            # it inputs unlike anything seen during fitting.
            rows = [
                idx for idx, emb in enumerate(movie_avg_embeddings)
                if emb is not None and np.any(emb)
            ]
            if rows:
                batch = np.vstack([movie_avg_embeddings[idx] for idx in rows])
                scaled_predictions = np.asarray(relationship_model.predict(batch)).reshape(-1, 1)
                # Reverse the target standardization to get box office units.
                predictions = np.asarray(
                    box_office_scaler.inverse_transform(scaled_predictions)
                ).ravel()
                for idx, value in zip(rows, predictions):
                    processed_data[idx]['embedding_predicted_trend'] = value
        except Exception as e:
            logger.error(f"Failed to learn embedding-box office relationship: {str(e)}")
            # Set default values if relationship learning fails
            for movie_features in processed_data:
                movie_features['embedding_predicted_trend'] = 0

    # Convert to DataFrame
    logger.info("Converting data to DataFrame format")
    df = pd.DataFrame(processed_data)
    return df

def _is_target_derived(column):
    """Return True for feature columns that are computed from ``box_office``."""
    name = str(column)
    return name.startswith('scaled_embed_') or name == 'embedding_predicted_trend'


def prepare_features(df, exclude_target_derived=True):
    """Build the model matrix and log-transformed target from the feature frame.

    Missing values are filled with 0, the target is ``log1p(box_office)``,
    numeric columns are standardized and ``genres`` is one-hot encoded. The
    caller's ``df`` is not modified.

    Args:
        df: Feature DataFrame containing at least ``movie_id``, ``box_office``
            and ``genres`` columns.
        exclude_target_derived: When True (default), columns computed from
            ``box_office`` (``scaled_embed_*`` and
            ``embedding_predicted_trend``) are left out of the model matrix.
            They encode the regression target, so training on them makes every
            evaluation score meaningless. Pass False to reproduce the old
            feature set.

    Returns:
        tuple: ``(X_transformed, y, feature_names, preprocessor)`` where
        ``X_transformed`` is the transformed feature matrix, ``y`` the
        ``log_box_office`` Series, ``feature_names`` the list of output column
        names (numeric first, then one-hot), and ``preprocessor`` the fitted
        ``ColumnTransformer``.
    """
    logger.info("Starting feature engineering")

    # Handle missing values (fillna returns a new frame, so `df` is now a copy)
    df = df.fillna(0)

    # Take Log
    df['log_box_office'] = np.log1p(df['box_office'])

    # fillna(0) can leave integer zeros in the genres column; the encoder needs
    # a single, consistent type.
    df['genres'] = df['genres'].astype(str)

    # Create lists of categorical and numerical features
    categorical_cols = ['genres']
    non_feature_cols = {'movie_id', 'box_office', 'log_box_office', 'genres'}
    numeric_cols = [col for col in df.columns if col not in non_feature_cols]

    if exclude_target_derived:
        leaky_cols = [col for col in numeric_cols if _is_target_derived(col)]
        if leaky_cols:
            logger.info(f"Excluding {len(leaky_cols)} target-derived columns from the feature matrix")
            numeric_cols = [col for col in numeric_cols if not _is_target_derived(col)]

    # Get all feature columns and target variable
    X = df.drop(['movie_id', 'box_office', 'log_box_office'], axis=1)
    y = df['log_box_office']  # Use log-transformed target variable

    # Create feature processing pipeline
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        remainder='drop'  # Drop unspecified columns (including excluded leaky ones)
    )

    # Apply feature processing
    logger.info("Applying feature processing pipeline...")
    X_transformed = preprocessor.fit_transform(X)

    # Output names follow the transformer order: numeric columns, then one-hot columns.
    feature_names = numeric_cols.copy()
    if categorical_cols:
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        feature_names.extend(ohe.get_feature_names_out(categorical_cols))

    logger.info(f"Feature processing complete, transformed feature dimensions: {X_transformed.shape}")

    return X_transformed, y, feature_names, preprocessor

In [None]:
# Train and evaluate
def _save_actual_vs_predicted_plot(actual, predicted, xlabel, ylabel, title, path):
    """Save a predicted-vs-actual scatter plot with a dashed y = x reference line."""
    plt.figure(figsize=(10, 6))
    plt.scatter(actual, predicted, alpha=0.7)
    low, high = actual.min(), actual.max()
    plt.plot([low, high], [low, high], 'k--', lw=2)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _save_feature_importance_plot(feature_names, feature_importance, top_n=10):
    """Save a horizontal bar chart of importances (only the ``top_n`` largest if there are more)."""
    if len(feature_names) > top_n:
        order = np.argsort(feature_importance)[-top_n:]
        title = f'Top {top_n} Important Features for Box Office Prediction'
    else:
        order = np.argsort(feature_importance)
        title = 'Feature Importance for Box Office Prediction'

    positions = range(len(order))
    plt.figure(figsize=(12, 8))
    plt.barh(positions, feature_importance[order])
    plt.yticks(positions, [feature_names[i] for i in order])
    plt.xlabel('Feature Importance')
    plt.title(title)
    plt.tight_layout()
    plt.savefig('feature_importance_rank.png')
    plt.close()


def train_and_evaluate_model(X, y, feature_names, epochs=1):
    """Train, diagnose and tune a random forest on the prepared features.

    Steps: 80/20 train/test split, baseline forest (optionally grown over
    ``epochs`` warm-start rounds), metrics in log and original space, three
    saved plots, 5-fold cross-validation, median-importance feature selection,
    a retrain on the selected features, and a grid search.

    The grid search is fitted on the training split only, so the test split
    returned to the caller has not influenced hyperparameter selection.

    Args:
        X: Transformed feature matrix (rows aligned with ``y``).
        y: Log-transformed box office target (``log1p`` scale; it is inverted
            with ``expm1`` for the original-space metrics).
        feature_names: Names of the columns of ``X``, in order.
        epochs: Number of warm-start rounds for the baseline forest. Values
            of 1 or less train a single 100-tree forest.

    Returns:
        tuple: ``(best_model, selector, X_test_selected, y_test)`` where
        ``best_model`` is the tuned forest fitted on the selected training
        features, ``selector`` the fitted ``SelectFromModel``,
        ``X_test_selected`` the held-out rows restricted to the selected
        features, and ``y_test`` their targets.
    """
    logger.info("Starting model training and evaluation")

    feature_names = list(feature_names)  # .index() is used below
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train model with incremental epochs
    if epochs > 1:
        logger.info(f"Training Random Forest with {epochs} incremental epochs...")

        # At least one tree per round; 100 // epochs would be 0 for epochs > 100.
        trees_per_epoch = max(1, 100 // epochs)
        model = RandomForestRegressor(
            n_estimators=trees_per_epoch,
            max_depth=10,
            min_samples_leaf=3,
            random_state=42,
            warm_start=True  # Enables adding trees without clearing existing ones
        )

        for epoch in range(1, epochs + 1):
            logger.info(f"Random Forest epoch {epoch}/{epochs}...")

            model.n_estimators = trees_per_epoch * epoch
            model.fit(X_train, y_train)

            # Evaluate current epoch
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            logger.info(f"Epoch {epoch} - Training R²: {train_score:.4f}, Test R²: {test_score:.4f}")
    else:
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            min_samples_leaf=3,
            random_state=42
        )
        model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    logger.info(f"Training set R² score: {train_score:.4f}")
    logger.info(f"Test set R² score: {test_score:.4f}")

    y_pred = model.predict(X_test)

    # Metrics in log space
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    logger.info(f"Log - Mean Squared Error (MSE): {mse:.2f}")
    logger.info(f"Log - Root Mean Squared Error (RMSE): {rmse:.2f}")
    logger.info(f"Log - Mean Absolute Error (MAE): {mae:.2f}")

    # Transform back to original value
    y_pred_orig = np.expm1(y_pred)
    y_test_orig = np.expm1(y_test)

    mse_orig = mean_squared_error(y_test_orig, y_pred_orig)
    rmse_orig = np.sqrt(mse_orig)
    mae_orig = mean_absolute_error(y_test_orig, y_pred_orig)

    logger.info(f"Original - Mean Squared Error (MSE): {mse_orig:.2f}")
    logger.info(f"Original - Root Mean Squared Error (RMSE): {rmse_orig:.2f}")
    logger.info(f"Original - Mean Absolute Error (MAE): {mae_orig:.2f}")

    # Visualize prediction results (log space, then original space)
    _save_actual_vs_predicted_plot(
        y_test, y_pred,
        'Actual Log Box Office', 'Predicted Log Box Office',
        'Box Office Prediction: Actual vs Predicted (Log)',
        'prediction_results_log.png',
    )
    _save_actual_vs_predicted_plot(
        y_test_orig, y_pred_orig,
        'Actual Box Office ($)', 'Predicted Box Office ($)',
        'Box Office Prediction: Actual vs Predicted',
        'prediction_results_original.png',
    )

    # Feature importance analysis: if there are more than 10 features, only
    # the 10 most important ones are plotted.
    feature_importance = model.feature_importances_
    _save_feature_importance_plot(feature_names, feature_importance, top_n=10)

    # Cross-validation
    logger.info("Performing cross-validation")
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    logger.info(f"5-fold cross-validation R² score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

    # Feature selection (keeps features whose importance is at least the median)
    logger.info("Performing feature selection")
    selector = SelectFromModel(model, threshold="median", prefit=True)
    support = selector.get_support()
    selected_features = [feature for feature, selected in zip(feature_names, support) if selected]
    logger.info(f"Number of selected features: {len(selected_features)}")
    # Both branches carry the prefix; a conditional expression inside the call
    # would log the bare list for short selections.
    if len(selected_features) > 10:
        logger.info(f"Selected features: {selected_features[:10]}...")
    else:
        logger.info(f"Selected features: {selected_features}")

    # Reuse the existing split: selecting columns of X_train / X_test yields the
    # same rows as re-splitting the selected matrix with the same random_state.
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    selected_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_leaf=3,
        random_state=42
    )
    selected_model.fit(X_train_selected, y_train)

    # Evaluation after feature selection
    train_score_selected = selected_model.score(X_train_selected, y_train)
    test_score_selected = selected_model.score(X_test_selected, y_test)

    logger.info(f"Training set R² score after feature selection: {train_score_selected:.4f}")
    logger.info(f"Test set R² score after feature selection: {test_score_selected:.4f}")

    # Hyperparameter tuning
    logger.info("Performing hyperparameter tuning")
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 3, 5]
    }

    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid=param_grid,
        cv=3,
        scoring='r2',
        n_jobs=-1
    )

    # Tune on the training rows only. Searching over all rows would let the
    # held-out rows pick the hyperparameters they are later scored with.
    grid_search.fit(X_train_selected, y_train)

    logger.info(f"Best parameters: {grid_search.best_params_}")
    logger.info(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    # With the default refit=True, best_estimator_ is already fitted on the
    # full training split using the best parameters.
    best_model = grid_search.best_estimator_

    best_train_score = best_model.score(X_train_selected, y_train)
    best_test_score = best_model.score(X_test_selected, y_test)

    logger.info(f"Best model training set R² score: {best_train_score:.4f}")
    logger.info(f"Best model test set R² score: {best_test_score:.4f}")

    # Sentiment feature importance analysis
    sentiment_features = ['vader_avg_compound', 'vader_std_compound', 'vader_positive_ratio',
                          'vader_negative_ratio', 'vader_neutral_ratio']

    # Check if sentiment features are in selected features
    selected_sentiment_features = [f for f in sentiment_features if f in selected_features]

    if selected_sentiment_features:
        logger.info("\nSelected sentiment feature importance:")
        for feat in selected_sentiment_features:
            idx = feature_names.index(feat)
            logger.info(f"{feat}: {feature_importance[idx]:.4f}")
    else:
        logger.info("\nNo sentiment features were selected")

    # Importance of sentiment features in the baseline random forest
    logger.info("\nSentiment feature importance in original model:")
    for feat in sentiment_features:
        if feat in feature_names:
            idx = feature_names.index(feat)
            logger.info(f"{feat}: {feature_importance[idx]:.4f}")
        else:
            logger.info(f"{feat}: Not in feature list")

    return best_model, selector, X_test_selected, y_test

In [37]:
def main(json_file_path, sbert_epochs=1, rf_epochs=1, model_name=None):
    """Run the whole pipeline: load, featurize, train, and report.

    Args:
        json_file_path: Path of the JSONL file handed to ``load_data``.
        sbert_epochs: Only echoed in a log message by this function.
        rf_epochs: Forwarded to ``train_and_evaluate_model`` as ``epochs``.
        model_name: Forwarded to ``process_data``.

    Returns:
        The tuned model returned by ``train_and_evaluate_model``.

    Side effects:
        Writes the processed features to ``processed_movies.csv``.
    """
    raw_movies = load_data(json_file_path)

    # Feature extraction
    logger.info(f"Processing data and extracting features with SentenceBERT (epochs={sbert_epochs})")
    features_df = process_data(raw_movies, model_name=model_name)

    # Persist the feature table for later inspection
    features_df.to_csv('processed_movies.csv', index=False)
    logger.info("Processed data saved to 'processed_movies.csv'")

    X, y, feature_names, _preprocessor = prepare_features(features_df)

    # Training, diagnostics and tuning
    logger.info(f"Training and evaluating regression model with epochs={rf_epochs} ")
    tuned_model, _selector, X_holdout, y_holdout = train_and_evaluate_model(
        X, y, feature_names, epochs=rf_epochs
    )

    # Final scores on the held-out rows, in log space ...
    holdout_pred = tuned_model.predict(X_holdout)
    r2 = r2_score(y_holdout, holdout_pred)
    rmse = np.sqrt(mean_squared_error(y_holdout, holdout_pred))

    # ... and after undoing the log1p transform
    rmse_orig = np.sqrt(
        mean_squared_error(np.expm1(y_holdout), np.expm1(holdout_pred))
    )

    for message in (
        "\nFinal model performance summary:",
        f"Test set R² score: {r2:.4f}",
        f"Log space RMSE: {rmse:.4f}",
        f"Original box office RMSE: ${rmse_orig:.2f}",
    ):
        logger.info(message)

    return tuned_model

# Run the full pipeline on data/imdb1000.jsonl and keep the returned model.
json_file_path = "data/imdb1000.jsonl"
model = main(
    json_file_path,
    rf_epochs=5,
    sbert_epochs=5,
)



2025-04-20 02:45:26,474 - INFO - Starting to load JSONL data: data/imdb1000.jsonl
2025-04-20 02:45:26,637 - INFO - Data loaded successfully! Loaded 1000 movie records
2025-04-20 02:45:26,638 - INFO - Data loading time: 0.16 seconds
2025-04-20 02:45:26,638 - INFO - Processing data and extracting features with SentenceBERT (epochs=5)
2025-04-20 02:45:26,640 - INFO - Use pytorch device_name: mps
2025-04-20 02:45:26,641 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-04-20 02:45:31,721 - INFO - Successfully loaded SentenceBERT model: all-MiniLM-L6-v2
2025-04-20 02:45:31,721 - INFO - Initializing VADER sentiment analyzer
2025-04-20 02:45:31,728 - INFO - Starting to process 1000 movie records...
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.69it/s]
Batche