In [1]:
# Install the required tf-keras package

# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import BertTokenizer, TFBertModel
from tf_keras.layers import Dense, LSTM, Input, Dropout, Bidirectional
from tf_keras.models import Model
import logging
import pickle
import tensorflow as tf

# Set up logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s]: %(message)s')
log = logging.info


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Define data paths
base_dir = 'D:\\manav\\Documents\\Engineering\\Masters\\EE8206\\Project\\MovieSuccessPredictor'
processed_data_dir = os.path.join(base_dir, 'data', 'processed')
final_features_path = os.path.join(processed_data_dir, 'final_features.csv')
final_script_features_path = os.path.join(processed_data_dir, 'final_script_features.csv')

# Load the final features dataset
log("Loading final features dataset...")
try:
    final_features = pd.read_csv(final_features_path)
    log(f"Original final features dataset loaded with shape: {final_features.shape}")

    # Reduce the dataset size by taking a subset (e.g., first 10,000 rows)
    final_features = final_features.head(5000)
    log(f"Reduced final features dataset shape: {final_features.shape}")
    log("First 15 rows of the reduced final features dataset:")
    print(final_features.head(15))
except FileNotFoundError:
    log(f"File not found: {final_features_path}")
    raise

# Load the final script features dataset
log("Loading final script features dataset...")
try:
    final_script_features = pd.read_csv(final_script_features_path)
    log(f"Final script features dataset loaded with shape: {final_script_features.shape}")
    log("First 15 rows of the final script features dataset:")
    print(final_script_features.head(15))
except FileNotFoundError:
    log(f"File not found: {final_script_features_path}")
    raise


[2024-08-02 18:17:04,991] [INFO]: Loading final features dataset...
  final_features = pd.read_csv(final_features_path)
[2024-08-02 18:17:07,591] [INFO]: Original final features dataset loaded with shape: (1443182, 50)
[2024-08-02 18:17:07,592] [INFO]: Reduced final features dataset shape: (5000, 50)
[2024-08-02 18:17:07,592] [INFO]: First 15 rows of the reduced final features dataset:
[2024-08-02 18:17:07,604] [INFO]: Loading final script features dataset...
[2024-08-02 18:17:07,628] [INFO]: Final script features dataset loaded with shape: (930, 3)
[2024-08-02 18:17:07,628] [INFO]: First 15 rows of the final script features dataset:


    averageRating  numVotes  runtimeMinutes_normalized  startYear  \
0             5.7      2063                   0.000017     1894.0   
1             5.6       279                   0.000084     1892.0   
2             6.5      2038                   0.000084     1892.0   
3             5.4       180                   0.000202     1892.0   
4             6.2      2798                   0.000017     1893.0   
5             5.0       191                   0.000017     1894.0   
6             5.4       878                   0.000017     1894.0   
7             5.4      2210                   0.000017     1894.0   
8             5.4       212                   0.000757     1894.0   
9             6.8      7633                   0.000017     1895.0   
10            5.2       391                   0.000017     1895.0   
11            7.4     12998                   0.000017     1896.0   
12            5.7      1983                   0.000017     1895.0   
13            7.1      5902       

In [3]:
# Splitting data for IMDb and TMDb features (Random Forest Model)
log("Splitting data into training and testing sets for IMDb and TMDb features...")
X = final_features.drop(columns=['averageRating'])
y = final_features['averageRating']  # Assuming this is the target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
log(f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}")
log(f"Testing set shape: X_test: {X_test.shape}, y_test: {y_test.shape}")


[2024-08-02 18:17:07,635] [INFO]: Splitting data into training and testing sets for IMDb and TMDb features...
[2024-08-02 18:17:07,640] [INFO]: Training set shape: X_train: (3000, 49), y_train: (3000,)
[2024-08-02 18:17:07,641] [INFO]: Testing set shape: X_test: (2000, 49), y_test: (2000,)


In [4]:
# Required imports
from sklearn.metrics import mean_squared_error, r2_score

# ... (previous code for loading data and defining models)

# Hyperparameter Tuning using GridSearchCV
log("Starting hyperparameter tuning with GridSearchCV...")
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initializing GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Fitting the GridSearchCV model
log("Fitting GridSearchCV model...")
grid_search.fit(X_train, y_train)

# Extracting the best parameters
best_params = grid_search.best_params_
log(f"Best parameters found: {best_params}")

# Building the Random Forest Model with best parameters
log("Building the Random Forest Regressor with best parameters...")
rf_model = RandomForestRegressor(**best_params, random_state=42)
rf_model.fit(X_train, y_train)
log("Random Forest model training completed.")

# Evaluate the Random Forest Model
log("Evaluating the Random Forest model...")
y_pred_rf = rf_model.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

# Log evaluation metrics
log(f"Random Forest Model RMSE: {rmse_rf}")
log(f"R^2 Score: {r2_rf}")

# Feature Importance
log("Calculating feature importances...")
importances = rf_model.feature_importances_
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

log("Top 15 features by importance:")
log("\n" + feature_importance.head(15).to_string(index=False))

# Ensure the model directory exists
model_dir = os.path.join(base_dir, 'models', 'randomforest')
os.makedirs(model_dir, exist_ok=True)

# Save the model for future use
model_save_path = os.path.join(model_dir, 'random_forest_model.pkl')
log(f"Saving the Random Forest model to {model_save_path}...")
with open(model_save_path, 'wb') as file:
    pickle.dump(rf_model, file)
log("Model saved successfully.")



[2024-08-02 18:17:07,649] [INFO]: Starting hyperparameter tuning with GridSearchCV...
[2024-08-02 18:17:07,649] [INFO]: Fitting GridSearchCV model...


Fitting 5 folds for each of 162 candidates, totalling 810 fits


[2024-08-02 18:19:15,105] [INFO]: Best parameters found: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
[2024-08-02 18:19:15,106] [INFO]: Building the Random Forest Regressor with best parameters...
[2024-08-02 18:19:16,011] [INFO]: Random Forest model training completed.
[2024-08-02 18:19:16,012] [INFO]: Evaluating the Random Forest model...
[2024-08-02 18:19:16,041] [INFO]: Random Forest Model RMSE: 0.9451587976541747
[2024-08-02 18:19:16,042] [INFO]: R^2 Score: 0.20121054837798746
[2024-08-02 18:19:16,042] [INFO]: Calculating feature importances...
[2024-08-02 18:19:16,052] [INFO]: Top 15 features by importance:
[2024-08-02 18:19:16,053] [INFO]: 
                  feature  importance
                 numVotes    0.366392
                startYear    0.322989
runtimeMinutes_normalized    0.158757
                adventure    0.042325
                   comedy    0.021774
                    drama    0.018012
                  

In [5]:
# Function to correctly parse the tokenized script padded data
def parse_tokenized_script_padded(token_str):
    try:
        # Remove any characters that might cause issues and convert to a list
        token_str = token_str.replace('\n', '').replace('[', ' ').replace(']', ' ').strip()
        token_list = list(map(int, token_str.split()))
        return token_list
    except Exception as e:
        log(f"Error parsing tokenized_script_padded entry: {e}")
        return None  # Return None for problematic entries

# Preparing data for the Neural Network
log("Preparing data for the Neural Network (Script Features)...")
try:
    X_script = np.array(final_script_features['tokenized_script_padded'].apply(parse_tokenized_script_padded).tolist())
    # Removing any None entries that could have occurred during parsing
    X_script = np.array([x for x in X_script if x is not None])
except Exception as e:
    log(f"Exception occurred during conversion: {e}")
    raise

# Check for any issues with the conversion and handle accordingly
if X_script.ndim == 1 or X_script.shape[1] != 512:  # Check for correct dimensions
    log("Error in parsing tokenized_script_padded data. Ensure data is correctly formatted as lists of length 512.")
    raise ValueError("Incorrect data format for tokenized_script_padded.")

# Combining tokenized scripts with additional features
log("Combining tokenized scripts with additional features...")
additional_features = final_script_features[['sentiment', 'readabilityScore']].values
X_script_combined = np.concatenate([X_script, additional_features], axis=1)

log(f"Combined script feature shape: {X_script_combined.shape}")

[2024-08-02 18:19:16,065] [INFO]: Preparing data for the Neural Network (Script Features)...
[2024-08-02 18:19:16,151] [INFO]: Combining tokenized scripts with additional features...
[2024-08-02 18:19:16,153] [INFO]: Combined script feature shape: (930, 514)


In [6]:
# Load BERT tokenizer and model
log("Loading BERT tokenizer and model...")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define a function to safely convert the tokenized script data into input suitable for BERT
def prepare_bert_input(tokenized_data):
    try:
        # Ensure the list is of integers and truncate/pad to the maximum BERT input length of 512
        return tokenized_data[:512] + [0] * (512 - len(tokenized_data)) if len(tokenized_data) < 512 else tokenized_data[:512]
    except Exception as e:
        log(f"Error in preparing BERT input: {e}")
        return [0] * 512  # Return a zero-padded list on error

# Convert the 'tokenized_script_padded' column to a list of lists suitable for BERT
log("Converting tokenized script data to BERT-compatible input...")
X_script_bert_input = final_script_features['tokenized_script_padded'].apply(parse_tokenized_script_padded).apply(prepare_bert_input).tolist()

# Batch processing for extracting BERT embeddings
def batch_process_bert_embeddings(tokenized_scripts, batch_size=8):
    all_embeddings = []
    num_batches = len(tokenized_scripts) // batch_size + (1 if len(tokenized_scripts) % batch_size != 0 else 0)
    
    for i in range(num_batches):
        log(f"Processing batch {i + 1}/{num_batches}...")
        batch = tokenized_scripts[i * batch_size:(i + 1) * batch_size]
        input_ids_batch = tf.convert_to_tensor(batch, dtype=tf.int32)
        batch_embeddings = get_bert_embeddings(input_ids_batch)
        all_embeddings.append(batch_embeddings)
    
    # Concatenate all batches into a single tensor
    return tf.concat(all_embeddings, axis=0)

# Function to extract BERT embeddings for a batch of tokenized scripts
def get_bert_embeddings(input_ids):
    # Pass through BERT model and get the last hidden state
    outputs = bert_model(input_ids)
    return outputs.last_hidden_state

# Extracting BERT embeddings for the tokenized scripts in batches
log("Extracting BERT embeddings for tokenized scripts in batches...")
bert_embeddings = batch_process_bert_embeddings(X_script_bert_input, batch_size=8)
log(f"BERT embeddings shape: {bert_embeddings.shape}")

[2024-08-02 18:19:16,159] [INFO]: Loading BERT tokenizer and model...






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions 

In [12]:
from tensorflow import keras

# Convert BERT embeddings to numpy array and ensure correct shape
log("Converting BERT embeddings to numpy array and ensuring correct shape...")
bert_embeddings_np = bert_embeddings.numpy()

# Reducing the dimensionality of BERT embeddings using mean pooling
log("Reducing the dimensionality of BERT embeddings using mean pooling...")
bert_embeddings_mean = np.mean(bert_embeddings_np, axis=1)  # Shape: (batch_size, embedding_size)

# Concatenate BERT embeddings with additional features
log("Concatenating BERT embeddings with additional features...")
additional_features = final_script_features[['readabilityScore']].values  # Exclude 'sentiment' from features
X_combined = np.concatenate([bert_embeddings_mean, additional_features], axis=1)
log(f"Combined feature shape: {X_combined.shape}")

# Define target variable (sentiment score)
log("Defining target variable...")
y = final_script_features['sentiment'].values

# Splitting the data into training and testing sets
log("Splitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
log(f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}")
log(f"Testing set shape: X_test: {X_test.shape}, y_test: {y_test.shape}")

# Building the Neural Network model
firstLayerNeurons = 128
secondLayerNeurons = 64

log("Building the Neural Network model with a custom structure...")
model = keras.Sequential([
    # First dense layer with BERT embeddings and additional features as input
    keras.layers.Dense(firstLayerNeurons, activation=tf.nn.relu, input_shape=(X_combined.shape[1],)),
    
    # Second dense layer
    keras.layers.Dense(secondLayerNeurons, activation=tf.nn.relu),
    
    # Output layer for regression (predicting sentiment score)
    keras.layers.Dense(1)
])

# Compile the model
log("Compiling the model...")
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_squared_error'])

# Model summary
log("Model summary:")
model.summary()

# Train the model
log("Training the model...")
model.fit(X_train, y_train, epochs=40, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the test set
log("Evaluating the model...")
test_loss, test_mse = model.evaluate(X_test, y_test)
log(f"Test MSE: {test_mse}")

# Save the trained sentiment model
sentiment_model_save_path = 'sentiment_model_v2.h5'
log(f"Saving the trained sentiment model to {sentiment_model_save_path}...")
model.save(sentiment_model_save_path)
log("Sentiment model saved successfully.")


[2024-08-02 18:37:44,394] [INFO]: Converting BERT embeddings to numpy array and ensuring correct shape...
[2024-08-02 18:37:44,741] [INFO]: Reducing the dimensionality of BERT embeddings using mean pooling...
[2024-08-02 18:37:44,841] [INFO]: Concatenating BERT embeddings with additional features...
[2024-08-02 18:37:44,844] [INFO]: Combined feature shape: (930, 769)
[2024-08-02 18:37:44,844] [INFO]: Defining target variable...
[2024-08-02 18:37:44,844] [INFO]: Splitting the data into training and testing sets...
[2024-08-02 18:37:44,847] [INFO]: Training set shape: X_train: (744, 769), y_train: (744,)
[2024-08-02 18:37:44,848] [INFO]: Testing set shape: X_test: (186, 769), y_test: (186,)
[2024-08-02 18:37:44,848] [INFO]: Building the Neural Network model with a custom structure...
[2024-08-02 18:37:44,873] [INFO]: Compiling the model...
[2024-08-02 18:37:44,880] [INFO]: Model summary:


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               98560     
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 106881 (417.50 KB)
Trainable params: 106881 (417.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


[2024-08-02 18:37:44,886] [INFO]: Training the model...


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


[2024-08-02 18:37:47,374] [INFO]: Evaluating the model...




[2024-08-02 18:37:47,424] [INFO]: Test MSE: 0.0021939983125776052
[2024-08-02 18:37:47,425] [INFO]: Saving the trained sentiment model to sentiment_model_v2.h5...
  saving_api.save_model(
[2024-08-02 18:37:47,445] [INFO]: Sentiment model saved successfully.


In [28]:
# Import necessary libraries
from transformers import BertTokenizer
import tensorflow as tf
import numpy as np
import requests
from bs4 import BeautifulSoup
from tensorflow import keras

# Ensure the sentiment model is loaded
log("Loading sentiment analysis model...")
sentiment_model = keras.models.load_model('sentiment_model_v2.h5')

# Load BERT tokenizer (assuming the model is already loaded as 'bert_model')
log("Loading BERT tokenizer...")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prepare_script_for_bert(script):
    """Tokenizes and prepares the script for BERT input."""
    log("Tokenizing the script for BERT...")
    tokenized = bert_tokenizer.encode_plus(
        script,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    return tokenized['input_ids'][0].numpy()

def extract_bert_embeddings(input_ids):
    """Extracts BERT embeddings for the input tokens."""
    log("Extracting BERT embeddings...")
    outputs = bert_model(input_ids)
    return outputs.last_hidden_state

def compute_readability(script):
    """Computes readability score for the given script."""
    # Placeholder function: Replace with your actual readability score calculation
    return 5.0  # Example score

def get_script_from_url(url):
    """Fetches script from the given URL and returns the text."""
    log("Fetching script from URL...")
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        script_text = soup.get_text()
        log("Script fetched successfully.")
        return script_text
    else:
        log(f"Failed to fetch the script. Status code: {response.status_code}")
        return None

def predict_script_sentiment(script):
    """Predicts the sentiment score of a movie script."""
    # Preprocess the script for BERT input
    tokenized_script = prepare_script_for_bert(script)
    input_ids = tf.convert_to_tensor([tokenized_script], dtype=tf.int32)

    # Extract BERT embeddings
    bert_embeddings = extract_bert_embeddings(input_ids)
    bert_embeddings_mean = np.mean(bert_embeddings.numpy(), axis=1)

    # Prepare additional features
    readability_score = compute_readability(script)
    additional_features = np.array([[readability_score]])

    # Combine BERT embeddings and additional features
    X_input = np.concatenate([bert_embeddings_mean, additional_features], axis=1)

    # Predict sentiment
    log("Predicting sentiment score...")
    predicted_sentiment = sentiment_model.predict(X_input)
    return predicted_sentiment

# Fetch the script from the given URL and predict its sentiment
script_url = "https://imsdb.com/scripts/Tenet.html"
script_text = get_script_from_url(script_url)
if script_text:
    predicted_sentiment = predict_script_sentiment(script_text)
    log(f"Predicted Sentiment Score for the script: {predicted_sentiment}")


[2024-08-02 19:09:56,175] [INFO]: Loading sentiment analysis model...
[2024-08-02 19:09:56,225] [INFO]: Loading BERT tokenizer...
[2024-08-02 19:09:56,419] [INFO]: Fetching script from URL...
[2024-08-02 19:09:56,567] [INFO]: Script fetched successfully.
[2024-08-02 19:09:56,568] [INFO]: 






















The Internet Movie Script Database (IMSDb)




 

The web's largest movie script resource!















Search IMSDb

 








Alphabetical

#
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z




Genre

Action
Adventure
Animation
Comedy
Crime
Drama
Family
Fantasy
Film-Noir
Horror
Musical
Mystery
Romance
Sci-Fi
Short
Thriller
War
Western




Sponsor








TV Transcripts

Futurama
Seinfeld
South Park
Stargate SG-1
Lost
The 4400




International

French scripts




Latest Comments




ALL SCRIPTS









                           TENET



                         Written by

                      Christopher Nolan



ORCHESTRA TUNING, audience settling. High officials in




[2024-08-02 19:09:57,545] [INFO]: Predicted Sentiment Score for the script: [[0.10024708]]


In [26]:
# Required imports
import numpy as np
import pandas as pd
import pickle
import requests
from bs4 import BeautifulSoup

# Load the Random Forest model
log("Loading the Random Forest model...")
model_path = 'D:/manav/Documents/Engineering/Masters/EE8206/Project/MovieSuccessPredictor/models/randomforest/random_forest_model.pkl'
try:
    with open(model_path, 'rb') as file:
        random_forest_model = pickle.load(file)
except PermissionError:
    log("Permission denied: unable to access the model file.")
except FileNotFoundError:
    log(f"File not found: {model_path}")
except Exception as e:
    log(f"An error occurred while loading the model: {e}")

def prepare_features_for_rf(features):
    """Prepares features for the Random Forest model."""
    return np.array(features)

def predict_movie_rating(features):
    """Predicts the movie success rating using the Random Forest model."""
    prepared_features = prepare_features_for_rf(features)
    log("Predicting movie success rating...")
    predicted_rating = random_forest_model.predict([prepared_features])
    return predicted_rating

def get_script_from_url(url):
    """Fetches script from the given URL and returns the text."""
    log("Fetching script from URL...")
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        script_text = soup.get_text()
        log("Script fetched successfully.")
        return script_text
    else:
        log(f"Failed to fetch the script. Status code: {response.status_code}")
        return None

# Hardcoded sample features from final_features.csv (excluding the target 'averageRating')
sample_features = [
    2063,  # numVotes
    1.6818028927009755e-05,  # runtimeMinutes_normalized
    1894.0,  # startYear
    False,  # titleType_movie
    True,  # titleType_short
    False,  # titleType_tvEpisode
    False,  # titleType_tvMiniSeries
    False,  # titleType_tvMovie
    False,  # titleType_tvSeries
    False,  # titleType_tvShort
    False,  # titleType_tvSpecial
    False,  # titleType_video
    False,  # titleType_videoGame
    0,  # action
    0,  # adult
    0,  # adventure
    0,  # animation
    0,  # biography
    0,  # comedy
    0,  # crime
    1,  # documentary
    0,  # drama
    0,  # family
    0,  # fantasy
    0,  # film-noir
    0,  # game-show
    0,  # history
    0,  # horror
    0,  # music
    0,  # musical
    0,  # mystery
    0,  # news
    0,  # reality-tv
    0,  # romance
    0,  # sci-fi
    0,  # short
    0,  # sport
    0,  # talk-show
    0,  # thriller
    0,  # war
    0,  # western
    True,  # season_Winter
    0.0,  # directorPopularity
    0.0,  # wordCount
    0.0,  # sentiment
    0.0,  # readabilityScore
    False,  # comedy_indicator
    False,  # horror_indicator
    False  # action_indicator
]

# Ensure the feature count matches the model's expectation
expected_feature_count = 49  # Change this to match your model's expected input
if len(sample_features) != expected_feature_count:
    log(f"Feature count mismatch. Expected {expected_feature_count}, got {len(sample_features)}.")
    raise ValueError("Feature count mismatch")

# Predict the rating using the Random Forest model
predicted_rating = predict_movie_rating(sample_features)
log(f"Predicted Movie Rating: {predicted_rating}")

# Fetch the script from the given URL
script_url = "https://imsdb.com/scripts/Tenet.html"
script_text = get_script_from_url(script_url)


[2024-08-02 19:02:03,135] [INFO]: Loading the Random Forest model...
[2024-08-02 19:02:03,146] [INFO]: Predicting movie success rating...
[2024-08-02 19:02:03,154] [INFO]: Predicted Movie Rating: [6.11832094]
[2024-08-02 19:02:03,155] [INFO]: Fetching script from URL...
[2024-08-02 19:02:03,429] [INFO]: Script fetched successfully.


In [None]:
# Save the sentiment model and Random Forest model after training
log("Saving the models...")
sentiment_model.save('sentiment_model.h5')
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(random_forest_model, f)
log("Models saved successfully.")

# Load the models for prediction
log("Loading the models...")
sentiment_model = keras.models.load_model('sentiment_model.h5')
with open('random_forest_model.pkl', 'rb') as f:
    random_forest_model = pickle.load(f)
log("Models loaded successfully.")
