Training LSTM model provided for the exam.

In [6]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import re

from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('Data.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

X = pad_sequences(X)

embed_dim = 128
lstm_out = 196
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model
# print(model.summary())

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2)
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)
print(model.metrics_names)

model.save('./sentiment_model' + '.h5')
# Also, you'll need to save your tokenizer, as it's crucial for preprocessing new text.
# The easiest way is using pickle.
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Tokenizer saved as tokenizer.pickle")

# (After labelencoder has been fitted)
import numpy as np
np.save('label_encoder_classes.npy', labelencoder.classes_)
print(f"Label encoder classes saved as label_encoder_classes.npy: {labelencoder.classes_}")

  row[0] = row[0].replace('rt', ' ')
  row[0] = row[0].replace('rt', ' ')


291/291 - 31s - 107ms/step - accuracy: 0.6408 - loss: 0.8318
144/144 - 3s - 22ms/step - accuracy: 0.6774 - loss: 0.7735




0.7734556198120117
0.677370011806488
['loss', 'compile_metrics']
Tokenizer saved as tokenizer.pickle
Label encoder classes saved as label_encoder_classes.npy: ['Negative' 'Neutral' 'Positive']


**Question 1**: Execute and save the given model and use the saved model to predict
on new text data (ex, “A lot of good things are happening. We are respected again throughout the world, and that's a great thing .@realDonaldTrump”)

In [2]:
import re
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Configuration & File Paths ---
MODEL_PATH = 'sentiment_model.h5'
TOKENIZER_PATH = 'tokenizer.pickle'
LABEL_CLASSES_PATH = 'label_encoder_classes.npy'

# --- Load Saved Artifacts ---
try:
    # Load the trained model
    loaded_model = load_model(MODEL_PATH)
    print(f"Model loaded successfully from {MODEL_PATH}")

    # Load the tokenizer
    with open(TOKENIZER_PATH, 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)
    print(f"Tokenizer loaded successfully from {TOKENIZER_PATH}")

    # Load the label encoder classes
    label_classes = np.load(LABEL_CLASSES_PATH, allow_pickle=True)
    print(f"Label encoder classes loaded successfully from {LABEL_CLASSES_PATH}: {label_classes}")

except FileNotFoundError as e:
    print(f"Error: Could not find a required file. Make sure '{MODEL_PATH}', '{TOKENIZER_PATH}', and '{LABEL_CLASSES_PATH}' exist in the current directory.")
    print(f"Details: {e}")
    exit()
except Exception as e:
    print(f"An error occurred during loading: {e}")
    exit()


# --- Determine Input Sequence Length ---
# This should match the input_length used during training (X.shape[1] in your training code)
# We can often infer this from the model's first layer (Embedding layer)
try:
    input_seq_length = loaded_model.input_shape[1]
    if input_seq_length is None: # Should not happen for this model structure but good to check
        raise ValueError("Could not determine input_seq_length from model's input shape.")
    print(f"Inferred input sequence length from model: {input_seq_length}")
except Exception as e:
    print(f"Could not automatically determine input_seq_length from model: {e}")
    # Fallback: If you know the exact length from your training script's X.shape[1]
    # (the value of X.shape[1] AFTER `X = pad_sequences(X)` in training)
    # you can hardcode it here. For example:
    # input_seq_length = 200 # Replace 200 with your actual value if needed
    print("Please ensure 'input_seq_length' is set correctly if auto-detection failed.")
    # For your specific training code, `X = pad_sequences(X)` means the length is determined
    # by the longest sequence. The Embedding layer was `Embedding(..., input_length=X.shape[1])`.
    # So, `loaded_model.input_shape[1]` is the correct way.

# --- Preprocessing Function for New Text ---
def preprocess_text_for_prediction(text_input, tokenizer, max_len):
    """
    Preprocesses a single raw text string for prediction.
    Mirrors the preprocessing steps from the training script.
    """
    # 1. Lowercase
    processed_text = text_input.lower()
    # 2. Remove special characters (same regex as training)
    processed_text = re.sub(r'[^a-zA-z0-9\s]', '', processed_text)
    # 3. Replace 'rt' (same as training)
    processed_text = processed_text.replace('rt', ' ') # remove "rt "
    processed_text = processed_text.strip() # Clean up extra spaces

    # Tokenize the text
    sequence = tokenizer.texts_to_sequences([processed_text]) # Note: texts_to_sequences expects a list

    # Pad the sequence
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    return padded_sequence

# --- Prediction Function ---
def predict_sentiment(raw_text):
    """
    Predicts sentiment for a given raw text string.
    """
    if not all([loaded_model, loaded_tokenizer, label_classes is not None, input_seq_length is not None]):
        print("Error: Model, tokenizer, label classes, or input_seq_length not initialized.")
        return None, None

    # Preprocess the input text
    preprocessed_input = preprocess_text_for_prediction(raw_text, loaded_tokenizer, input_seq_length)

    # Make prediction
    prediction_probabilities = loaded_model.predict(preprocessed_input)

    # Get the class with the highest probability
    predicted_class_index = np.argmax(prediction_probabilities, axis=1)[0] # Get the single index

    # Map index to sentiment label
    predicted_sentiment_label = label_classes[predicted_class_index]

    return predicted_sentiment_label, prediction_probabilities[0]


# --- Example Usage ---
if __name__ == "__main__":
    new_text_example = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing .@realDonaldTrump"
    texts_to_test = [new_text_example]

    for text in texts_to_test:
        print(f"\nOriginal Text: \"{text}\"")
        sentiment, probabilities = predict_sentiment(text)
        if sentiment and probabilities is not None:
            print(f"Predicted Sentiment: {sentiment}")
            print(f"Prediction Probabilities: ")
            for i, prob in enumerate(probabilities):
                print(f"  - {label_classes[i]}: {prob:.4f}")
        else:
            print("Prediction failed.")

    # Example of predicting a single string directly
    # text_to_predict_single = "I love this product, it's amazing!"
    # sentiment, _ = predict_sentiment(text_to_predict_single)
    # print(f"\nPrediction for '{text_to_predict_single}': {sentiment}")





Model loaded successfully from sentiment_model.h5
Tokenizer loaded successfully from tokenizer.pickle
Label encoder classes loaded successfully from label_encoder_classes.npy: ['Negative' 'Neutral' 'Positive']
Inferred input sequence length from model: 28

Original Text: "A lot of good things are happening. We are respected again throughout the world, and that's a great thing .@realDonaldTrump"
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Sentiment: Negative
Prediction Probabilities: 
  - Negative: 0.6006
  - Neutral: 0.1928
  - Positive: 0.2067


In [9]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# TensorFlow / Keras imports (ensure these are from tensorflow.keras)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout # Added Dropout for flexibility
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam # To control learning rate

# KerasClassifier wrapper
# NOTE: tensorflow.keras.wrappers.scikit_learn is for older TensorFlow versions.
# For TensorFlow 2.7+, it's recommended to use scikeras:
# 1. Install: pip install scikeras
# 2. Import: from scikeras.wrappers import KerasClassifier
# However, I will use the import you provided in your example:
from scikeras.wrappers import KerasClassifier

In [15]:
!pip install -U scikit-learn scikeras tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (644.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m118.2 MB

In [9]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.5.2

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m112.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
Successfully installed scikit-learn-1.5.2


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import numpy as np

# TensorFlow / Keras imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D # Added SpatialDropout1D if you want to tune it
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input

# SciKeras wrapper (ensure you have it installed: pip install scikeras)
from scikeras.wrappers import KerasClassifier

# --- 1. Load and Prepare Your Data (Same as your original script) ---
try:
    data = pd.read_csv('Data.csv')
except FileNotFoundError:
    print("Error: 'Data.csv' not found. Please make sure the file exists.")
    exit()

data = data[['text','sentiment']]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

max_features = 2000 # Vocabulary size from your original code
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X_seq = tokenizer.texts_to_sequences(data['text'].values)
X_pad = pad_sequences(X_seq) # Padding to the length of the longest sequence

# Store the input_length for the Embedding layer
input_seq_length = X_pad.shape[1] # This is X.shape[1] from your original code after padding
print(f"Input sequence length (max_len for padding): {input_seq_length}")

# Prepare labels
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y_cat = to_categorical(integer_encoded)
num_classes = y_cat.shape[1] # Number of unique sentiment classes (should be 3 based on your original model)
print(f"Number of classes: {num_classes}")
print(f"Label encoder classes: {labelencoder.classes_}")


# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X_pad, y_cat, test_size=0.33, random_state=42)
print(f"X_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")


# --- 2. Create a `build_model` Function for GridSearchCV (LSTM Model) ---
def build_lstm_model_for_grid(
    embedding_dim=128,
    lstm_units=196,
    lstm_dropout=0.2,
    lstm_recurrent_dropout=0.2,
    learning_rate=0.001
):
    model = Sequential([
        Input(shape=(input_seq_length,)),             # ← ensures model.outputs exists
        Embedding(input_dim=max_features,
                  output_dim=embedding_dim,
                  input_length=input_seq_length),
        LSTM(lstm_units,
             dropout=lstm_dropout,
             recurrent_dropout=lstm_recurrent_dropout),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# ————————————————
# 3. Wrap for GridSearch
# ————————————————
model_for_grid = KerasClassifier(
    model=build_lstm_model_for_grid,
    verbose=0,
)

param_grid = {
    'batch_size': [32, 64],
    'epochs':     [5, 7],
    'model__embedding_dim': [64, 128],
}
print("\nParameter grid for GridSearchCV (LSTM Model):")
for key, value in param_grid.items():
    print(f"- {key}: {value}")

# --- 4. Perform GridSearchCV ---
grid = GridSearchCV(
    estimator=model_for_grid,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

print("\nStarting GridSearchCV with LSTM model... This may take a while.")
try:
    grid_result = grid.fit(X_train, Y_train)

    # --- 5. Display Results and Evaluate the Best Model ---
    print(f"\nBest Score (Cross-Validation Accuracy): {grid_result.best_score_:.4f}")
    print(f"Best Hyperparameters: {grid_result.best_params_}")

    print("\nAll results from GridSearchCV:")
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params_list = grid_result.cv_results_['params']
    for mean, stdev, param_combination in zip(means, stds, params_list):
        print(f"Mean CV Accuracy: {mean:.4f} (Std: {stdev:.4f}) with: {param_combination}")

    best_model_wrapper = grid_result.best_estimator_

    if hasattr(best_model_wrapper, 'model_'):
        best_keras_model_actual = best_model_wrapper.model_
        loss, accuracy = best_keras_model_actual.evaluate(X_test, Y_test, verbose=0)
        print(f"\nTest Accuracy of the Best LSTM Model on X_test, Y_test: {accuracy:.4f}")
        print(f"Test Loss of the Best LSTM Model on X_test, Y_test: {loss:.4f}")
    else:
        print("\nEvaluating best LSTM model on test set using the wrapper's score method:")
        test_accuracy = best_model_wrapper.score(X_test, Y_test)
        print(f"Test Accuracy from best_estimator_.score(): {test_accuracy:.4f}")

except Exception as e:
    print(f"An error occurred during GridSearchCV fitting or evaluation: {e}")
    import traceback
    traceback.print_exc()



  row[0] = row[0].replace('rt', ' ')
  row[0] = row[0].replace('rt', ' ')


Input sequence length (max_len for padding): 28
Number of classes: 3
Label encoder classes: ['Negative' 'Neutral' 'Positive']
X_train shape: (9293, 28), Y_train shape: (9293, 3)
X_test shape: (4578, 28), Y_test shape: (4578, 3)

Parameter grid for GridSearchCV (LSTM Model):
- batch_size: [32, 64]
- epochs: [5, 7]
- model__embedding_dim: [64, 128]

Starting GridSearchCV with LSTM model... This may take a while.
Fitting 2 folds for each of 8 candidates, totalling 16 fits
