In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder


2024-05-15 16:04:20.755449: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 16:04:20.755571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 16:04:20.920373: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def pre_process(df, column_name):
    
    # Get English stopwords
    stop_words = set(stopwords.words('english'))
    
    # Function to remove stopwords from a text
    def remove_stopwords_from_text(text):
        if isinstance(text, str):
            word_tokens = word_tokenize(text)
            filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
            return ' '.join(filtered_text)
        else:
            return text
    
    # Apply the function to the specified column
    df[column_name] = df[column_name].apply(remove_stopwords_from_text)
    
    return df

In [3]:
def split_data(df, test_size=0.2, random_state=None):
    X_train, X_test, Y_train, Y_test = train_test_split(df["cleaned_review"], df["sentiments"], test_size=test_size, random_state=random_state)
    
    # Convert X_train, X_test, Y_train, Y_test to TensorFlow tensors
    X_train = tf.convert_to_tensor([tf.convert_to_tensor(x) for x in X_train])
    X_test = tf.convert_to_tensor([tf.convert_to_tensor(x) for x in X_test])
    Y_train = tf.convert_to_tensor([tf.convert_to_tensor(y) for y in Y_train])
    Y_test = tf.convert_to_tensor([tf.convert_to_tensor(y) for y in Y_test])
    
    
    return X_train, X_test, Y_train, Y_test

In [4]:
def encode(column):
    
    class_to_label = {'positive': 0, 'neutral': 1, 'negative': 2}
    
    # Fit label encoder and transform the sentiment labels
    encoded_labels = column.map(class_to_label)
    
    return encoded_labels

In [5]:
from collections import defaultdict

word_index = defaultdict(lambda: len(word_index))  # Initialize an index dictionary
max_sequence_length = 0

def tokenize_and_index(df, column, max_length=None):
    
    global max_sequence_length
    
    tokenized_column = []

    for entry in df[column].astype(str):
        tokens = nltk.word_tokenize(entry)  # Tokenize each entry
        indexed_tokens = [word_index[token] for token in tokens]  # Convert tokens to indices
        
        if len(indexed_tokens) > max_sequence_length:
            max_sequence_length = len(indexed_tokens)
        
        tokenized_column.append(indexed_tokens)  # Append list of indices
    
    if max_length is None:
        max_length = max_sequence_length

    # Pad each entry to max_length
    for i in range(len(tokenized_column)):
        tokenized_column[i] += [0] * (max_length - len(tokenized_column[i]))

    return tokenized_column, len(word_index), max_length

In [6]:
def preprocess_and_encode(df, text_column, sentiment_column):
    # Preprocess the DataFrame
    df = pre_process(df, text_column)
    
    # Tokenize and index the text column
    df[text_column], vocab_size, max_length = tokenize_and_index(df, text_column)
    
    # Encode the sentiment column
    df[sentiment_column] = encode(df[sentiment_column])
    
    return df

In [7]:
def build_simple_rnn_model(vocab_size, len):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=50, input_length=len),
        Bidirectional(SimpleRNN(16)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')  # 3 classes: neutral, negative, positive
    ])
    return model

In [8]:
def build_lstm_model(vocab_size, max_seq_length):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=50, input_length=max_seq_length),
        Bidirectional(LSTM(16)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')  # 3 classes: neutral, negative, positive
    ])
    
    # Compile the model
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

In [9]:
def train_and_evaluate_model(model, X_train, Y_train, X_test, Y_test):
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, Y_train, epochs=5, batch_size=32, verbose=0)
    _, accuracy = model.evaluate(X_test, Y_test, verbose=0)
    return accuracy

In [18]:
# Load the dataset
df = pd.read_csv("/kaggle/input/amazon-sentiment/cleaned_reviews.csv")
df = preprocess_and_encode(df, "cleaned_review", 'sentiments')

In [19]:
df

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0,...",19,5.0
1,1,"[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...",88,1.0
2,1,"[52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0...",9,2.0
3,1,"[57, 58, 59, 7, 60, 61, 62, 15, 63, 64, 0, 0, ...",12,1.0
4,1,"[65, 66, 67, 68, 48, 69, 70, 32, 0, 0, 0, 0, 0...",21,1.0
...,...,...,...,...
17335,0,"[5, 3463, 5, 386, 896, 32, 982, 101, 1126, 128...",30,5.0
17336,0,"[15, 861, 318, 448, 1470, 1343, 2207, 0, 0, 0,...",13,4.0
17337,0,"[6485, 111, 243, 682, 9, 55, 32, 167, 6, 320, ...",41,5.0
17338,0,"[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,5.0


In [12]:
X_train, X_test, Y_train, Y_test = split_data(df, test_size=0.2, random_state=41)

In [13]:
X_train[0]

<tf.Tensor: shape=(313,), dtype=int32, numpy=
array([1078,  239,  409,  429,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [20]:
def build_model(model_name, vocab_size, max_seq_length):
    if model_name == 'Simple RNN':
        return build_simple_rnn_model(vocab_size, max_seq_length)
    elif model_name == 'LSTM':
        return build_lstm_model(vocab_size, max_seq_length)
    else:
        raise ValueError(f"Unknown model name: {model_name}")

In [29]:
def evaluate_models_with_varied_splits(df, split_ratios, model_names, max_seq_lengths):
    results = []
    models = []
    for split_ratio in split_ratios:
        print(f"Split Ratio: {split_ratio}")
        for max_seq_length in max_seq_lengths:
            print(f"Max Seq Length: {max_seq_length}")
            X_train, X_test, Y_train, Y_test = split_data(df, test_size=split_ratio, random_state=41)
            for model_name in model_names:
                print(f"Building {model_name}...")
                model = build_model(model_name, len(word_index), max_seq_length)
                models.append(model)
                accuracy = train_and_evaluate_model(model, X_train, Y_train, X_test, Y_test)
                print(f"Model: {model_name}, Max Seq Length: {max_seq_length}, Accuracy: {accuracy}")
                results.append({'Model': model_name, 'Split Ratio': split_ratio, 'Max Seq Length': max_seq_length, 'Accuracy': accuracy})
    return pd.DataFrame(results), models

# Example usage
split_ratios = [0.2, 0.4]  # Example split ratios
model_names = ['Simple RNN', 'LSTM']  # Example model names
max_seq_lengths = [50, 100]  # Example max sequence lengths
results_df, models = evaluate_models_with_varied_splits(df, split_ratios, model_names, max_seq_lengths)
print(results_df)

Split Ratio: 0.2
Max Seq Length: 50
Building Simple RNN...
Model: Simple RNN, Max Seq Length: 50, Accuracy: 0.7603806257247925
Building LSTM...
Model: LSTM, Max Seq Length: 50, Accuracy: 0.8641868233680725
Max Seq Length: 100
Building Simple RNN...
Model: Simple RNN, Max Seq Length: 100, Accuracy: 0.871107280254364
Building LSTM...
Model: LSTM, Max Seq Length: 100, Accuracy: 0.8731257319450378
Split Ratio: 0.4
Max Seq Length: 50
Building Simple RNN...
Model: Simple RNN, Max Seq Length: 50, Accuracy: 0.8145905137062073
Building LSTM...
Model: LSTM, Max Seq Length: 50, Accuracy: 0.8496251702308655
Max Seq Length: 100
Building Simple RNN...
Model: Simple RNN, Max Seq Length: 100, Accuracy: 0.839244544506073
Building LSTM...
Model: LSTM, Max Seq Length: 100, Accuracy: 0.8522202968597412
        Model  Split Ratio  Max Seq Length  Accuracy
0  Simple RNN          0.2              50  0.760381
1        LSTM          0.2              50  0.864187
2  Simple RNN          0.2             100  0.8

In [30]:
# BONUS TASK!!!!!

# Define an empty list to store data
data = []

# Prompt the user to enter data for each row
while True:
    cleaned_review = input("Enter cleaned review (or 'exit' to quit): ")
    if cleaned_review.lower() == 'exit':
        break
    sentiment = input("Enter sentiment: ")

    # Append the entered data as a tuple to the list
    data.append((cleaned_review, sentiment))

# Create a DataFrame from the collected data
df2 = pd.DataFrame(data, columns=['cleaned_review', 'sentiments'])

# Display the DataFrame
print(df2)

Enter cleaned review (or 'exit' to quit):  I really liked the product it was great
Enter sentiment:  positive
Enter cleaned review (or 'exit' to quit):  I hated the product it was horrible
Enter sentiment:  negative
Enter cleaned review (or 'exit' to quit):  it was vey mediocre but it works
Enter sentiment:  neutral
Enter cleaned review (or 'exit' to quit):  exit


                            cleaned_review sentiments
0  I really liked the product it was great   positive
1      I hated the product it was horrible   negative
2         it was vey mediocre but it works    neutral


In [31]:
df2 = preprocess_and_encode(df2, "cleaned_review", 'sentiments')
df2

Unnamed: 0,cleaned_review,sentiments
0,"[128, 221, 16, 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[445, 16, 1121, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
2,"[6961, 3132, 277, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",1


In [32]:
X = tf.convert_to_tensor([tf.convert_to_tensor(x) for x in df2["cleaned_review"]])
y = tf.convert_to_tensor([tf.convert_to_tensor(x) for x in df2["sentiments"]])

In [35]:
results= models[3].predict(X)
results
#highest index is the prediction, first index is positive
#second is neutral and third is negative
#so it predicted [positive, negative, neutral] 100% correct!

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


array([[9.9623436e-01, 3.5648716e-03, 2.0088226e-04],
       [7.0451421e-04, 3.1683955e-02, 9.6761161e-01],
       [3.6003876e-03, 9.9358046e-01, 2.8191688e-03]], dtype=float32)