In [16]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import spacy
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding, Dropout
from tensorflow.keras.utils import to_categorical

In [17]:
# One-time setup: download and install spaCy's large English pipeline
# (the installer output below reports a ~588 MB wheel).
# As that output warns, the runtime may need a restart before
# spacy.load('en_core_web_lg') can find the freshly installed package.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [18]:
# Load the IMDB review dataset from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Datasets/IMDB Dataset.csv")

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; passing it to print() emitted a stray "None" line.
df.info()
print(df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None 
                                                    review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:
# Check how many reviews fall into each sentiment class.
sentiment_labels = df['sentiment']
sentiment_labels.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [20]:
# Encode the label as an integer flag: 1 where sentiment is 'positive', else 0.
df['positive'] = df['sentiment'].eq('positive').astype('int64')
df.head()

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [21]:
# Load the large English pipeline used below for lemmas, stop-word/punctuation
# flags and word vectors. The cells that use `nlp` never read named entities,
# so the NER component is disabled to avoid running it on every review.
nlp = spacy.load("en_core_web_lg", disable=["ner"])

def preprocess(text):
  """Clean one raw review and return its lemmatised content words.

  HTML tags (the reviews contain ``<br />`` line breaks) are replaced with a
  space, the text is run through the module-level spaCy pipeline ``nlp``, and
  the lemmas of every token that is not punctuation, whitespace or a stop word
  are joined with single spaces.

  Args:
    text: Raw review text, possibly containing HTML tags.

  Returns:
    A string of lemmas separated by single spaces (empty if nothing survives
    the filtering).
  """
  # Substitute a space, not an empty string: "end.<br /><br />Next" would
  # otherwise collapse into "end.Next" and be tokenised as one unit.
  text = re.sub(r'<.*?>', ' ', text)
  doc = nlp(text)
  # Whitespace tokens (e.g. the gap left where tags were removed) are dropped
  # too, so their lemmas do not leak extra blanks into the joined output.
  return " ".join(
      token.lemma_
      for token in doc
      if not (token.is_punct or token.is_stop or token.is_space)
  )

In [None]:
# Build the cleaned-text column by passing every review through preprocess().
# NOTE: this invokes the spaCy pipeline once per row, so it is slow on the
# full dataset.
df['text'] = df['review'].map(preprocess)
df.head()

KeyboardInterrupt: 

In [None]:
# With a vectors-bearing pipeline such as en_core_web_lg, Doc.vector is the
# average of the tokens' static word vectors, which only requires tokenisation.
# nlp.make_doc() tokenises without running the remaining pipeline components,
# so it yields the same vector as nlp(x).vector at a fraction of the cost.
df['vector'] = df['text'].apply(lambda x: nlp.make_doc(x).vector)
df.head()

In [None]:
# Assemble the feature matrix (one document vector per row) and the 0/1 labels.
data = df
y = data["positive"].to_numpy()
X = np.asarray(list(data["vector"]))

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Insert a length-1 axis at position 1 so every sample becomes a one-step
# sequence, matching the (shape[1], shape[2]) input the recurrent layers read.
X_train = X_train[:, np.newaxis]
X_test = X_test[:, np.newaxis]

def build_and_evaluate_model(model_type="rnn", epochs=5, batch_size=32):
    """Train a single-layer recurrent classifier and score it on the test split.

    The network is one recurrent layer (128 units) followed by Dropout(0.5) and
    a single sigmoid unit, compiled with Adam and binary cross-entropy. It is
    fitted on the module-level ``X_train``/``y_train`` and evaluated on the
    module-level ``X_test``/``y_test``.

    Args:
        model_type: ``"rnn"`` to use a ``SimpleRNN`` layer, ``"lstm"`` to use
            an ``LSTM`` layer.
        epochs: Number of training epochs passed to ``model.fit``.
        batch_size: Mini-batch size passed to ``model.fit``.

    Returns:
        A tuple ``(f1, roc_auc)``: the F1 score of class ``1`` after
        thresholding predictions at 0.5, and the ROC-AUC computed from the raw
        predicted probabilities.

    Raises:
        ValueError: If ``model_type`` is neither ``"rnn"`` nor ``"lstm"``.
    """
    # Validate up front: an unrecognised type used to fall through both
    # branches and silently train a model with no recurrent layer at all.
    if model_type not in ("rnn", "lstm"):
        raise ValueError(
            "model_type must be 'rnn' or 'lstm', got {!r}".format(model_type)
        )
    recurrent_layer = SimpleRNN if model_type == "rnn" else LSTM

    model = Sequential()
    model.add(
        recurrent_layer(
            128,
            input_shape=(X_train.shape[1], X_train.shape[2]),
            return_sequences=False,
        )
    )
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    # predict() returns one probability per sample in a column; flatten to 1-D
    # so it lines up with y_test for the metric functions.
    y_pred = model.predict(X_test).flatten()
    y_pred_class = (y_pred > 0.5).astype(int)

    # With output_dict=True the report is keyed by the label's string form.
    f1 = classification_report(y_test, y_pred_class, output_dict=True)['1']['f1-score']
    roc_auc = roc_auc_score(y_test, y_pred)

    return f1, roc_auc

# Train and score each architecture in turn.
rnn_f1, rnn_roc_auc = build_and_evaluate_model("rnn")
lstm_f1, lstm_roc_auc = build_and_evaluate_model("lstm")

# Report both metrics to four decimal places, one line per model.
rnn_summary = "RNN - F1 Score: {:.4f}, ROC-AUC: {:.4f}".format(rnn_f1, rnn_roc_auc)
lstm_summary = "LSTM - F1 Score: {:.4f}, ROC-AUC: {:.4f}".format(lstm_f1, lstm_roc_auc)
print(rnn_summary)
print(lstm_summary)
