In [22]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Conv1D, GlobalMaxPooling1D, Dense, Bidirectional, Dropout


In [4]:
# Step 1: Load and Inspect the Data
# header=None makes pandas read the first line as data; column labels come from `names`.
file_path = 'rct_data.txt'
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['ID', 'Label', 'Year', 'Title', 'Abstract'],
)

In [5]:
# Load necessary NLTK data
# Tokenizer models, the stop-word lists and the WordNet corpus, fetched in this order.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet')
download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]
# Leave the final download's return value as the cell result.
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
 # preprocess the dataset
# Drop rows with a missing Abstract, mutating `data` in place; preprocess_text
# below calls .lower() on every remaining Abstract value.
data.dropna(subset=['Abstract'], inplace=True)

# Patterns compiled once at definition time instead of being passed as raw strings per call.
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+")
_MENTION_HASHTAG_PATTERN = re.compile(r'\@\w+|\#')
# Lazily filled cache for the NLTK objects shared by every preprocess_text call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared (lemmatizer, stop-word set), building them on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        # frozenset gives O(1) membership tests; the list is read from the corpus only once.
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(comment):
    """Normalise one text for the downstream Keras tokenizer.

    Steps, in order: lowercase, strip URL-like tokens, strip ``@word``
    mentions and ``#`` characters, tokenize with ``word_tokenize``, drop
    English stop words, lemmatize the remaining tokens.

    Args:
        comment: The raw text as a ``str``.

    Returns:
        The surviving lemmatized tokens joined by single spaces
        (an empty string when no token survives).
    """
    # The lemmatizer and stop-word set are loop-invariant: this function is
    # applied once per row, so they are fetched from a cache rather than rebuilt.
    lemmatizer, stop_words = _get_text_resources()

    lowered = comment.lower()
    without_urls = _URL_PATTERN.sub('', lowered)
    cleaned = _MENTION_HASHTAG_PATTERN.sub('', without_urls)

    tokens = word_tokenize(cleaned)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every abstract element-wise, overwriting the raw column.
data['Abstract'] = data['Abstract'].map(preprocess_text)
# Features are the cleaned abstracts; targets are the Label column.
X, y = data['Abstract'], data['Label']

In [7]:
# Split the data
# First hold out 40% of the rows, then halve that hold-out into validation and test.
# Both splits are stratified on the labels and share the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [8]:
# Tokenize and pad sequences for LSTM and GRU
# The vocabulary is fitted on the training texts only; '<OOV>' is the
# out-of-vocabulary token handed to the Tokenizer.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(X_train)

# Word -> integer index mapping learned from the training texts.
word_index = tokenizer.word_index

In [9]:
# Target length passed as `maxlen` when padding the sequences.
max_length = 100
# Both truncation and padding are applied at the end of a sequence.
trunc_type = padding_type = 'post'

In [10]:
# Convert each split's texts to integer sequences with the fitted tokenizer.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, X_test)
)

In [11]:
# Bring every split to a fixed length of max_length using the shared
# padding / truncation settings.
X_train_padded, X_val_padded, X_test_padded = (
    pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)

In [12]:
# Model 1: LSTM
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across runs, matching the seeded data split
# (some GPU kernels may still introduce small nondeterminism).
tf.keras.utils.set_random_seed(42)

model_lstm = Sequential([
    # input_dim matches the num_words=5000 cap given to the Tokenizer.
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    Bidirectional(LSTM(64)),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    Dense(1, activation='sigmoid')
])
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so per-epoch train/validation metrics can be inspected later.
history_lstm = model_lstm.fit(
    X_train_padded, y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val_padded, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7dc219a11ab0>

In [13]:
# Evaluate LSTM
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
lstm_test_probs = model_lstm.predict(X_test_padded)
y_pred_lstm = (lstm_test_probs > 0.5).astype("int32")

lstm_test_accuracy = accuracy_score(y_test, y_pred_lstm)
print("LSTM Test Accuracy:", lstm_test_accuracy)
print("LSTM Test Report:", classification_report(y_test, y_pred_lstm), sep="\n")

LSTM Test Accuracy: 0.9158386908240795
LSTM Test Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      3979
           1       0.80      0.84      0.82      1154

    accuracy                           0.92      5133
   macro avg       0.87      0.89      0.88      5133
weighted avg       0.92      0.92      0.92      5133



In [25]:
# Model 2: GRU
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across runs, matching the seeded data split
# (some GPU kernels may still introduce small nondeterminism).
tf.keras.utils.set_random_seed(42)

model_gru = Sequential([
    # input_dim matches the num_words=5000 cap given to the Tokenizer.
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    Bidirectional(GRU(64)),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    Dense(1, activation='sigmoid')
])
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so per-epoch train/validation metrics can be inspected later.
history_gru = model_gru.fit(
    X_train_padded, y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val_padded, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7dc21615ed10>

In [26]:
# Evaluate GRU
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
gru_test_probs = model_gru.predict(X_test_padded)
y_pred_gru = (gru_test_probs > 0.5).astype("int32")

gru_test_accuracy = accuracy_score(y_test, y_pred_gru)
print("GRU Test Accuracy:", gru_test_accuracy)
print("GRU Test Report:", classification_report(y_test, y_pred_gru), sep="\n")

GRU Test Accuracy: 0.9148646015975064
GRU Test Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      3979
           1       0.82      0.80      0.81      1154

    accuracy                           0.91      5133
   macro avg       0.88      0.87      0.88      5133
weighted avg       0.91      0.91      0.91      5133



In [23]:
# Model 3: CNN
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across runs, matching the seeded data split
# (some GPU kernels may still introduce small nondeterminism).
tf.keras.utils.set_random_seed(42)

model_cnn = Sequential([
    # input_dim matches the num_words=5000 cap given to the Tokenizer.
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    # 128 filters with a kernel spanning 5 consecutive token positions.
    Conv1D(128, 5, activation='relu'),
    # Collapse the sequence axis by keeping each filter's maximum activation.
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    Dense(1, activation='sigmoid')
])
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so per-epoch train/validation metrics can be inspected later.
history_cnn = model_cnn.fit(
    X_train_padded, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_padded, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7dc2062393c0>

In [24]:
# Evaluate CNN
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
cnn_test_probs = model_cnn.predict(X_test_padded)
y_pred_cnn = (cnn_test_probs > 0.5).astype("int32")

cnn_test_accuracy = accuracy_score(y_test, y_pred_cnn)
print("CNN Test Accuracy:", cnn_test_accuracy)
print("CNN Test Report:", classification_report(y_test, y_pred_cnn), sep="\n")

CNN Test Accuracy: 0.9288914864601597
CNN Test Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      3979
           1       0.88      0.80      0.83      1154

    accuracy                           0.93      5133
   macro avg       0.91      0.88      0.89      5133
weighted avg       0.93      0.93      0.93      5133

