In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Load data
def preprocess_data_from_file(filepath):
    with open(filepath, 'r') as file:
        data = file.read()
    lines = data.strip().split('\n')
    labels = []
    texts = []
    for line in lines:
        label, text = line.split(' ', 1)
        label = int(label.split('__label__')[1])
        labels.append(label)
        texts.append(text)
    return pd.DataFrame({'label': labels, 'text': texts})

# Function to preprocess text
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[\W_]+', ' ', text)
    # Tokenize and remove stop words
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
train_filepath = 'train.3270.txt'
df_train = preprocess_data_from_file(train_filepath)
df_train['cleaned_text'] = df_train['text'].apply(preprocess_text)

# Split data into features and labels
X = df_train['cleaned_text']
y = df_train['label']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HUYNGUYEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HUYNGUYEN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Step 3: Initialize and fit the TfidfVectorizer with stop_words and max_features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(X)

# Convert the TF-IDF matrix to a dense matrix
dense_tfidf = tfidf_matrix.todense()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dense_tfidf, y, test_size=0.2, random_state=42)

# Step 5: Build the Bidirectional LSTM model
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))  # Input layer
model.add(Dense(128, activation='relu'))  # Dense layer as an alternative to embedding
model.add(Dense(64, activation='relu'))  # Additional Dense layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Step 6: Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.5229358077049255


In [17]:
type(tfidf_matrix.toarray()), type(tfidf_matrix.todense())

(numpy.ndarray, numpy.matrix)

In [6]:
dense_tfidf.shape

(3270, 5000)

In [9]:
dense_tfidf[0]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [5]:
X_train.shape, X_train.shape[1]

((2616, 5000), 5000)