In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody','Negotiation']
df = df[selected_columns]

# Clean and preprocess text data
df['text'] = df['QuestionTitle'] + ' ' + df['QuestionBody'] 

# Convert to lowercase
df['text'] = df['text'].str.lower()

# Remove punctuation
df['text'] = df['text'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Preprocess the data
df.dropna(inplace=True)  # Drop rows with missing values

# Get unique classes and clean negotiation names
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()  # Remove extra spaces and convert to lowercase
unique_negotiations = df['Negotiation'].unique()  # Directly get unique classes

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Negotiation'], test_size=0.2, random_state=42)

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['text'])

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

# Define the Bidirectional LSTM model
model = Sequential()

# Embedding layer with dimension size 512 (found from grid search)
model.add(Embedding(input_dim=max_features, output_dim=512, input_length=max_sequence_length))  # Best 'embedding_dim' from grid search

# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(64, return_sequences=True)))

# Pooling Layer
model.add(GlobalMaxPooling1D())

# Fully Connected Layer
model.add(Dense(64, activation='relu'))

# Output Layer
model.add(Dense(len(unique_negotiations), activation='softmax'))

# Compile the model with a learning rate of 0.01 (found from grid search)
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])  # Best 'learning_rate' from grid search

# Train the model
model.fit(X_train, y_train_one_hot, epochs=50, batch_size=64, verbose=0)  # Best 'epochs' and 'batch_size' from grid search

# Make predictions on the test set
predictions = model.predict(X_test)

# Convert predictions from one-hot vectors to labels
predictions = encoder.inverse_transform(predictions.argmax(axis=1))

# Print Grid Search Parameters used in the Bidirectional LSTM model
print("\nParameters used in the Bidirectional LSTM model:")
print(f"- Embedding Dimension: 512")
print(f"- Learning Rate: 0.01")
print(f"- Batch Size: 64")
print(f"- Number of Epochs: 50")
print()

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, predictions)
print(f'LSTM Accuracy using Feature_Engineering: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:\n', classification_report(y_test, predictions, labels=unique_negotiations, zero_division=1))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step

Parameters used in the Bidirectional LSTM model:
- Embedding Dimension: 512
- Learning Rate: 0.01
- Batch Size: 64
- Number of Epochs: 50

LSTM Accuracy using Feature_Engineering: 0.51

Classification Report:
               precision    recall  f1-score   support

  conceptual       0.43      0.26      0.33        80
 theoretical       0.27      0.36      0.31        47
    learning       0.30      0.26      0.28        35
     tooling       0.55      0.32      0.41       115
      errors       0.63      0.91      0.75       174
   api usage       0.35      0.28      0.31        46

    accuracy                           0.51       497
   macro avg       0.42      0.40      0.40       497
weighted avg       0.50      0.51      0.48       497

