In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns]

# Clean and preprocess text data
df['text'] = df['QuestionTitle'] + ' ' + df['QuestionBody']

# Convert to lowercase
df['text'] = df['text'].str.lower()

# Remove punctuation
df['text'] = df['text'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Preprocess the data
df.dropna(inplace=True)  # Drop rows with missing values

# Get unique classes and clean negotiation names
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()  # Clean negotiation names
unique_negotiations = df['Negotiation'].unique()  # Directly get unique classes

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Negotiation'], test_size=0.2, random_state=42)

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['text'])

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

# Define the RNN model
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=max_features, output_dim=512, input_length=max_sequence_length))

# RNN Layer
model.add(SimpleRNN(128, return_sequences=True))

# Global Max Pooling Layer
model.add(GlobalMaxPooling1D())

# Fully Connected Layer
model.add(Dense(64, activation='relu'))

# Output Layer
model.add(Dense(len(unique_negotiations), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_one_hot, epochs=50, batch_size=64, verbose=0)

# Make predictions on the test set
predictions = model.predict(X_test)

# Convert predictions from one-hot vectors to labels
predictions = encoder.inverse_transform(predictions.argmax(axis=1))

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, predictions)
print(f'RNN Accuracy using Feature_Engineering: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:\n', classification_report(y_test, predictions, labels=unique_negotiations, zero_division=1))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 134ms/step
RNN Accuracy using Feature_Engineering: 0.51

Classification Report:
               precision    recall  f1-score   support

  conceptual       0.37      0.46      0.41        80
 theoretical       0.32      0.13      0.18        47
    learning       0.38      0.09      0.14        35
     tooling       0.41      0.66      0.50       115
      errors       0.79      0.72      0.76       174
   api usage       0.26      0.13      0.17        46

    accuracy                           0.51       497
   macro avg       0.42      0.37      0.36       497
weighted avg       0.51      0.51      0.49       497

