In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns]

# Preprocess the data
df.dropna(inplace=True)  # Drop rows with missing values
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()  # Clean negotiation names

# Split the data into features and target
X = df['QuestionTitle'] + ' ' + df['QuestionBody']
y = df['Negotiation']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

# Define the MLP model
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=max_sequence_length))

# Flatten layer
model.add(Flatten())

# Dense layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output Layer
model.add(Dense(len(encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_one_hot, epochs=50, batch_size=64, verbose=0)

# Make predictions on the test set
predictions = model.predict(X_test)

# Convert predictions from one-hot vectors to labels
predictions = encoder.inverse_transform(predictions.argmax(axis=1))

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, predictions)
print(f'MLP Accuracy using Train-Test_Split: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:\n', classification_report(y_test, predictions, zero_division=1))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 79ms/step
MLP Accuracy using Train-Test_Split: 0.49

Classification Report:
               precision    recall  f1-score   support

   api usage       0.19      0.11      0.14        46
  conceptual       0.39      0.46      0.43        80
      errors       0.69      0.83      0.75       174
    learning       0.33      0.06      0.10        35
 theoretical       0.35      0.17      0.23        47
     tooling       0.36      0.43      0.40       115

    accuracy                           0.49       497
   macro avg       0.39      0.34      0.34       497
weighted avg       0.46      0.49      0.46       497

