In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import tensorflow as tf

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns]

# Preprocess the data
df.dropna(inplace=True)  # Drop rows with missing values
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()  # Clean negotiation names

# Split the data into features and target
X = df['QuestionTitle'] + ' ' + df['QuestionBody'] 
y = df['Negotiation']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Convert categorical labels to numerical
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

# Convert numerical labels to one-hot vectors
y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

# Define the deep learning model (Feedforward Neural Network)
model = Sequential()
model.add(Dense(512, input_dim=X_train_tfidf.shape[1], activation='relu'))  
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy']) 

# Train the model
model.fit(X_train_tfidf.toarray(), y_train_one_hot, epochs=50, batch_size=64, verbose=0) 

# Make predictions on the test set
predictions = model.predict(X_test_tfidf.toarray())

# Convert predictions from one-hot vectors to labels
predictions = encoder.inverse_transform(predictions.argmax(axis=1))


# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, predictions)
print(f'FNN Accuracy using Train-Test_Split: {accuracy:.2f}')

# Display classification report
print('\nClassification Report:\n', classification_report(y_test, predictions, zero_division=1))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
FNN Accuracy using Train-Test_Split: 0.53

Classification Report:
               precision    recall  f1-score   support

   api usage       0.20      0.11      0.14        46
  conceptual       0.40      0.39      0.39        80
      errors       0.79      0.76      0.78       174
    learning       0.40      0.34      0.37        35
 theoretical       0.39      0.38      0.39        47
     tooling       0.44      0.57      0.50       115

    accuracy                           0.53       497
   macro avg       0.44      0.43      0.43       497
weighted avg       0.53      0.53      0.53       497

