In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import re
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Function to get synonyms of a word
def get_synonym(word):
    synonyms = nltk.corpus.wordnet.synsets(word)
    if synonyms:
        return random.choice(synonyms[0].lemma_names())
    else:
        return word

# Function to augment text
def augment_text(text, n=1):
    augmented_texts = []
    words = nltk.word_tokenize(text)

    for _ in range(n):
        augmented_text = words.copy()

        # Random Deletion
        augmented_text = [word for word in augmented_text if random.uniform(0, 1) > 0.2]

        # Random Swap
        for _ in range(2):
            if len(augmented_text) >= 2:
                idx1, idx2 = random.sample(range(len(augmented_text)), 2)
                augmented_text[idx1], augmented_text[idx2] = augmented_text[idx2], augmented_text[idx1]

        # Random Insertion
        for _ in range(2):
            if augmented_text:
                idx = random.randint(0, len(augmented_text) - 1)
                word = get_synonym(augmented_text[idx])
                augmented_text.insert(idx, word)

        augmented_texts.append(' '.join(augmented_text))

    return augmented_texts

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody',  'Negotiation']
df = df[selected_columns]

# Clean and preprocess text data
df['text'] = df['QuestionTitle'] + ' ' + df['QuestionBody'] 

# Convert to lowercase
df['text'] = df['text'].str.lower()

# Remove punctuation
df['text'] = df['text'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Data augmentation using the augment_text function
augmented_texts = []
for text in df['text']:
    augmented_texts.extend(augment_text(text, n=1))  # Augment each text 1 time

# Combine original and augmented texts
df_augmented = pd.DataFrame({'text': df['text'].tolist() + augmented_texts, 'Negotiation': df['Negotiation'].tolist() + [neg for neg in df['Negotiation'].tolist() for _ in range(1)]})

# Preprocess the augmented data
df_augmented.dropna(inplace=True)  # Drop rows with missing values

# Get unique classes and clean negotiation names
df_augmented['Negotiation'] = df_augmented['Negotiation'].str.strip().str.lower()  # Remove extra spaces and convert to lowercase
unique_negotiations_augmented = df_augmented['Negotiation'].unique()  # Directly get unique classes

# Split the augmented data into training and testing sets
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = train_test_split(df_augmented['text'], df_augmented['Negotiation'], test_size=0.2, random_state=42)

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

# Tokenize the text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_augmented['text'])

X_train_augmented = tokenizer.texts_to_sequences(X_train_augmented)
X_test_augmented = tokenizer.texts_to_sequences(X_test_augmented)

# Pad sequences with zeros to make all sequences of the same length
X_train_augmented = pad_sequences(X_train_augmented, maxlen=max_sequence_length)
X_test_augmented = pad_sequences(X_test_augmented, maxlen=max_sequence_length)

# Convert categorical labels to numerical
encoder_augmented = LabelEncoder()
y_train_encoded_augmented = encoder_augmented.fit_transform(y_train_augmented)
y_test_encoded_augmented = encoder_augmented.transform(y_test_augmented)

# Define oversampling strategy for augmented data
oversample_augmented = RandomOverSampler(sampling_strategy='minority')

# Fit and apply the transform for augmented data
X_over_augmented, y_over_augmented = oversample_augmented.fit_resample(X_train_augmented, y_train_encoded_augmented)

# Convert numerical labels to one-hot vectors for augmented data
y_train_one_hot_augmented = to_categorical(y_over_augmented)
y_test_one_hot_augmented = to_categorical(y_test_encoded_augmented)

# Define the CNN model for augmented data
model_augmented = Sequential()

# Embedding layer
model_augmented.add(Embedding(input_dim=max_features, output_dim=512, input_length=max_sequence_length))
#above Use the best embedding_dim from grid search 'embedding_dim': 512,

# Convolutional Layer
model_augmented.add(Conv1D(128, 5, activation='relu'))

# Pooling Layer
model_augmented.add(GlobalMaxPooling1D())

# Fully Connected Layer
model_augmented.add(Dense(64, activation='relu'))

# Output Layer
model_augmented.add(Dense(len(unique_negotiations_augmented), activation='softmax'))

# Compile the model with a smaller learning rate for augmented data
model_augmented.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
# above Use the best parameters from grid search 'learning_rate': 0.01

# Train the model with oversampled augmented data for more epochs
model_augmented.fit(X_over_augmented, y_train_one_hot_augmented, epochs=50, batch_size=64, verbose=0)
# above Use the best batch_size and epochs from grid search 'batch_size': 64, 'epochs': 50

# Make predictions on the augmented test set
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    predictions_augmented = model_augmented.predict(X_test_augmented)

# Convert predictions from one-hot vectors to labels for augmented data
predictions_augmented = encoder_augmented.inverse_transform(predictions_augmented.argmax(axis=1))

# Define grid search parameters I have trained these separately before using them here  
print("\n Parameters used in  model:")
print(f"'embedding_dim': [32, 64, 128, 256, 512],")
print(f"'learning_rate': [0.0001, 0.001, 0.01],")
print(f"'batch_size': [32, 64, 128],")
print(f"'epochs': [50, 100, 150]")
print()

# Display Best Model Parameters # Print Grid Search Parameters used in the  _ model
print('\nBest Model Parameters: {"embedding_dim": 512, "learning_rate": 0.01, "batch_size": 64, "epochs": 50}')

# Evaluate the performance of the classifier with augmented data
accuracy_augmented = accuracy_score(y_test_augmented, predictions_augmented)
print(f'Deep Learning Model Accuracy with Augmented Data using CNN: {accuracy_augmented:.2f}')


# Display classification report for augmented data
print('\nClassification Report for Deep Learning Model with Augmented Data:\n', classification_report(y_test_augmented, predictions_augmented, labels=unique_negotiations_augmented, zero_division=1))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 92ms/step

 Parameters used in  model:
'embedding_dim': [32, 64, 128, 256, 512],
'learning_rate': [0.0001, 0.001, 0.01],
'batch_size': [32, 64, 128],
'epochs': [50, 100, 150]


Best Model Parameters: {"embedding_dim": 512, "learning_rate": 0.01, "batch_size": 64, "epochs": 50}
Deep Learning Model Accuracy with Augmented Data using CNN: 0.67

Classification Report for Deep Learning Model with Augmented Data:
               precision    recall  f1-score   support

  conceptual       0.91      0.45      0.61       260
 theoretical       0.63      0.80      0.71       161
    learning       0.86      0.59      0.70        61
     tooling       0.88      0.50      0.64       233
      errors       0.56      0.97      0.71       336
   api usage       0.81      0.42      0.55        81

    accuracy                           0.67      1132
   macro avg       0.77      0.62      0.65      1132
weighted avg       0.75      0.67   

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import re
import nlpaug.augmenter.word as naw

import tensorflow as tf

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize Word Embeddings Augmenter
aug = naw.WordEmbsAug(
    model_type='glove',  # You can use 'word2vec', 'glove', or 'fasttext'
    model_path='glove/glove.6B.100d.txt',  # Make sure to download and specify the correct path
    action='substitute')  # Substitute words with their closest word embedding

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns]

# Clean and preprocess text data
df['text'] = df['QuestionTitle'] + ' ' + df['QuestionBody']

# Convert to lowercase
df['text'] = df['text'].str.lower()

# Remove punctuation
df['text'] = df['text'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Data augmentation using Word Embeddings
augmented_texts = []
for text in df['text']:
    augmented_texts.append(aug.augment(text))  # Augment each text once

# Combine original and augmented texts
df_augmented = pd.DataFrame({'text': df['text'].tolist() + augmented_texts,
                             'Negotiation': df['Negotiation'].tolist() + df['Negotiation'].tolist()})

# Preprocess the augmented data
df_augmented.dropna(inplace=True)  # Drop rows with missing values

# Get unique classes and clean negotiation names
df_augmented['Negotiation'] = df_augmented['Negotiation'].str.strip().str.lower()  # Remove extra spaces and convert to lowercase
unique_negotiations_augmented = df_augmented['Negotiation'].unique()  # Directly get unique classes

# Split the augmented data into training and testing sets
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = train_test_split(
    df_augmented['text'], df_augmented['Negotiation'], test_size=0.2, random_state=42)

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

# Tokenize the text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_augmented['text'])

X_train_augmented = tokenizer.texts_to_sequences(X_train_augmented)
X_test_augmented = tokenizer.texts_to_sequences(X_test_augmented)

# Pad sequences with zeros to make all sequences of the same length
X_train_augmented = pad_sequences(X_train_augmented, maxlen=max_sequence_length)
X_test_augmented = pad_sequences(X_test_augmented, maxlen=max_sequence_length)

# Convert categorical labels to numerical
encoder_augmented = LabelEncoder()
y_train_encoded_augmented = encoder_augmented.fit_transform(y_train_augmented)
y_test_encoded_augmented = encoder_augmented.transform(y_test_augmented)

# Define oversampling strategy for augmented data
oversample_augmented = RandomOverSampler(sampling_strategy='minority')

# Fit and apply the transform for augmented data
X_over_augmented, y_over_augmented = oversample_augmented.fit_resample(X_train_augmented, y_train_encoded_augmented)

# Convert numerical labels to one-hot vectors for augmented data
y_train_one_hot_augmented = to_categorical(y_over_augmented)
y_test_one_hot_augmented = to_categorical(y_test_encoded_augmented)

# Define the CNN model for augmented data
model_augmented = Sequential()

# Embedding layer
model_augmented.add(Embedding(input_dim=max_features, output_dim=512, input_length=max_sequence_length))

# Convolutional Layer
model_augmented.add(Conv1D(128, 5, activation='relu'))

# Pooling Layer
model_augmented.add(GlobalMaxPooling1D())

# Fully Connected Layer
model_augmented.add(Dense(64, activation='relu'))

# Output Layer
model_augmented.add(Dense(len(unique_negotiations_augmented), activation='softmax'))

# Compile the model with a smaller learning rate for augmented data
model_augmented.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

# Train the model with oversampled augmented data for more epochs
model_augmented.fit(X_over_augmented, y_train_one_hot_augmented, epochs=50, batch_size=64, verbose=0)

# Make predictions on the augmented test set
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    predictions_augmented = model_augmented.predict(X_test_augmented)

# Convert predictions from one-hot vectors to labels for augmented data
predictions_augmented = encoder_augmented.inverse_transform(predictions_augmented.argmax(axis=1))

# Evaluate the performance of the classifier with augmented data
accuracy_augmented = accuracy_score(y_test_augmented, predictions_augmented)
print(f'Deep Learning Model Accuracy with Augmented Data using CNN: {accuracy_augmented:.2f}')

# Display classification report for augmented data
print('\nClassification Report for Deep Learning Model with Augmented Data:\n', classification_report(y_test_augmented, predictions_augmented, labels=unique_negotiations_augmented, zero_division=1))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
