In [None]:
!pip install textattack


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import re
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from textattack.augmentation import EmbeddingAugmenter

# Download the NLTK resources this cell relies on:
#   stopwords          -> stopword filtering
#   wordnet            -> WordNetLemmatizer and get_synonym()
#   punkt / punkt_tab  -> nltk.word_tokenize() inside augment_text()
# Recent NLTK releases look up 'punkt_tab' instead of 'punkt' for
# word_tokenize, so both are requested. Requesting a package an older NLTK
# index does not know is expected to only print an error, not raise.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def get_synonym(word):
    """Pick a random lemma name from the first WordNet synset of ``word``.

    Args:
        word: token to look up in ``nltk.corpus.wordnet``.

    Returns:
        A lemma name chosen with ``random.choice`` from the first synset,
        or ``word`` itself when WordNet returns no synsets for it.
    """
    matches = nltk.corpus.wordnet.synsets(word)
    if not matches:
        # Unknown to WordNet: hand the token back unchanged
        return word
    candidates = matches[0].lemma_names()
    return random.choice(candidates)

def augment_text(text, n=1, p_delete=0.2, n_swaps=2, n_inserts=2):
    """Create ``n`` noisy variants of ``text`` (delete, swap, synonym insert).

    Each variant is built from the tokens of ``text`` in three steps:
    random deletion, then random swaps, then random synonym insertions.

    Args:
        text: input string; tokenized with ``nltk.word_tokenize``.
        n: number of variants to generate.
        p_delete: probability with which each token is dropped.
        n_swaps: number of swap attempts (skipped while fewer than two
            tokens remain).
        n_inserts: number of synonym insertions (skipped while no tokens
            remain).

    Returns:
        List of ``n`` strings, tokens joined by single spaces. A variant is
        empty only when ``text`` itself yields no tokens.
    """
    words = nltk.word_tokenize(text)
    augmented_texts = []

    for _ in range(n):
        # Random Deletion: keep a token when the draw exceeds p_delete
        tokens = [word for word in words if random.uniform(0, 1) > p_delete]

        # Short inputs can lose every token; keep one random token instead of
        # emitting an empty (but still labelled) training sample.
        if not tokens and words:
            tokens = [random.choice(words)]

        # Random Swap: exchange two distinct positions
        for _swap in range(n_swaps):
            if len(tokens) >= 2:
                first, second = random.sample(range(len(tokens)), 2)
                tokens[first], tokens[second] = tokens[second], tokens[first]

        # Random Insertion: insert a synonym of a random token just before it
        for _insert in range(n_inserts):
            if tokens:
                position = random.randint(0, len(tokens) - 1)
                tokens.insert(position, get_synonym(tokens[position]))

        augmented_texts.append(' '.join(tokens))

    return augmented_texts

# Instantiate the EmbeddingAugmenter with specified parameters
# NOTE(review): nothing below uses embed_aug -- augmentation in this cell is
# done by augment_text(). It is kept only so existing references to the name
# keep working; consider removing it.
embed_aug = EmbeddingAugmenter(pct_words_to_swap=0.1, transformations_per_example=1)

# ---- Settings ---------------------------------------------------------------
dataset_path = r'All questions answers of Stack Exchange.csv'
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
max_features = 5000          # maximum number of words to consider as features
max_sequence_length = 200    # maximum length of a sequence
embedding_dim = 512          # best 'embedding_dim' from grid search
learning_rate = 0.01         # best 'learning_rate' from grid search
batch_size = 64              # best 'batch_size' from grid search
epochs = 50                  # best 'epochs' from grid search
n_augment = 1                # augmented copies generated per training text
random_seed = 42

# Seed Python's RNG so augment_text() produces the same variants on every run
random.seed(random_seed)

# ---- Load and clean ---------------------------------------------------------
df = pd.read_csv(dataset_path, encoding='latin1')
df = df[selected_columns].copy()

# Rows without a label can be neither trained on nor scored
df = df.dropna(subset=['Negotiation'])
# Remove extra spaces and convert to lowercase so label variants collapse
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()
unique_negotiations_augmented = df['Negotiation'].unique()

# Treat a missing title/body as empty text. Concatenating NaN would make the
# whole text NaN, which astype(str) then turns into the literal string "nan".
df['text'] = df['QuestionTitle'].fillna('').astype(str) + ' ' + df['QuestionBody'].fillna('').astype(str)
df['text'] = df['text'].str.lower()
# Remove punctuation
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords, then lemmatize what is left (set -> O(1) membership test)
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop))

# ---- Split first, augment afterwards ----------------------------------------
# Splitting before augmentation keeps noisy copies of test questions out of the
# training data; the test set contains only original questions.
X_train_text, X_test_text, y_train_original, y_test_augmented = train_test_split(
    df['text'], df['Negotiation'], test_size=0.2, random_state=random_seed)

train_texts = X_train_text.tolist()
train_labels = y_train_original.tolist()

# Data augmentation using the augment_text function (training texts only)
augmented_texts = []
for text in train_texts:
    augmented_texts.extend(augment_text(text, n=n_augment))

# Combine original and augmented training texts; every augmented text inherits
# the label of the text it was generated from.
df_augmented = pd.DataFrame({
    'text': train_texts + augmented_texts,
    'Negotiation': train_labels + [label for label in train_labels for _ in range(n_augment)],
})
y_train_augmented = df_augmented['Negotiation']

# ---- Vectorize --------------------------------------------------------------
# Fit the vocabulary on training text only so test words do not influence it
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_augmented['text'])

# Pad sequences with zeros to make all sequences of the same length
X_train_augmented = pad_sequences(tokenizer.texts_to_sequences(df_augmented['text']), maxlen=max_sequence_length)
X_test_augmented = pad_sequences(tokenizer.texts_to_sequences(X_test_text), maxlen=max_sequence_length)

# Convert categorical labels to numerical. The encoder sees every label in the
# dataset so transforming the test labels cannot fail on an unseen class.
encoder_augmented = LabelEncoder()
encoder_augmented.fit(df['Negotiation'])
num_classes = len(encoder_augmented.classes_)
y_train_encoded_augmented = encoder_augmented.transform(y_train_augmented)
y_test_encoded_augmented = encoder_augmented.transform(y_test_augmented)

# Oversample the training data only.
# NOTE(review): 'minority' resamples just the single smallest class; confirm
# that is intended if there are more than two classes.
oversample_augmented = RandomOverSampler(sampling_strategy='minority', random_state=random_seed)
X_over_augmented, y_over_augmented = oversample_augmented.fit_resample(X_train_augmented, y_train_encoded_augmented)

# One-hot vectors with a fixed width, so they always match the output layer
y_train_one_hot_augmented = to_categorical(y_over_augmented, num_classes=num_classes)
y_test_one_hot_augmented = to_categorical(y_test_encoded_augmented, num_classes=num_classes)

# ---- Model ------------------------------------------------------------------
# Embedding -> GRU -> global max pooling -> dense -> softmax over the classes
model_augmented = Sequential()
model_augmented.add(Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=max_sequence_length))
model_augmented.add(GRU(64, return_sequences=True))
model_augmented.add(GlobalMaxPooling1D())
model_augmented.add(Dense(64, activation='relu'))
model_augmented.add(Dense(num_classes, activation='softmax'))

model_augmented.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy'])

# Train the model with the oversampled augmented training data
model_augmented.fit(X_over_augmented, y_train_one_hot_augmented, epochs=epochs, batch_size=batch_size, verbose=0)

# ---- Evaluate ---------------------------------------------------------------
# Make predictions on the test set (warnings raised during predict are muted)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    predictions_augmented = model_augmented.predict(X_test_augmented)

# Convert class probabilities back to label strings
predictions_augmented = encoder_augmented.inverse_transform(predictions_augmented.argmax(axis=1))

# Print the parameters actually used, taken from the settings above so the
# report cannot drift from the model definition
print("\n Parameters used in the GRU model for augmented data :")
print(f"- Embedding Dimension: {embedding_dim}")
print(f"- Learning Rate: {learning_rate}")
print(f"- Batch Size: {batch_size}")
print(f"- Number of Epochs: {epochs}")
print()

# Evaluate the performance of the classifier
accuracy_augmented = accuracy_score(y_test_augmented, predictions_augmented)
print(f'Deep Learning Model Accuracy using GRU: {accuracy_augmented:.2f}')

# Display classification report
print('\nClassification Report for Deep Learning Model with Augmented Data:\n', classification_report(y_test_augmented, predictions_augmented, labels=unique_negotiations_augmented, zero_division=1))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from textattack.augmentation import WordNetAugmenter

# Fetch the NLTK packages requested by this cell, one after another
for package in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(package)

# Function for text augmentation using TextAttack
def textattack_data_augment(texts, labels, augmenter):
    """Expand a labelled corpus with augmenter-generated variants.

    For every (text, label) pair the original text is emitted first,
    followed by whatever ``augmenter.augment(text)`` returns; each emitted
    text carries the label of the text it came from.

    Args:
        texts: iterable of input texts.
        labels: iterable of labels, paired with ``texts`` via ``zip``.
        augmenter: object exposing ``augment(text)`` that returns a sized
            collection of texts.

    Returns:
        Tuple ``(texts, labels)`` of two equally long lists.
    """
    expanded_texts = []
    expanded_labels = []
    for sample, target in zip(texts, labels):
        variants = augmenter.augment(sample)
        expanded_texts += [sample, *variants]
        # One label for the original plus one per generated variant
        expanded_labels += [target] * (1 + len(variants))
    return expanded_texts, expanded_labels

# ---- Settings ---------------------------------------------------------------
dataset_path = r'All questions answers of Stack Exchange.csv'
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
max_features = 5000          # maximum number of words to consider as features
max_sequence_length = 200    # maximum length of a sequence

# ---- Load and clean ---------------------------------------------------------
df = pd.read_csv(dataset_path, encoding='latin1')
df = df[selected_columns].copy()

# Rows without a label can be neither trained on nor scored
df = df.dropna(subset=['Negotiation'])
# Same label clean-up as the first experiment in this notebook, so that
# "Yes " and "yes" are not counted as two different classes
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()

# Treat a missing title/body as empty text. Concatenating NaN would make the
# whole text NaN, which astype(str) then turns into the literal string "nan".
df['text'] = df['QuestionTitle'].fillna('').astype(str) + ' ' + df['QuestionBody'].fillna('').astype(str)
df['text'] = df['text'].str.lower()
# Remove punctuation
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords, then lemmatize what is left (set -> O(1) membership test)
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop))

# ---- Split first, augment afterwards ----------------------------------------
# Splitting before augmentation keeps WordNet variants of test questions out
# of the training data; the test set contains only original questions.
X_train_text, X_test_text, y_train_original, y_test_augmented = train_test_split(
    df['text'], df['Negotiation'], test_size=0.2, random_state=42)

# Instantiate the WordNetAugmenter
wordnet_aug = WordNetAugmenter()

# Data augmentation using TextAttack with WordNetAugmenter (training rows only)
augmented_texts, augmented_labels = textattack_data_augment(X_train_text, y_train_original, wordnet_aug)

# Training frame: original training texts plus their augmented variants
df_augmented = pd.DataFrame({'text': augmented_texts, 'Negotiation': augmented_labels})
y_train_augmented = df_augmented['Negotiation']

# ---- Vectorize --------------------------------------------------------------
# Fit the vocabulary on training text only so test words do not influence it
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_augmented['text'])

# Pad sequences with zeros to make all sequences of the same length
X_train_augmented = pad_sequences(tokenizer.texts_to_sequences(df_augmented['text']), maxlen=max_sequence_length)
X_test_augmented = pad_sequences(tokenizer.texts_to_sequences(X_test_text), maxlen=max_sequence_length)

# Convert categorical labels to numerical. The encoder sees every label in the
# dataset so transforming the test labels cannot fail on an unseen class.
encoder_augmented = LabelEncoder()
encoder_augmented.fit(df['Negotiation'])
num_classes = len(encoder_augmented.classes_)
y_train_encoded_augmented = encoder_augmented.transform(y_train_augmented)
y_test_encoded_augmented = encoder_augmented.transform(y_test_augmented)

# Oversample the training data only.
# NOTE(review): 'minority' resamples just the single smallest class; confirm
# that is intended if there are more than two classes.
oversample_augmented = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_over_augmented, y_over_augmented = oversample_augmented.fit_resample(X_train_augmented, y_train_encoded_augmented)

# One-hot vectors with a fixed width, so they always match the output layer
y_train_one_hot_augmented = to_categorical(y_over_augmented, num_classes=num_classes)
y_test_one_hot_augmented = to_categorical(y_test_encoded_augmented, num_classes=num_classes)

# ---- Model ------------------------------------------------------------------
# Embedding -> GRU -> global max pooling -> dense -> softmax over the classes
model_augmented = Sequential()
model_augmented.add(Embedding(input_dim=max_features, output_dim=128, input_length=max_sequence_length))
model_augmented.add(GRU(64, return_sequences=True))
model_augmented.add(GlobalMaxPooling1D())
model_augmented.add(Dense(64, activation='relu'))
model_augmented.add(Dense(num_classes, activation='softmax'))

# Compile the model
model_augmented.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model_augmented.fit(X_over_augmented, y_train_one_hot_augmented, epochs=10, batch_size=32, verbose=1)

# ---- Evaluate ---------------------------------------------------------------
# Make predictions on the test set
predictions_augmented = model_augmented.predict(X_test_augmented)

# Convert class probabilities back to label strings
predictions_augmented = encoder_augmented.inverse_transform(predictions_augmented.argmax(axis=1))

# Evaluate the performance of the classifier
accuracy_augmented = accuracy_score(y_test_augmented, predictions_augmented)
print(f'Accuracy: {accuracy_augmented:.2f}')

# Display classification report
print('\nClassification Report:\n', classification_report(y_test_augmented, predictions_augmented, zero_division=1))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from textattack.augmentation import WordNetAugmenter

# Request each NLTK package this cell asks for, in the original order
nltk_packages = ['stopwords', 'wordnet', 'punkt']
for nltk_package in nltk_packages:
    nltk.download(nltk_package)

# Function for text augmentation using TextAttack
def textattack_data_augment(texts, labels, augmenter):
    """Return the input texts interleaved with their augmented variants.

    Output order per input pair: the original text, then every text returned
    by ``augmenter.augment(text)``. The label list is built in parallel, the
    original's label being repeated for each of its variants.

    Args:
        texts: iterable of texts to augment.
        labels: iterable of labels aligned with ``texts``.
        augmenter: object with an ``augment(text)`` method returning a sized
            collection of texts.

    Returns:
        ``(aug_texts, aug_labels)`` -- two lists of equal length.
    """
    aug_texts, aug_labels = [], []
    for original, label in zip(texts, labels):
        generated = augmenter.augment(original)
        aug_texts.append(original)
        aug_texts.extend(generated)
        # Original + variants all share the same label
        aug_labels.extend([label] * (len(generated) + 1))
    return aug_texts, aug_labels

# NOTE(review): this cell repeats the previous WordNetAugmenter experiment
# step for step; consider keeping only one of the two.

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns].copy()

# Drop unlabeled rows up front and normalize the labels (strip + lowercase),
# matching the label clean-up of the first experiment in this notebook
df = df.dropna(subset=['Negotiation'])
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()

# Build the text column. A missing title or body counts as empty text;
# adding NaN to a string would otherwise end up as the literal text "nan".
title = df['QuestionTitle'].fillna('').astype(str)
body = df['QuestionBody'].fillna('').astype(str)
df['text'] = (title + ' ' + body).str.lower()

# Remove punctuation
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords and lemmatize in one pass (set -> O(1) membership test)
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop))

# Split the ORIGINAL data into training and testing sets. Augmentation runs
# afterwards on the training part only, so no variant of a test question can
# leak into training.
X_train_text, X_test_text, y_train_original, y_test_augmented = train_test_split(
    df['text'], df['Negotiation'], test_size=0.2, random_state=42)

# Instantiate the WordNetAugmenter
wordnet_aug = WordNetAugmenter()

# Data augmentation using TextAttack with WordNetAugmenter
augmented_texts, augmented_labels = textattack_data_augment(X_train_text, y_train_original, wordnet_aug)

# Create a DataFrame with the augmented training data
df_augmented = pd.DataFrame({'text': augmented_texts, 'Negotiation': augmented_labels})
y_train_augmented = df_augmented['Negotiation']

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

# The vocabulary is learned from training text only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_augmented['text'])

X_train_augmented = tokenizer.texts_to_sequences(df_augmented['text'])
X_test_augmented = tokenizer.texts_to_sequences(X_test_text)

# Pad sequences with zeros to make all sequences of the same length
X_train_augmented = pad_sequences(X_train_augmented, maxlen=max_sequence_length)
X_test_augmented = pad_sequences(X_test_augmented, maxlen=max_sequence_length)

# Convert categorical labels to numerical. Fitting on every label in the
# dataset means a class that only occurs in the test split still has an id.
encoder_augmented = LabelEncoder()
encoder_augmented.fit(df['Negotiation'])
num_classes = len(encoder_augmented.classes_)
y_train_encoded_augmented = encoder_augmented.transform(y_train_augmented)
y_test_encoded_augmented = encoder_augmented.transform(y_test_augmented)

# Define oversampling strategy for the training data.
# NOTE(review): 'minority' resamples just the single smallest class; confirm
# that is intended if there are more than two classes.
oversample_augmented = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Fit and apply the transform for the training data
X_over_augmented, y_over_augmented = oversample_augmented.fit_resample(X_train_augmented, y_train_encoded_augmented)

# Convert numerical labels to one-hot vectors of fixed width num_classes
y_train_one_hot_augmented = to_categorical(y_over_augmented, num_classes=num_classes)
y_test_one_hot_augmented = to_categorical(y_test_encoded_augmented, num_classes=num_classes)

# Define the GRU model:
# Embedding -> GRU -> global max pooling -> dense -> softmax over the classes
model_augmented = Sequential([
    Embedding(input_dim=max_features, output_dim=128, input_length=max_sequence_length),
    GRU(64, return_sequences=True),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Compile the model
model_augmented.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model_augmented.fit(X_over_augmented, y_train_one_hot_augmented, epochs=10, batch_size=32, verbose=1)

# Make predictions on the test set
predictions_augmented = model_augmented.predict(X_test_augmented)

# Convert class probabilities back to label strings
predictions_augmented = encoder_augmented.inverse_transform(predictions_augmented.argmax(axis=1))

# Evaluate the performance of the classifier
accuracy_augmented = accuracy_score(y_test_augmented, predictions_augmented)
print(f'Accuracy: {accuracy_augmented:.2f}')

# Display classification report
print('\nClassification Report:\n', classification_report(y_test_augmented, predictions_augmented, zero_division=1))


In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, GlobalMaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from textattack.augmentation import WordNetAugmenter

# NLTK data requested by this cell: stopword list, WordNet, punkt
for nltk_resource in ['stopwords', 'wordnet', 'punkt']:
    nltk.download(nltk_resource)

# Function for text augmentation using TextAttack
def textattack_data_augment(texts, labels, augmenter):
    """Augment each text and return originals plus variants with labels.

    Args:
        texts: iterable of texts.
        labels: iterable of labels, matched to ``texts`` position by position.
        augmenter: object whose ``augment(text)`` returns a sized collection
            of generated texts.

    Returns:
        Two lists ``(aug_texts, aug_labels)``. For each input the original
        text comes first, then its generated variants; every entry is paired
        with the label of the originating text.
    """
    def _expand(text, label):
        # One input pair -> (original + variants, label repeated to match)
        samples = augmenter.augment(text)
        return [text, *samples], [label] * (len(samples) + 1)

    aug_texts, aug_labels = [], []
    for text, label in zip(texts, labels):
        new_texts, new_labels = _expand(text, label)
        aug_texts.extend(new_texts)
        aug_labels.extend(new_labels)
    return aug_texts, aug_labels

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns].copy()

# Drop unlabeled rows up front and normalize the labels (strip + lowercase),
# matching the label clean-up of the first experiment in this notebook
df = df.dropna(subset=['Negotiation'])
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()

# Build the text column. A missing title or body counts as empty text;
# adding NaN to a string would otherwise end up as the literal text "nan".
df['text'] = df['QuestionTitle'].fillna('').astype(str) + ' ' + df['QuestionBody'].fillna('').astype(str)
df['text'] = df['text'].str.lower()

# Remove punctuation
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords and lemmatize in one pass (set -> O(1) membership test)
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop))

# Instantiate the WordNetAugmenter
wordnet_aug = WordNetAugmenter()

# Augment every question once, remembering which original row ('source') each
# text came from. The folds below are drawn over original rows, so a question
# and all of its variants always land on the same side of a split.
records = []
for source, (text, label) in enumerate(zip(df['text'], df['Negotiation'])):
    texts, labels = textattack_data_augment([text], [label], wordnet_aug)
    # textattack_data_augment emits the original text first, then its variants
    for position, (variant, variant_label) in enumerate(zip(texts, labels)):
        records.append((source, position == 0, variant, variant_label))

# Create a DataFrame with augmented data
df_augmented = pd.DataFrame(records, columns=['source', 'is_original', 'text', 'Negotiation'])

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

# Convert categorical labels to numerical
encoder_augmented = LabelEncoder()
df_augmented['label_id'] = encoder_augmented.fit_transform(df_augmented['Negotiation'])
num_classes = len(encoder_augmented.classes_)

# Initialize KFold with 10 splits
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Function to build the GRU model with updated hyperparameters
def build_model():
    """Build and compile a fresh GRU classifier.

    Reads ``max_features``, ``max_sequence_length`` and ``num_classes`` from
    module scope.

    Returns:
        A compiled ``Sequential`` model: Embedding(512) -> GRU(128) ->
        GlobalMaxPooling1D -> Dense(64, relu) -> Dropout(0.5) -> softmax,
        using Adam (learning rate 0.0005) and categorical cross-entropy.
    """
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=512, input_length=max_sequence_length))
    model.add(GRU(units=128, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model with specified hyperparameters
    optimizer = Adam(learning_rate=0.0005)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

# K-Fold Cross Validation
# Everything that learns from data (vocabulary, oversampling, model) is fitted
# inside the loop on the training fold only. Each test fold holds original,
# un-augmented questions that the training fold has never seen.
accuracies = []
for fold, (train_sources, test_sources) in enumerate(kf.split(df), start=1):
    print(f"Training on fold {fold}...")

    # kf.split(df) yields row positions, which are exactly the 'source' ids
    train_rows = df_augmented[df_augmented['source'].isin(train_sources)]
    test_rows = df_augmented[df_augmented['source'].isin(test_sources) & df_augmented['is_original']]

    # Tokenize and pad sequences (vocabulary from the training fold only)
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(train_rows['text'])
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_rows['text']), maxlen=max_sequence_length)
    X_test = pad_sequences(tokenizer.texts_to_sequences(test_rows['text']), maxlen=max_sequence_length)

    # Oversample the training fold only, so no duplicated row can show up in
    # the test fold.
    # NOTE(review): 'minority' resamples just the single smallest class;
    # confirm that is intended if there are more than two classes.
    oversample_augmented = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_train, y_train_ids = oversample_augmented.fit_resample(X_train, train_rows['label_id'].to_numpy())

    # One-hot targets with a fixed width matching the output layer
    y_train = to_categorical(y_train_ids, num_classes=num_classes)
    y_test_labels = test_rows['label_id'].to_numpy()

    # Build and train the model
    model_augmented = build_model()
    model_augmented.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

    # Evaluate the model
    predictions_augmented = model_augmented.predict(X_test)
    predictions_augmented_labels = predictions_augmented.argmax(axis=1)

    accuracy_augmented = accuracy_score(y_test_labels, predictions_augmented_labels)
    accuracies.append(accuracy_augmented)

    print(f"Accuracy for fold {fold}: {accuracy_augmented:.2f}")
    print('\nClassification Report:\n', classification_report(y_test_labels, predictions_augmented_labels, zero_division=1))

# Print the average accuracy across all folds
print(f"\nAverage Accuracy across all folds: {sum(accuracies)/len(accuracies):.2f}")


In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, GlobalMaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from textattack.augmentation import EmbeddingAugmenter

# Pull the three NLTK packages this cell requests, keeping the original order
for wanted in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(wanted)

# Function for text augmentation using TextAttack
def textattack_data_augment(texts, labels, augmenter):
    """Pair every text and each of its augmented variants with its label.

    Args:
        texts: iterable of texts.
        labels: iterable of labels zipped with ``texts``.
        augmenter: object providing ``augment(text)``, which returns a sized
            collection of generated texts.

    Returns:
        ``(aug_texts, aug_labels)``: two parallel lists. Each original text
        is followed directly by its variants, all sharing its label.
    """
    # Collect (text, label) pairs first, then split them into two lists
    pairs = []
    for text, label in zip(texts, labels):
        variants = augmenter.augment(text)
        pairs.append((text, label))
        pairs.extend(zip(variants, [label] * len(variants)))

    aug_texts = [pair_text for pair_text, _ in pairs]
    aug_labels = [pair_label for _, pair_label in pairs]
    return aug_texts, aug_labels

# Load the dataset
dataset_path = r'All questions answers of Stack Exchange.csv'
df = pd.read_csv(dataset_path, encoding='latin1')

# Select relevant columns for classification
selected_columns = ['QuestionTitle', 'QuestionBody', 'Negotiation']
df = df[selected_columns].copy()

# Drop unlabeled rows up front and normalize the labels (strip + lowercase),
# matching the label clean-up of the first experiment in this notebook
df = df.dropna(subset=['Negotiation'])
df['Negotiation'] = df['Negotiation'].str.strip().str.lower()

# Build the text column. A missing title or body counts as empty text;
# adding NaN to a string would otherwise end up as the literal text "nan".
df['text'] = df['QuestionTitle'].fillna('').astype(str) + ' ' + df['QuestionBody'].fillna('').astype(str)
df['text'] = df['text'].str.lower()

# Remove punctuation
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove stopwords and lemmatize in one pass (set -> O(1) membership test)
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop))

# Instantiate the EmbeddingAugmenter
embed_aug = EmbeddingAugmenter(pct_words_to_swap=0.1, transformations_per_example=1)

# Augment every question once, remembering which original row ('source') each
# text came from. The folds below are drawn over original rows, so a question
# and all of its variants always land on the same side of a split.
records = []
for source, (text, label) in enumerate(zip(df['text'], df['Negotiation'])):
    texts, labels = textattack_data_augment([text], [label], embed_aug)
    # textattack_data_augment emits the original text first, then its variants
    for position, (variant, variant_label) in enumerate(zip(texts, labels)):
        records.append((source, position == 0, variant, variant_label))

# Create a DataFrame with augmented data
df_augmented = pd.DataFrame(records, columns=['source', 'is_original', 'text', 'Negotiation'])

# Define maximum number of words to consider as features
max_features = 5000

# Define maximum length of a sequence
max_sequence_length = 200

# Convert categorical labels to numerical
encoder_augmented = LabelEncoder()
df_augmented['label_id'] = encoder_augmented.fit_transform(df_augmented['Negotiation'])
num_classes = len(encoder_augmented.classes_)

# Initialize KFold with 10 splits
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Function to build the GRU model with updated hyperparameters
def build_model():
    """Build and compile a fresh GRU classifier.

    Reads ``max_features``, ``max_sequence_length`` and ``num_classes`` from
    module scope.

    Returns:
        A compiled ``Sequential`` model: Embedding(512) -> GRU(128) ->
        GlobalMaxPooling1D -> Dense(64, relu) -> Dropout(0.5) -> softmax,
        using Adam (learning rate 0.0005) and categorical cross-entropy.
    """
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=512, input_length=max_sequence_length))
    model.add(GRU(units=128, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model with specified hyperparameters
    optimizer = Adam(learning_rate=0.0005)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

# K-Fold Cross Validation
# Everything that learns from data (vocabulary, oversampling, model) is fitted
# inside the loop on the training fold only. Each test fold holds original,
# un-augmented questions that the training fold has never seen.
accuracies = []
for fold, (train_sources, test_sources) in enumerate(kf.split(df), start=1):
    print(f"Training on fold {fold}...")

    # kf.split(df) yields row positions, which are exactly the 'source' ids
    train_rows = df_augmented[df_augmented['source'].isin(train_sources)]
    test_rows = df_augmented[df_augmented['source'].isin(test_sources) & df_augmented['is_original']]

    # Tokenize and pad sequences (vocabulary from the training fold only)
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(train_rows['text'])
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_rows['text']), maxlen=max_sequence_length)
    X_test = pad_sequences(tokenizer.texts_to_sequences(test_rows['text']), maxlen=max_sequence_length)

    # Oversample the training fold only, so no duplicated row can show up in
    # the test fold.
    # NOTE(review): 'minority' resamples just the single smallest class;
    # confirm that is intended if there are more than two classes.
    oversample_augmented = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_train, y_train_ids = oversample_augmented.fit_resample(X_train, train_rows['label_id'].to_numpy())

    # One-hot targets with a fixed width matching the output layer
    y_train = to_categorical(y_train_ids, num_classes=num_classes)
    y_test_labels = test_rows['label_id'].to_numpy()

    # Build and train the model
    model_augmented = build_model()
    model_augmented.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

    # Evaluate the model
    predictions_augmented = model_augmented.predict(X_test)
    predictions_augmented_labels = predictions_augmented.argmax(axis=1)

    accuracy_augmented = accuracy_score(y_test_labels, predictions_augmented_labels)
    accuracies.append(accuracy_augmented)

    print(f"Accuracy for fold {fold}: {accuracy_augmented:.2f}")
    print('\nClassification Report:\n', classification_report(y_test_labels, predictions_augmented_labels, zero_division=1))

# Print the average accuracy across all folds
print(f"\nAverage Accuracy across all folds: {sum(accuracies)/len(accuracies):.2f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nekdilkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
