In [1]:
import pandas as pd
import random

def add_noise(sentence, p=0.2, rng=None):
    """Return a copy of ``sentence`` in which some words have shuffled letters.

    The sentence is split on whitespace; each word is independently selected
    with probability ``p`` and, when selected, replaced by a random
    permutation of its own characters. The words are then re-joined with
    single spaces.

    Args:
        sentence: Text to perturb.
        p: Per-word probability of scrambling (0 never scrambles, 1 always).
        rng: Optional ``random.Random`` instance (or any object exposing
            ``random()`` and ``sample()``). When omitted, the module-level
            ``random`` generator is used, exactly as before; pass a seeded
            instance to make the augmentation reproducible without touching
            global state.

    Returns:
        The perturbed sentence as a single space-separated string.
    """
    source = random if rng is None else rng
    noisy_words = []
    for word in sentence.split():
        if source.random() < p:
            # A random permutation of the characters; it may happen to equal
            # the original word (always does for one-letter words).
            noisy_words.append(''.join(source.sample(word, len(word))))
        else:
            noisy_words.append(word)
    return ' '.join(noisy_words)

# Read the train and eval CSVs and stack them into a single frame; from here
# on the eval rows are treated as additional training rows.
train_df = pd.read_csv(r'D:\sell\archive (1)\train_updated.csv')
eval_df = pd.read_csv(r'D:\sell\archive (1)\eval_updated.csv')
train_df = pd.concat([train_df, eval_df], ignore_index=True)

# One noisy copy per original sentence (add_noise with its default p)
augmented_sentences = list(map(add_noise, train_df['input'].values))

# Originals first, noisy copies second; the targets are repeated in the same
# order so every noisy sentence keeps the target of the sentence it came from.
all_sentences = [*train_df['input'].values, *augmented_sentences]
all_targets = list(train_df['target'].values) * 2

# Assemble the doubled dataset and write it to disk without the index column
new_train_df = pd.DataFrame({'input': all_sentences, 'target': all_targets})
new_train_df.to_csv(r'D:\sell\archive (1)\augmented_train_data.csv', index=False)


In [2]:
# Display the augmented frame: the original rows followed by their noisy copies.
new_train_df

Unnamed: 0,input,target
0,So I think we can not live if old people coul...,So I think we would not be alive if our ancest...
1,So I think we can not live if old people coul...,So I think we could not live if older people d...
2,So I think we can not live if old people coul...,So I think we can not live if old people could...
3,So I think we can not live if old people coul...,So I think we can not live if old people can n...
4,For not use car .,Not for use with a car .
...,...,...
12003,"In toreh words , the image in the TV comercial...","In other words , the image in TV commercials i..."
12004,Members gather yomen for the funeral nad lpeh ...,Members gather money for the funeral to help t...
12005,Members gather money for the funeral nda help ...,Members gather money for the funeral to help t...
12006,eMbsmer reaght oenmy for the funeral and leph ...,Members gather money for the funeral and help ...


In [3]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

# fit and transform the target column using the LabelBinarizer
# NOTE(review): at this point 'target' holds corrected sentences (free text),
# not class labels. With more than two distinct values LabelBinarizer returns
# a 2-D indicator matrix (one column per distinct value), so assigning the
# result to a single column is suspicious — confirm that this really yields
# the intended 0/1 label and does not raise on the pandas version in use.
new_train_df['target'] = lb.fit_transform(new_train_df['target'])
# Second dataset: its 'labels' column is renamed to 'target' so that both
# frames share the same column names before being stacked row-wise.
# (The 'input' -> 'input' entry of the mapping is a no-op.)
another_data = pd.read_csv(r'D:\sell\archive (1)\NLP Assignment\train_data.csv')
another_data = another_data.rename(columns={'input': 'input', 'labels': 'target'})
df = pd.concat([new_train_df, another_data], ignore_index=True)
df

Unnamed: 0,input,target
0,So I think we can not live if old people coul...,0
1,So I think we can not live if old people coul...,0
2,So I think we can not live if old people coul...,0
3,So I think we can not live if old people coul...,0
4,For not use car .,0
...,...,...
32001,More than a hundred thousand people are suppor...,1
32002,The reason why we chose this subject a few day...,1
32003,When you buy CBS is Criminal Minds DVD in Japa...,1
32004,but it is dizzying for me T T,1


In [4]:
# Number of rows per target value in the combined frame (class balance check).
df['target'].value_counts()

0    22005
1    10001
Name: target, dtype: int64

In [7]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import regularizers, optimizers
from sklearn.preprocessing import LabelEncoder



# Perform data preprocessing
stop_words = set(stopwords.words('english'))


def _clean_text(text):
    """Normalise one sentence before tokenization.

    Lower-cases the text, removes every character that is not an ASCII
    letter, digit or whitespace, collapses whitespace runs into single
    spaces and drops the words found in ``stop_words``.
    """
    text = text.lower()
    # Raw string and an explicit A-Z range: the previous class 'A-z' also
    # covered the ASCII characters that sit between 'Z' and 'a'
    # ([ \ ] ^ _ `), so those were silently kept in the text.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub(r'\s+', ' ', text.strip())
    return ' '.join(word for word in text.split() if word not in stop_words)


# Single pass over the column instead of five separate apply() calls
df['input'] = df['input'].apply(_clean_text)


# Tokenization: fit the vocabulary on the cleaned text, convert every sentence
# to a sequence of word indices and pad/truncate it to 50 positions.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['input'].values)
X = pad_sequences(tokenizer.texts_to_sequences(df['input'].values), maxlen=50)

# Hold out 10% of the rows as the test set; fixed random_state for a
# repeatable split.
Y = df['target'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

# Model: embedding -> LSTM -> two L2-regularised dense layers -> sigmoid unit
model = Sequential([
    # NOTE(review): input_dim spans the tokenizer's full word_index although
    # the tokenizer was created with num_words=5000 — confirm the larger
    # embedding table is intended.
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=X.shape[1],
    ),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    # NOTE(review): the Dense layer above already applies ReLU, so its output
    # is non-negative and this LeakyReLU passes it through unchanged —
    # confirm whether that Dense was meant to be linear.
    LeakyReLU(alpha=0.1),
    Dense(units=32, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    Dense(units=1, activation='sigmoid'),
])

# Compile for binary classification with Adam
# NOTE(review): this seeds NumPy only, and runs after the layers have been
# created — confirm whether it is expected to make training reproducible.
np.random.seed(42)
optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model.summary())

# Callbacks: early stopping watches val_loss (patience of 15 epochs), while
# the checkpoint keeps only the weights with the best val_accuracy on disk.
es = EarlyStopping(monitor='val_loss', mode='min', patience=15, verbose=1)
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Fit, validating on 20% of the training rows
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    callbacks=[es, mc],
)

# Per-epoch loss/accuracy values recorded during training
print(history.history)

# Reload the checkpointed weights and score them on the held-out test set
model.load_weights('best_model.h5')
score, acc = model.evaluate(X_test, Y_test, verbose=1)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 128)           2718848   
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,860,801
Trainable params: 2,860,801
No

In [10]:
# Take the layers from tensorflow.keras — the package that already provides
# Sequential, LeakyReLU, regularizers and optimizers in this notebook —
# rather than from the standalone keras package, so that the model is not
# assembled from objects of two different Keras distributions.
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
# Define the model architecture: embedding -> bidirectional LSTM -> two
# L2-regularised dense layers -> sigmoid unit
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
# NOTE(review): the Dense layer above already applies ReLU, so its output is
# non-negative and this LeakyReLU passes it through unchanged — confirm
# whether that Dense was meant to be linear.
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(units=32, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model for binary classification with Adam
# NOTE(review): this seeds NumPy only, and runs after the layers have been
# created — confirm whether it is expected to make training reproducible.
np.random.seed(42)
optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
print(model.summary())

# Callbacks: early stopping watches val_loss (patience of 15 epochs), while
# the checkpoint keeps only the weights with the best val_accuracy on disk.
es = EarlyStopping(monitor='val_loss', mode='min', patience=15, verbose=1)
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Fit, validating on 20% of the training rows
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    callbacks=[es, mc],
)

# Per-epoch loss/accuracy values recorded during training
print(history.history)

# Reload the checkpointed weights and score them on the held-out test set
model.load_weights('best_model.h5')
score, acc = model.evaluate(X_test, Y_test, verbose=1)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 128)           2718848   
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              263168    
 nal)                                                            
                                                                 
 dense_12 (Dense)            (None, 64)                16448     
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 64)                0         
                                                                 
 dense_13 (Dense)            (None, 32)                2080      
                                                                 
 dense_14 (Dense)            (None, 1)                 33        
                                                      