In [None]:
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


**Importing Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import re
import nlpaug.augmenter.word as naw
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

**Loading the Dataset**

In [None]:
#reading the dataset in csv format

data = pd.read_csv('/content/drive/MyDrive/Datasets/gittercom_annotated_data.csv')

**Text Preprocessing**


In [None]:
def text_preprocessing(text):
    #removing punctuation from text data
    text = re.sub(r'[^\w\s]', '', text)

    #removing special symbols from text data
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    #replacing usernames in text data with '<username>'
    text = re.sub(r'@[^\s]+', '<username>', text)

    #replacing url in text data with '<url>' 
    text = re.sub(r'http\S+|www\S+|\S+\.com\S+', '<url>', text)
    
    #replacing code snippets in text data with '<code>' 
    text = re.sub(r'`[^`]+`', '<code>', text)
    
    return text

**Applying Text Preprocessing to the Text Data**

In [None]:
#applying preprocessing techniques to the text data ('message' column holds all textual communications,i.e., text data)

data['message'] = data['message'].apply(text_preprocessing)

**Counting the Total Number of Instances of Our Dataset**

In [None]:
data.shape[0]

1000

In [None]:
#exploring first five rows of our dataset

data.head()

Unnamed: 0,Channel,message,category
0,Cucumber,Hi Team I just recently upgraded our cucumberj...,fear
1,Cucumber,So github is trying to replace irc P,fear
2,Cucumber,aslakhellesoy Thanks seems like I was using o...,joy
3,Cucumber,Sidkiyassine just call the methods directly,surprise
4,Cucumber,Hello guys need a helpI want to call the run m...,fear


In [None]:
#exploring the instances of message column that holds text data

data['message']

0      Hi Team I just recently upgraded our cucumberj...
1                   So github is trying to replace irc P
2      aslakhellesoy Thanks  seems like I was using o...
3            Sidkiyassine just call the methods directly
4      Hello guys need a helpI want to call the run m...
                             ...                        
995    Besides the unzip issue which I saw you solved...
996    Hopefully now that the unzip issue is fixed we...
997     I can see that being a problem with my community
998                       which would slow down installs
999         Thanks yet again Ill try your suggestion now
Name: message, Length: 1000, dtype: object

**Splitting the Dataset for Classification**

In [None]:
X = data['message'].values
y = data['category'].values

#splitting the data into training and testing sets (70% for training and 30% for testing)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

**Data Augmentation of training data using Synonym**

In [None]:
augmented_data = []
augmented_category = []
no_of_targeted_augmentation = 30000

#initializing SynonymAug class of nlp aug library that leverage semantic meaning to substitute word using synonym
data_augmentation = naw.SynonymAug(aug_src='wordnet')

while len(augmented_data) < no_of_targeted_augmentation:
    for message, category in zip(X_train, y_train):
        augmented_text = data_augmentation.augment(message)
        augmented_data.append(augmented_text)
        augmented_category.append(category)
        if len(augmented_data) == no_of_targeted_augmentation:
            break

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
X_train = X_train.tolist() + augmented_data[:no_of_targeted_augmentation]
y_train = y_train.tolist() + augmented_category[:no_of_targeted_augmentation]

**Counting the Total Number of Instances of training data After Data Augmentation**

In [None]:
len(X_train)

30700

**Dropping Any Possible Duplicate Instance(s) of Text Data Due to Performing Data Augmentation**

In [None]:
data.drop_duplicates(subset='message', inplace=True)

In [None]:
len(X_train)

30700

**Exploring Augmented Data**

In [None]:
for message, category in zip(X_train, y_train):
    print("message:", message)
    print("category:", category)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
message: ['also jdubois cannot find the pourboire folder <url >']
category: sadness
message: ['hawaii i am running into cucumberspring124 v springboot130 issues <url> javalangArrayStoreException sunreflectannotationTypeNotPresentExceptionProxy']
category: fear
message: ['ane care i could do this sort of oeuvre for free as well only unfortunately its non possible right nowadays but it would be a dream come rightful to work total time on a projection like this']
category: sadness
message: ['So 1 would merely prolong that a litle bit possibly with a few notes to that effect and everything would exist all good']
category: joy
message: ['Tips and legerdemain look awesome']
category: joy
message: ['Ping pink ping Iu2019d like to perplex the jspm workflow for notorious publish tonight D Just 380 needs a fix or be information technology my fault']
category: joy
message: ['Intelligence community an finish this with this']
category

**Converting Augmented Data Back into Strings for Feature Extraction**

In [None]:
X_train = [' '.join(x) for x in X_train]

**Preparing the Data Before Feeding It to the BiLSTM Model**

In [None]:
#converting text data to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

max_length = 100  

#padding the sequences
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')


**Performing One-Hot Encoding**

In [None]:
one_hot_encoder = OneHotEncoder()
y_train = one_hot_encoder.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_test = one_hot_encoder.transform(np.array(y_test).reshape(-1, 1)).toarray()


**Building BiLSTM Model**

In [None]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100  

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          692300    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         84480     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 6)                 3

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

**Training the Model**

In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd90b9dc850>

**Testing and Evaluating the Model**

In [None]:
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)

#Generating Classification Report of the Model for Computing Precision, Recall and F1-Score for Every Six Emotion Categories
clsf_rprt = classification_report(np.argmax(y_test, axis=1), y_pred_labels)
print("Classification_report:\n", clsf_rprt)

Classification_report:
               precision    recall  f1-score   support

           0       0.19      0.11      0.14        28
           1       0.38      0.38      0.38        78
           2       0.53      0.66      0.59        88
           3       0.25      0.33      0.29         9
           4       0.39      0.38      0.38        88
           5       0.00      0.00      0.00         9

    accuracy                           0.42       300
   macro avg       0.29      0.31      0.30       300
weighted avg       0.39      0.42      0.41       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#calculating precision score
precision = precision_score(np.argmax(y_test, axis=1), y_pred_labels, average='weighted')

#calculating recall score
recall = recall_score(np.argmax(y_test, axis=1), y_pred_labels, average='weighted')

#calculating f1-score score
f1 = f1_score(np.argmax(y_test, axis=1), y_pred_labels, average='weighted')

#calculating accuracy score
accuracy = accuracy_score(np.argmax(y_test, axis=1), y_pred_labels)

print("Precision (weighted):", precision)
print("Recall (weighted):", recall)
print("F1-Score (weighted):", f1)
print("Accuracy:", accuracy)

Precision (weighted): 0.39496797985249144
Recall (weighted): 0.42333333333333334
F1-Score (weighted): 0.40593041202706415
Accuracy: 0.42333333333333334


  _warn_prf(average, modifier, msg_start, len(result))
