In [None]:
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Importing Necessary Libraries**

In [None]:
import pandas as pd
import re
import nlpaug.augmenter.word as naw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

**Loading the Dataset**

In [None]:
#reading the dataset in csv format

data = pd.read_csv('/content/drive/MyDrive/Datasets/gittercom_annotated_data.csv')

**Text Preprocessing**


In [None]:
def text_preprocessing(text):
    #removing punctuation from text data
    text = re.sub(r'[^\w\s]', '', text)

    #removing special symbols from text data
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    #replacing usernames in text data with '<username>'
    text = re.sub(r'@[^\s]+', '<username>', text)

    #replacing url in text data with '<url>' 
    text = re.sub(r'http\S+|www\S+|\S+\.com\S+', '<url>', text)
    
    #replacing code snippets in text data with '<code>' 
    text = re.sub(r'`[^`]+`', '<code>', text)
    
    return text

**Applying Text Preprocessing to the Text Data**

In [None]:
#applying preprocessing techniques to the text data ('message' column holds all textual communications,i.e., text data)

data['message'] = data['message'].apply(text_preprocessing)

**Counting the Total Number of Instances of Our Dataset**

In [None]:
data.shape[0]

1000

In [None]:
#exploring first five rows of our dataset

data.head()

Unnamed: 0,Channel,message,category
0,Cucumber,Hi Team I just recently upgraded our cucumberj...,fear
1,Cucumber,So github is trying to replace irc P,fear
2,Cucumber,aslakhellesoy Thanks seems like I was using o...,joy
3,Cucumber,Sidkiyassine just call the methods directly,surprise
4,Cucumber,Hello guys need a helpI want to call the run m...,fear


In [None]:
#exploring the instances of message column that holds text data

data['message']

0      Hi Team I just recently upgraded our cucumberj...
1                   So github is trying to replace irc P
2      aslakhellesoy Thanks  seems like I was using o...
3            Sidkiyassine just call the methods directly
4      Hello guys need a helpI want to call the run m...
                             ...                        
995    Besides the unzip issue which I saw you solved...
996    Hopefully now that the unzip issue is fixed we...
997     I can see that being a problem with my community
998                       which would slow down installs
999         Thanks yet again Ill try your suggestion now
Name: message, Length: 1000, dtype: object

**Splitting the Dataset for Classification**

In [None]:
X = data['message']
y = data['category']

#splitting the data into training and testing sets (70% for training and 30% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


**Data Augmentation of training data using Synonym**

In [None]:
augmented_data = []
augmented_category = []
no_of_targeted_augmentation = 30000

#initializing SynonymAug class of nlp aug library that leverage semantic meaning to substitute word using synonym
data_augmentation = naw.SynonymAug(aug_src='wordnet')

while len(augmented_data) < no_of_targeted_augmentation:
    for message, category in zip(X_train, y_train):
        augmented_text = data_augmentation.augment(message)
        augmented_data.append(augmented_text)
        augmented_category.append(category)
        if len(augmented_data) == no_of_targeted_augmentation:
            break

In [None]:
X_train = X_train.tolist() + augmented_data[:no_of_targeted_augmentation]
y_train = y_train.tolist() + augmented_category[:no_of_targeted_augmentation]

**Counting the Total Number of Instances of training data After Data Augmentation**

In [None]:
len(X_train)

30700

**Dropping Any Possible Duplicate Instance(s) of Text Data Due to Performing Data Augmentation**

In [None]:
data.drop_duplicates(subset='message', inplace=True)

In [None]:
len(X_train)

30700

**Exploring Augmented Text Data**

In [None]:
for message, category in zip(X_train, y_train):
    print("message:", message)
    print("category:", category)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
message: ['as well jdubois cannot discover the gratuity folder <url >']
category: sadness
message: ['hawaii i am go into cucumberspring124 vs springboot130 progeny <url> javalangArrayStoreException sunreflectannotationTypeNotPresentExceptionProxy']
category: fear
message: ['1 wish i could do this kind of workplace for free as well but unfortunately its not potential correctly now but it would cost a dream come rightful to work full meter on a project like this']
category: sadness
message: ['So I would just extend that a litle chip possibly with a few notes to that effect and everything would be all ripe']
category: joy
message: ['Tips and tricks depend awing']
category: joy
message: ['Ping ping ping Iu2019d alike to receive the jspm workflow for infamous publish tonight D Just 380 needs a location or is it my fracture']
category: joy
message: ['National intelligence community an finish this with this']
category: joy
messa

**Converting Augmented Data Back into Strings for Feature Extraction**

In [None]:
X_train = [' '.join(x) for x in X_train]

**Feature Extraction using TF-IDF**

In [None]:
vectorizer = TfidfVectorizer()
                         
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

**Training the Dataset using Decision Tree Classifier**

In [None]:
#training the dataset using Decision Tree
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

**Testing and Evaluating the Model**

In [None]:
#predicting on the testing set
y_pred = model.predict(X_test)

#Generating Classification Report of the Model for Computing Precision, Recall and F1-Score for Every Six Emotion Categories
clsf_rprt = classification_report(y_test, y_pred)
print("Classification_report:\n", clsf_rprt)

Classification_report:
               precision    recall  f1-score   support

       anger       0.19      0.14      0.16        28
        fear       0.39      0.41      0.40        78
         joy       0.51      0.45      0.48        88
        love       0.33      0.33      0.33         9
     sadness       0.40      0.45      0.42        88
    surprise       0.00      0.00      0.00         9

    accuracy                           0.40       300
   macro avg       0.30      0.30      0.30       300
weighted avg       0.39      0.40      0.39       300



In [None]:
#calculating precision score
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision (weighted):", precision)

#calculating recall score
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall (weighted):", recall)

#calculating f1_score score
f1_score = f1_score(y_test, y_pred, average='weighted')
print("f1-score (weighted):", f1_score)

#calculating accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Precision (weighted): 0.3927135655459705
Recall (weighted): 0.39666666666666667
f1-score (weighted): 0.3932733520766151
Accuracy: 0.39666666666666667


**Testing the Efficiency of the Model with New Textual Messages Containing Emotion**

In [None]:
#predicting on new text data containing emotion
text_with_emotion = ["Unfortunately I am still stuck at the error"]
text_with_emotion = vectorizer.transform(text_with_emotion)
predicted_labels = model.predict(text_with_emotion)
print(predicted_labels)

['sadness']
