In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [156]:
# NOTE(review): absolute, machine-specific path -- the notebook cannot run on
# another machine without editing this line.
df = pd.read_csv(
    "E:/Generative AI Projects/NLP_Complete/corpus/train.txt",
    sep=";",  # fields on each line are separated by ';'
    header=None,  # first line is data, not column names
    names=["text", "emotions"],
)

In [157]:
df.head(3)

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger


In [158]:
df.isnull().sum()

text        0
emotions    0
dtype: int64

In [159]:
# Distinct emotion labels, in the order they first appear in the column
# (Series.unique() does not sort).
unique_emotions = df["emotions"].unique()

In [160]:
unique_emotions

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [161]:
# Give every emotion label an integer code equal to its position in
# `unique_emotions`, then replace the string labels with those codes.
# NOTE(review): the codes follow the order in which labels first appear in this
# file; reuse `emotions_numbers` for other splits instead of rebuilding it.
# Re-running this cell on the already-encoded column produces NaN, because the
# integer codes are not keys of the mapping.
emotions_numbers = {label: code for code, label in enumerate(unique_emotions)}

df["emotions"] = df["emotions"].map(emotions_numbers)

In [162]:
df.head(7)

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
5,ive been feeling a little burdened lately wasn...,0
6,ive been taking or milligrams or times recomme...,3


### Text Preprocessing
##### Converting to lower case

In [163]:
# Lower-case every text with the vectorized string accessor. Unlike a
# Python-level lambda, it propagates missing values as NaN instead of raising.
df["text"] = df["text"].str.lower()

##### Removing Punctuations, Numbers and Emojis
In DL-based methods such as BERT, GPT, and T5, punctuation can carry meaning, so it is usually not removed during preprocessing.

In [164]:
import string

# Built once instead of on every call: deletes each character listed in
# string.punctuation and maps nothing else.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punc(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Only the characters in ``string.punctuation`` are deleted; letters,
    digits, whitespace and non-ASCII symbols are left untouched.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [165]:
# Strip punctuation from every row; the "text" column is overwritten.
df["text"] = df["text"].apply(remove_punc)

In [166]:
def remove_num(text) :
    """Return ``text`` with every digit character dropped.

    A character is treated as a digit when ``str.isdigit()`` is true for it,
    so digit-like Unicode characters (e.g. superscript digits) are removed
    as well. All other characters keep their original order.
    """
    return "".join(ch for ch in text if not ch.isdigit())

In [167]:
# Drop digit characters from every row; the "text" column is overwritten.
df["text"] = df["text"].apply(remove_num)

In [168]:
def remove_emojis(text) :
    """Return ``text`` keeping only its ASCII characters.

    Despite the name, this removes every non-ASCII character: emojis, but
    also accented letters and any other non-ASCII symbol. The remaining
    characters keep their original order.
    """
    return "".join(filter(str.isascii, text))

# Remove non-ASCII characters from every row; the "text" column is overwritten.
df["text"] = df["text"].apply(remove_emojis)

##### Removing Stopwords
For DL-based methods, stopwords help preserve meaning.

**Important:** To remove stopwords, the corpus must first be split into sentences and then into words. This process is called tokenization; here we use NLTK's `word_tokenize`, which relies on the data fetched with `nltk.download("punkt")`.

In [169]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [45]:
# Fetch the NLTK data packages used in this notebook: "punkt" for tokenization
# and "stopwords" for the stop-word lists.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [176]:
# Build the English stop-word set through `nltk.corpus` rather than through the
# imported name `stopwords`: this assignment rebinds `stopwords` to a set, so
# the previous `stopwords.words(...)` raised AttributeError whenever the cell
# was run a second time. Reading via `nltk.corpus` keeps the cell re-runnable.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [177]:
def tokenization(text) :
    """Split ``text`` into word tokens and drop the stopwords.

    The text is tokenized with ``word_tokenize``; every token that is not a
    member of the module-level ``stopwords`` collection is kept, in its
    original order.

    Returns
    -------
    list
        The remaining tokens.
    """
    return [token for token in word_tokenize(text) if token not in stopwords]

In [178]:
df["text"].iloc[1]

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [179]:
# From here on each entry of the "text" column is a list of tokens, not a string.
df["text"] = df["text"].apply(tokenization)

In [180]:
df["text"].iloc[1]

['go',
 'feeling',
 'hopeless',
 'damned',
 'hopeful',
 'around',
 'someone',
 'cares',
 'awake']

##### Stemming, Lemmatization

In [149]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

In [145]:
def stemming(tokens) :
    """Return the Porter stem of every token in ``tokens``.

    A fresh ``PorterStemmer`` is created for the call and applied to each
    token in turn; the result is a new list with one stem per input token,
    in the same order.
    """
    porter = PorterStemmer()
    return [porter.stem(token) for token in tokens]

In [146]:
# NOTE(review): this cell's execution count (146) is older than the
# tokenization/lemmatization cells (179-183), and the final lemmatized output
# still shows unstemmed tokens ('feeling', 'hopeful'). Stemming was therefore
# not part of the last run; a top-to-bottom run would lemmatize stems instead.
# Decide whether to keep stemming or lemmatization and drop the other.
df["text"] = df["text"].apply(stemming)

In [147]:
df["text"].iloc[1]

['go', 'feel', 'hopeless', 'damn', 'hope', 'around', 'someon', 'care', 'awak']

In [152]:
# WordNet data backs WordNetLemmatizer.
nltk.download('wordnet')
# `pos_tag` (called inside `lemmatize`) needs the perceptron tagger model, which
# no other cell downloads; without it a fresh environment fails with LookupError.
# NOTE(review): recent NLTK releases name this resource
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...


True

In [181]:
# Map Treebank POS -> WordNet POS
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    return wordnet.NOUN  # default to noun

def lemmatize(tokens):
    """Lemmatize each token using its automatically assigned POS tag.

    The token list is tagged with ``pos_tag``; each (token, tag) pair is then
    lemmatized by a ``WordNetLemmatizer`` with the WordNet POS returned by
    ``get_wordnet_pos``. Returns a new list of lemmas in the input order.
    """
    wn_lemmatizer = WordNetLemmatizer()
    return [
        wn_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tag(tokens)  # Auto POS tagging
    ]


In [182]:
# Replace each token list with its lemmatized version (expects the "text"
# column to already hold token lists, as produced by `tokenization`).
df["text"] = df["text"].apply(lemmatize)

In [183]:
df["text"].iloc[1]

['go',
 'feeling',
 'hopeless',
 'damn',
 'hopeful',
 'around',
 'someone',
 'care',
 'awake']