In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [194]:
df = pd.read_csv("E:/Generative AI Projects/NLP_Complete/corpus/train.txt", sep = ";", header = None, names = ["text", "emotions"])

In [195]:
df.head(3)

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger


In [196]:
df.isnull().sum()

text        0
emotions    0
dtype: int64

In [197]:
unique_emotions = df["emotions"].unique()

In [198]:
unique_emotions

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [199]:
emotions_numbers = {}
i = 0
for emo in unique_emotions :
    emotions_numbers[emo] = i
    i += 1

df["emotions"] = df["emotions"].map(emotions_numbers)

In [200]:
df.head(7)

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
5,ive been feeling a little burdened lately wasn...,0
6,ive been taking or milligrams or times recomme...,3


### Text Preprocessing
##### Converting to lower case

In [201]:
df["text"] = df["text"].apply(lambda x : x.lower())

##### Removing Punctuations, Numbers and Emojis
In DL based methods like, BERT, GPT, T5 punctuations can hold a meaning, we usually do not remove them during preprocessing

In [202]:
import string

def remove_punc(text) :
    return text.translate(str.maketrans(" ", " ", string.punctuation))

In [203]:
df["text"] = df["text"].apply(remove_punc)

In [204]:
def remove_num(text) :
    new = ""
    for i in text :
        if not i.isdigit() :
            new = new + i
    return new

In [205]:
df["text"] = df["text"].apply(remove_num)

In [206]:
def remove_emojis(text) :
    new = ""
    for i in text :
        if i.isascii() :
            new  = new + i
    return new

df["text"] = df["text"].apply(remove_emojis)

##### Remove Stopword
w.r.t DL, stopwords preserves meaning.

Imp : To remove stopwords it is necessary to transform Corpus -> Sentence -> Words. THis process is Tokenization for which we are using nltk.download("punkt") or word_tokenizer

In [207]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [45]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [208]:
stopwords = set(stopwords.words('english'))

In [209]:
def tokenization(text) :
    words = word_tokenize(text)
    cleaned = []
    for w in words :
        if not w in stopwords :
            cleaned.append(w)
    return cleaned

In [210]:
df["text"].iloc[1]

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [211]:
df["text"] = df["text"].apply(tokenization)

In [212]:
df["text"].iloc[1]

['go',
 'feeling',
 'hopeless',
 'damned',
 'hopeful',
 'around',
 'someone',
 'cares',
 'awake']

#### Feature Normalization
##### Stemming, Lemmatization

In [214]:
df_tokenization = df
df_stemming = df
df_lemmatize = df
df_spacy_pos = df
df_bow = df
df_tfidf = df
df_word2vec = df
df_glove = df

In [149]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

In [215]:
def stemming(tokens) :
    stemmer = PorterStemmer()
    stemmed_word = []
    for t in tokens :
        stemmed_word.append(stemmer.stem(t))
    return stemmed_word

In [216]:
df_tokenization["text"] = df_tokenization["text"].apply(stemming)

In [217]:
df_tokenization["text"].iloc[1]

['go', 'feel', 'hopeless', 'damn', 'hope', 'around', 'someon', 'care', 'awak']

In [152]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swapn\AppData\Roaming\nltk_data...


True

In [181]:
# Map Treebank POS -> WordNet POS
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    pos_tags = pos_tag(tokens)  # Auto POS tagging
    
    lemmatized = []
    for word, tag in pos_tags:
        wn_tag = get_wordnet_pos(tag)
        lemmatized.append(lemmatizer.lemmatize(word, wn_tag))
    
    return lemmatized


In [219]:
df_lemmatize["text"] = df_lemmatize["text"].apply(lemmatize)

In [220]:
df_lemmatize["text"].iloc[1]

['go', 'feel', 'hopeless', 'damn', 'hope', 'around', 'someon', 'care', 'awak']

In [186]:
import spacy
import en_core_web_sm


nlp = en_core_web_sm.load()

In [221]:
def spacy_pos(tokens):  # tokens is a list
    text = " ".join(tokens)  # join back into string
    doc = nlp(text)
    results = []
    for token in doc:
        results.append({
            "text": token.text,
            "pos": token.pos_,
            # "tag": token.tag_,
            "lemma": token.lemma_
        })
    return results

df_spacy_pos["pos_info"] = df_spacy_pos["text"].apply(spacy_pos)

In [222]:
df_spacy_pos

Unnamed: 0,text,emotions,pos_info
0,"[didnt, feel, humili]",0,"[{'text': 'did', 'pos': 'AUX', 'lemma': 'do'},..."
1,"[go, feel, hopeless, damn, hope, around, someo...",0,"[{'text': 'go', 'pos': 'VERB', 'lemma': 'go'},..."
2,"[im, grab, minut, post, feel, greedi, wrong]",1,"[{'text': 'i', 'pos': 'PRON', 'lemma': 'I'}, {..."
3,"[ever, feel, nostalg, fireplac, know, still, p...",2,"[{'text': 'ever', 'pos': 'ADV', 'lemma': 'ever..."
4,"[feel, grouchi]",1,"[{'text': 'feel', 'pos': 'VERB', 'lemma': 'fee..."
...,...,...,...
15995,"[brief, time, beanbag, say, anna, feel, like, ...",0,"[{'text': 'brief', 'pos': 'ADJ', 'lemma': 'bri..."
15996,"[turn, feel, pathet, still, wait, tabl, sub, t...",0,"[{'text': 'turn', 'pos': 'VERB', 'lemma': 'tur..."
15997,"[feel, strong, good, overal]",5,"[{'text': 'feel', 'pos': 'VERB', 'lemma': 'fee..."
15998,"[feel, like, rude, comment, im, glad]",1,"[{'text': 'feel', 'pos': 'VERB', 'lemma': 'fee..."


#### Feature Extraction
##### Bag Of Words, TF-IDF, One-Hot Encoding, UniGram, BiGram

In [225]:
from sklearn.feature_extraction.text import CountVectorizer

In [236]:
# join tokens into string per row
df_bow["joined"] = df_bow["text"].apply(lambda x: " ".join(x))

vectorizer = CountVectorizer(max_features=10)
X = vectorizer.fit_transform(df_bow["joined"])

# convert to DataFrame
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())


In [235]:
bow_df

Unnamed: 0,feel,get,go,im,know,like,make,realli,think,time
0,1,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
15995,1,0,0,0,0,1,0,0,0,1
15996,1,0,0,0,0,0,0,0,0,0
15997,1,0,0,0,0,0,0,0,0,0
15998,1,0,0,1,0,1,0,0,0,0


In [237]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [241]:
# join tokens into string per row
df_tfidf["joined"] = df_tfidf["text"].apply(lambda x: " ".join(x))

vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(df_tfidf["joined"])

# convert to DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [242]:
tfidf_df

Unnamed: 0,actual,also,alway,amaz,anyth,around,away,back,bit,cant,...,way,week,well,without,wonder,work,world,would,write,year
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.447141,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15996,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15997,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15998,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
