## Using Embeddings in Text Classification


* Word Embeddings
* Document Embeddings   



In [None]:
#basic imports
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import os
import gzip
import shutil
from time import time

#pre-processing imports
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
try:
    from google.colab import files
    !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv
    !ls -lah DATAPATH
    data = pd.read_csv("DATAPATH/Full-Economic-News-DFE-839861.csv" , encoding = "ISO-8859-1" )

except ModuleNotFoundError:
    data = pd.read_csv("Data/Full-Economic-News-DFE-839861.csv" , encoding = "ISO-8859-1" )

--2024-02-16 03:42:45--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12383529 (12M) [text/plain]
Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv.2’


2024-02-16 03:42:45 (112 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv.2’ saved [12383529/12383529]

total 36M
drwxr-xr-x 2 root root 4.0K Feb 16 03:42 .
drwxr-xr-x 1 root root 4.0K Feb 16 03:09 ..
-rw-r--r-- 1 root root  12M Feb 16 02:46 Full-Economic-News-DFE-839861.csv
-rw-r--r-- 1 root root  12M Feb 16 02:46 Full-Economic-News-DFE-839861.csv.1
-rw-r--r-- 1 root root  12M Feb 16 03:42 Full-Economic-News-DFE-839861.csv.2


In [None]:
# Dataset dimensions as (rows, columns).
display(data.shape)
# Share of each relevance label relative to the total number of rows.
data["relevance"].value_counts() / len(data)

(8000, 15)

no          0.821375
yes         0.177500
not sure    0.001125
Name: relevance, dtype: float64

In [None]:
# Convert the label to a numerical variable.
# Drop the "not sure" rows and keep only the two columns used downstream in a single
# .loc selection; .copy() makes `data` an independent frame, so the column assignment
# below does not write into a slice of the original frame (chained assignment).
data = data.loc[data["relevance"] != "not sure", ["text", "relevance"]].copy()
# relevant ("yes") -> 1, not relevant ("no") -> 0.
data["relevance"] = data["relevance"].map({"yes": 1, "no": 0})
data.shape

(7991, 2)

In [None]:
# Preview the first rows of the reduced (text, relevance) frame.
data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,1
1,The Wall Street Journal Online</br></br>The Mo...,0
2,WASHINGTON -- In an effort to achieve banking ...,0
3,The statistics on the enormous costs of employ...,0
4,NEW YORK -- Indecision marked the dollar's ton...,1


**Download the pre-trained GloVe model**

In [None]:
import gensim.downloader

# Load pre-trained GloVe vectors through gensim's downloader.
# The 300-dimensional variant is used because the inspection output recorded in this
# notebook reports 300-dimensional vectors; the 100-dimensional model loaded before
# did not match that.
w2v_model = gensim.downloader.load('glove-wiki-gigaword-300')



In [None]:
# Parallel Python lists: the raw article text and its numeric relevance label.
texts = data['text'].tolist()
cats = data['relevance'].tolist()

In [None]:
# Inspect the pre-trained model.
word2vec_vocab = w2v_model.key_to_index
# Number of words known to the pre-trained model.
vocab_size = len(word2vec_vocab)
print(vocab_size)
# Look up an arbitrary word ('dog') to see the dimensionality of the vectors.
dog_vector = w2v_model['dog']
print(len(dog_vector))

400000
300


In [None]:
from nltk.stem import WordNetLemmatizer

# English stopwords as a set for fast membership tests during preprocessing.
english_stopwords = stopwords.words("english")
mystopwords = set(english_stopwords)
# Fetch the WordNet data; kept as the last expression so its result is displayed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# NOTE(review): `lemmatized` is not referenced anywhere in the visible notebook — confirm before removing.
lemmatized = []
# Shared WordNet lemmatizer instance used by preprocess_corpus().
wn = WordNetLemmatizer()

def preprocess_corpus(texts, tokenize=None, lemmatize=None, stop_words=None):
    """Tokenize and normalise every document in ``texts``.

    For each token the pipeline is: lowercase -> drop stopwords, pure-digit tokens
    and punctuation-only tokens -> lemmatize. Filtering happens *before*
    lemmatization so that a stopword cannot slip through in lemmatized form
    (e.g. "was" being turned into "wa"), and lowercasing happens first so that
    capitalised words are lemmatized the same way as their lowercase forms.

    Args:
        texts: iterable of raw document strings.
        tokenize: callable mapping a string to a list of tokens.
            Defaults to NLTK's ``word_tokenize``.
        lemmatize: callable mapping a single word to its lemma.
            Defaults to ``wn.lemmatize`` (the notebook's WordNetLemmatizer).
        stop_words: container of lowercase stopwords.
            Defaults to the notebook-level ``mystopwords``.

    Returns:
        A list with one list of cleaned tokens per input document.
    """
    # Defaults are resolved at call time so the notebook-level objects are picked up lazily.
    tokenize = word_tokenize if tokenize is None else tokenize
    lemmatize = wn.lemmatize if lemmatize is None else lemmatize
    stop_words = mystopwords if stop_words is None else stop_words
    # A set of single characters: `token in punctuation` on the raw string would be a
    # substring test, which lets tokens such as "--" through.
    punct_chars = set(punctuation)

    def remove_stops_digits(tokens):
        # Nested helper: lowercases, filters and lemmatizes one document's tokens.
        cleaned = []
        for token in tokens:
            lowered = token.lower()
            if lowered in stop_words or lowered.isdigit():
                continue
            # Drops empty tokens and tokens made up solely of punctuation characters.
            if all(ch in punct_chars for ch in lowered):
                continue
            cleaned.append(lemmatize(lowered))
        return cleaned

    # Apply the helper to the tokenizer output of every document.
    return [remove_stops_digits(tokenize(text)) for text in texts]

In [None]:
# Clean the whole corpus, then sanity-check that labels and documents still line up.
texts_processed = preprocess_corpus(texts)
print(len(cats), len(texts_processed))
# Show one processed document together with its label.
sample_idx = 1
print(texts_processed[sample_idx])
print(cats[sample_idx])

7991 7991
['wall', 'street', 'journal', 'online', '/br', '/br', 'morning', 'brief', 'look', 'day', "'s", 'biggest', 'news', 'emailed', 'subscriber', 'a.m.', 'every', 'business', 'day', 'sign', 'e-mail', 'here.', '/br', '/br', 'friday', 'evening', 'congress', 'town', 'summer', 'recess', 'americans', 'heading', 'mid-august', 'weekend', 'bush', 'administration', 'sent', 'message', 'state', 'federal', 'government', 'make', 'tougher', 'national', 'child', "'s", 'insurance', 'program', 'cover', 'offspring', 'middle-income', 'families.', '/br', '/br', 'state', 'children', "'s", 'health', 'insurance', 'program', 'wa', 'created', 'help', 'child', 'whose', 'family', 'could', "n't", 'afford', 'insurance', "n't", 'qualify', 'medicaid', 'administration', 'official', 'tell', 'new', 'york', 'times', 'change', 'aimed', 'returning', 'program', 'low-', 'income', 'focus', 'assuring', "n't", 'become', 'replacement', 'private', 'insurance', 'administration', 'point', 'man', 'dennis', 'smith', 'wrote', 'sta

In [None]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists, model=None):
    """Build one feature vector per document by averaging its word embeddings.

    Args:
        list_of_lists: iterable of token lists, one list per document.
        model: embedding lookup supporting ``token in model``, ``model[token]`` and
            a ``vector_size`` attribute. Defaults to the notebook-level ``w2v_model``.

    Returns:
        A list of float64 numpy arrays of length ``model.vector_size``. A document
        with no token known to the model gets its own all-zero vector.
    """
    if model is None:
        model = w2v_model
    # Take the dimensionality from the model instead of hard-coding it, so the
    # function works with whichever pre-trained vectors were loaded.
    dimension = model.vector_size

    feats = []
    for tokens in list_of_lists:
        known = [model[token] for token in tokens if token in model]
        if known:
            # Exact mean over the in-vocabulary tokens, accumulated in float64.
            feats.append(np.mean(known, axis=0, dtype=np.float64))
        else:
            # Fresh array per document so results never alias each other.
            feats.append(np.zeros(dimension))
    return feats


# Despite the name, this holds one averaged vector per document for the whole corpus;
# the train/test split is applied to it in a later cell.
train_vectors = embedding_feats(texts_processed)
print(len(train_vectors))

7991


In [None]:
# Take any classifier (LogisticRegression here) and train/test it like before.
train_data, test_data, train_cats, test_cats = train_test_split(
    train_vectors,
    cats,
    random_state=42,
)

# class_weight="balanced" because the two classes are not equally frequent.
classifier = LogisticRegression(class_weight="balanced").fit(train_data, train_cats)
preds = classifier.predict(test_data)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(test_cats, preds)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.69      0.78      1621
           1       0.34      0.68      0.45       377

    accuracy                           0.69      1998
   macro avg       0.62      0.69      0.62      1998
weighted avg       0.80      0.69      0.72      1998



## Document Embeddings

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Same (text, relevance) frame as in the word-embedding section; shown again for reference.
data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,1
1,The Wall Street Journal Online</br></br>The Mo...,0
2,WASHINGTON -- In an effort to achieve banking ...,0
3,The statistics on the enormous costs of employ...,0
4,NEW YORK -- Indecision marked the dollar's ton...,1


In [None]:
# Split the preprocessed documents into train and test, following the usual process.
train_data, test_data, train_cats, test_cats = train_test_split(
    texts_processed,
    cats,
    random_state=42,
)

# Wrap each training document in the TaggedDocument format doc2vec expects,
# tagging it with its position in the training list.
train_doc2vec = [
    TaggedDocument(doc_tokens, tags=[str(doc_idx)])
    for doc_idx, doc_tokens in enumerate(train_data)
]
# Train a doc2vec model to learn document representations. Use only training data!
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm=1, epochs=100)
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save("d2v.model")
print("Model Saved")

Model Saved


In [None]:
# Reload the saved model and infer a feature vector for every training and test document.
model = Doc2Vec.load("d2v.model")
train_vectors = list(map(model.infer_vector, train_data))
test_vectors = list(map(model.infer_vector, test_data))

# Any regular classifier works on top of the vectors; logistic regression is used here,
# with balanced class weights because the classes are not balanced.
myclass = LogisticRegression(class_weight="balanced").fit(train_vectors, train_cats)
preds = myclass.predict(test_vectors)

report = classification_report(test_cats, preds)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.64      0.75      1621
           1       0.32      0.71      0.44       377

    accuracy                           0.66      1998
   macro avg       0.61      0.67      0.59      1998
weighted avg       0.79      0.66      0.69      1998

