# Representations

The focus of this notebook is to explore different representations of the input data. The data is a pre-processed text, and the goal is to find the best representation for the data. The representations that will be explored are:

- Plain Text
- Bag of Words (BoW)
- One-Hot Encoding
- TF-IDF (Term Frequency-Inverse Document Frequency)
- N-grams (different n-gram sizes)
- Word Embeddings (Word2Vec, custom trained) of different sizes and aggregation methods (append, mean, max, min, etc.)
- Custom Representation (using word sentiment and word frequency)

## Importing Libraries and Data

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from collections import defaultdict
from nltk import ngrams
import random
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy
import gensim
import logging
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE




In [5]:
df = pd.read_pickle('data/data_processed.pkl')
df = df.reset_index(drop=True)

df.head()

feel irrit kinda hate feel


In [6]:
# print the first text row

print(df['text'][0])

feel irrit kinda hate feel


## Bag of Words (BoW)

In [4]:
def model_bow(corpus, max_features = 1500):
    vectorizer = CountVectorizer(max_features = max_features)
    x = vectorizer.fit_transform(corpus).toarray()
    return x

In [5]:
# Apply the BOW model to the text column

x = model_bow(df['text'])

# Replace the text column with the new BOW representation
df_bow = df.copy()
df_bow['text'] = x.tolist()

print(f"Original text: {df['text'][0]}")
print(f"BOW representation: {df_bow['text'][0]}")
print(f"Length of the BOW representation: {len(df_bow['text'][0])}")
print (f"Number of none zeros: {np.count_nonzero(df_bow['text'][0])}")

df_bow.head()

Original text: feel irrit kinda hate feel
BOW representation: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Unnamed: 0,text,emotions
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger


In [6]:
df_bow.to_pickle('data/reps/rep_bow.pkl') # Save the dataframe to file

## One-Hot Encoding

In [7]:
def model_one_hot(corpus, max_features = 1500):
    vectorizer_binary = CountVectorizer(binary=True, max_features = max_features)
    x = vectorizer_binary.fit_transform(corpus).toarray()    
    return x

In [8]:
# Apply the one-hot model to the text column

x = model_one_hot(df['text'])

# Replace the text column with the new one-hot representation
df_one_hot = df.copy()
df_one_hot['text'] = x.tolist()

print(f"Original text: {df['text'][0]}")
print(f"One-hot representation: {df_one_hot['text'][0]}")
print(f"Length of the one-hot representation: {len(df_one_hot['text'][0])}")
print(f"Number of ones: {np.count_nonzero(df_one_hot['text'][0])}")

df_one_hot.head()

Original text: feel irrit kinda hate feel
One-hot representation: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Unnamed: 0,text,emotions
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger


In [9]:
# Save the dataframe to file
df_one_hot.to_pickle('data/reps/rep_one_hot.pkl')

## TF-IDF

In [10]:
def model_tf_idf(corpus):
    vectorizer_tfidf = TfidfVectorizer()
    x = vectorizer_tfidf.fit_transform(corpus).toarray()
    return x

In [11]:
# Apply the TF-IDF model to the text column

x = model_tf_idf(df['text'])

# Replace the text column with the new TF-IDF representation
df_tf_idf = df.copy()
df_tf_idf['text'] = x.tolist()

print(f"Original text: {df['text'][0]}")
print(f"TF-IDF representation: {df_tf_idf['text'][0]}")
print(f"Length of the TF-IDF representation: {len(df_tf_idf['text'][0])}")

df_tf_idf.head()

: 

## N-grams

In [27]:
def model_ngram(corpus, ngram_range = (1,2)):
    vectorizer_bigram = CountVectorizer(ngram_range = ngram_range)
    x = vectorizer_bigram.fit_transform(corpus).toarray()
    return x

def train_ngram(n = 5, corpus = []):

    # Create a placeholder for the model
    n_model = defaultdict(lambda: defaultdict(lambda: 0))

    # Count the frequency of each ngram
    for sentence in corpus:
        for w in range(2, n+1):
            for ngram in ngrams(sentence, n, pad_right=True, pad_left=True, left_pad_symbol='<s>', right_pad_symbol='</s>'):
                n_model[ngram[:w-1]][ngram[w-1]] += 1
                
    # Let's transform the counts to probabilities
    for ngram in n_model:
        total_count = float(sum(n_model[ngram].values()))
        for w in n_model[ngram]:
            n_model[ngram][w] = n_model[ngram][w] / total_count

    return n_model

def ngram_guess_next(n_model, text):
    max(n_model[text], key=n_model[text].get)
                
def ngram_complete_sentence(n_model, text):
    while text[-1] != "</s>":
            
            # select a random probability threshold
            r = random.random()
            
            # select word above the probability threshold, conditioned to the previous word text[-1]
            accumulator = .0
            for word in n_model[(text[-3], text[-2], text[-1])]:
                accumulator += n_model[(text[-3], text[-2], text[-1])][word]
                if accumulator >= r:
                    text.append(word)
                    break
                
    print (' '.join([t for t in text if t]))

## Word Embeddings (Word2Vec)

In [28]:
# TODO

## Word Embeddings (Custom)

Training a custom Word2Vec model using the data we have. For that, we will use the `gensim` library and we'll use the original data with fewer pre-processing steps.

In [60]:
df_og = pd.read_pickle("data/data_original.pkl")

df_og.head()

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


### Cleaning Data

In [61]:
nlp = spacy.load("en_core_web_sm") # there are also "en_core_web_md" and "en_core_web_lg" that include vector representations
ps = PorterStemmer()
sw = set(stopwords.words('english'))

def custom_pre_process(sentence):
    
    global ps, sw
    
    # Remove non alpha chars 
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Convert to lower-case
    sentence = sentence.lower()

    # split into tokens, apply stemming and remove stop words
    sentence = ' '.join([ps.stem(w) for w in sentence.split() if w not in sw])

    return sentence

# Does the following:
# 1. Remove non alpha characters
# 2. Convert to lower-case
# 3. Tokenize the sentence
# 4. POS Tagging
# 5. Dependency Parsing
# 6. Named Entity Recognition
# 7. Lemmatization
# 8. Remove stop words
# 9. Sentence Boundary Detection
def spacy_pre_process(sentence):
    global nlp
    return nlp(sentence)

In [62]:
# Apply the pre-processing to the original dataset

df_og_proc = df_og.copy()

df_og_proc['text'] = df_og['text'].apply(custom_pre_process)

df_og_proc.head()

Unnamed: 0,text,emotions
27383,feel aw job get posit succeed happen,sadness
110083,im alon feel aw,sadness
140764,ive probabl mention realli feel proud actual k...,joy
100071,feel littl low day back,sadness
2837,beleiv much sensit peopl feel tend compassion,love


In [63]:
# Apply the pre-processing to the original dataset

df_og_proc2 = df_og.copy()

# df_og_proc2['text'] = df_og['text'].apply(spacy_pre_process) # TAKES A LOT OF TIME

df_og_proc2.head()

Unnamed: 0,text,emotions
27383,"(i, feel, awful, about, it, too, because, it, ...",sadness
110083,"(i, m, alone, i, feel, awful)",sadness
140764,"(i, ve, probably, mentioned, this, before, but...",joy
100071,"(i, was, feeling, a, little, low, few, days, b...",sadness
2837,"(i, beleive, that, i, am, much, more, sensitiv...",love


In [None]:
# save df_og_proc2 to file CSV
df_og_proc2.to_csv('data/data_processed_spacy.csv', index=False)

### Defining useful functions

In [64]:
def train_embeddings(df, col, vec_size=150):
        
    model = gensim.models.Word2Vec(df[col], vector_size=vec_size, window=10, min_count=2, workers=10, sg=1)

    model.wv.save_word2vec_format('data/reps/custom_embedding.bin')
    
def load_embedding():
    wv = gensim.models.KeyedVectors.load_word2vec_format("data/reps/custom_embedding.bin")
    return wv

In [35]:
# Também podemos usar estes modelos pré-treinados para obter embeddings de texto e ha mtos para escolher

""" 
def convert_text_to_embeddings(df, text_column, model_name='all-MiniLM-L6-v2'):

    # Load SentenceTransformer model
    model = SentenceTransformer(model_name)
    
    # Get the texts from the DataFrame
    texts = df[text_column].tolist()
    
    # Compute embeddings
    embeddings = model.encode(texts)

    # insert the embeddings into the DataFrame into a single new column
    df['embeddings'] = embeddings.tolist()
    
    return embeddings

# Example usage:
# Assuming you have a DataFrame df with a 'text' column
embeddings = convert_text_to_embeddings(df, 'text', 'all-MiniLM-L6-v2')


 """;

### Training the model

In [65]:
# train the custom embeddings using gensim
train_embeddings(df_og_proc, 'text', 150)

KeyboardInterrupt: 

### Explore the custom embeddings

In [None]:
# Load the model

wv = load_embedding()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbe in position 0: invalid start byte

In [None]:
wv.most_similar(positive=["polite"], topn=5)

In [None]:
wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1) 

In [None]:
wv.similarity(w1="dirty", w2="smelly")   

In [None]:
wv.doesnt_match(["cat", "dog", "france"]) 

In [None]:
"""
def visualize_embedding(wv, word_list):

    def reduce_dimensions(model, num_dimensions=2, words=[]):

        vectors = [] # positions in vector space
        labels = [] # keep track of words to label our data again later
        
        # if no word list is given, assume we want to use the whole data in the model
        if(words == []):
            words = model.index_to_key

        for word in words:
            vectors.append(model[word])
            labels.append(word)

        # convert both lists into numpy vectors for reduction
        vectors = np.asarray(vectors)
        labels = np.asarray(labels)

        # reduce using t-SNE
        tsne = TSNE(n_components=num_dimensions, random_state=0, perplexity=2)
        vectors = tsne.fit_transform(vectors)

        return vectors, labels

    def plot_with_matplotlib(x_vals, y_vals, labels, words=[]):

        random.seed(0)
        
        x_vals_new = np.array([])
        y_vals_new = np.array([])
        labels_new = np.array([])
        if(words == []):
            # if no word list is given, assume we want to plot the whole data
            x_vals_new = x_vals
            y_vals_new = y_vals
            labels_new = labels
        else:
            for i in range(len(labels)):
                if(labels[i] in words):
                    x_vals_new = np.append(x_vals_new,x_vals[i])
                    y_vals_new = np.append(y_vals_new,y_vals[i])
                    labels_new = np.append(labels_new,labels[i])
        
        plt.figure(figsize=(12, 12))
        plt.scatter(x_vals_new, y_vals_new)

        # apply labels
        for i in range(len(labels_new)):
            plt.annotate(labels_new[i], (x_vals_new[i], y_vals_new[i]))
        
        plt.show()

    vectors, labels = reduce_dimensions(wv, 2, word_list)
    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
            
    plot_with_matplotlib(x_vals, y_vals, labels, word_list)
""";

## Custom Representation

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def model_vader(corpus):
    # VADER is a robust rule-based lexicon tool tuned to assess social media sentiment 
    # Returns a binary result for each phrase in the corpus where 1 is positive
    analyzer = SentimentIntensityAnalyzer()
    x = []
    for rev in corpus:
        x.append(1 if analyzer.polarity_scores(rev)['compound'] > 0 else 0)
    return x

# TODO