# Representations

The focus of this notebook is to explore different representations of the input data. The data is a pre-processed text, and the goal is to find the best representation for the data. The representations that will be explored are:

- Plain Text
- Bag of Words (BoW)
- One-Hot Encoding
- TF-IDF (Term Frequency-Inverse Document Frequency)
- N-grams (different n-gram sizes)
- Word Embeddings (Word2Vec, custom trained) of different sizes and aggregation methods (append, mean, max, min, etc.)
- Vader Sentiment Analysis
- POS (Part of Speech) Tagging (using Spacy)

## Importing Libraries and Data

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from collections import defaultdict
from nltk import ngrams
import random
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy
import gensim
import logging
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE




In [3]:
df = pd.read_pickle('data/data_processed.pkl')
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,text,emotions
0,feel irrit kinda hate feel,anger
1,id rather home feel violent lone im not_tri so...,anger
2,suggest wait discuss feel less resent,anger
3,wrong feel royal piss,anger
4,im tierd talk like there hope hell care unders...,anger


In [4]:
# print the first text row

print(df['text'][0])

feel irrit kinda hate feel


# Plain Text

In [5]:
df.to_pickle('data/reps/1_plain_text.pkl')

## Bag of Words (BoW)

In [6]:
def model_bow(corpus, max_features = 1500):
    vectorizer = CountVectorizer(max_features = max_features)
    x = vectorizer.fit_transform(corpus).toarray()
    return x

In [7]:
# Apply the BOW model to the text column

x = model_bow(df['text'])

# Replace the text column with the new BOW representation
df_bow = df.copy()
df_bow['text'] = x.tolist()

print(f"Original text: {df['text'][0]}")
print(f"BOW representation: {df_bow['text'][0]}")
print(f"Length of the BOW representation: {len(df_bow['text'][0])}")
print (f"Number of none zeros: {np.count_nonzero(df_bow['text'][0])}")

df_bow.head()

Original text: feel irrit kinda hate feel
BOW representation: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Unnamed: 0,text,emotions
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger


In [8]:
df_bow.to_pickle('data/reps/2_bow.pkl') # Save the dataframe to file

## One-Hot Encoding

In [9]:
def model_one_hot(corpus, max_features = 1500):
    vectorizer_binary = CountVectorizer(binary=True, max_features = max_features)
    x = vectorizer_binary.fit_transform(corpus).toarray()    
    return x

In [10]:
# Apply the one-hot model to the text column

x = model_one_hot(df['text'])

# Replace the text column with the new one-hot representation
df_one_hot = df.copy()
df_one_hot['text'] = x.tolist()

print(f"Original text: {df['text'][0]}")
print(f"One-hot representation: {df_one_hot['text'][0]}")
print(f"Length of the one-hot representation: {len(df_one_hot['text'][0])}")
print(f"Number of ones: {np.count_nonzero(df_one_hot['text'][0])}")

df_one_hot.head()

Original text: feel irrit kinda hate feel
One-hot representation: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Unnamed: 0,text,emotions
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",anger


In [11]:
# Save the dataframe to file
df_one_hot.to_pickle('data/reps/3_one_hot.pkl')

## TF-IDF

In [1]:
def model_tf_idf(corpus):
    vectorizer_tfidf = TfidfVectorizer(max_features=6000)
    x = vectorizer_tfidf.fit_transform(corpus).toarray()
    return x

The cell below requires a lot of RAM and if not enough available, may crash the kernel. The max_features parameter can be reduced to avoid this.

In [12]:
# Apply the TF-IDF model to the text column

# """ 

x = model_tf_idf(df['text'])

# Replace the text column with the new TF-IDF representation
df_tf_idf = df.copy()
df_tf_idf['text'] = x.tolist()

print(f"Original text: {df['text'][0]}")
print(f"TF-IDF representation: {df_tf_idf['text'][0]}")
print(f"Length of the TF-IDF representation: {len(df_tf_idf['text'][0])}")

print(df_tf_idf.head())

df_tf_idf.to_pickle('data/reps/4_tf_idf.pkl')

# """

Original text: feel irrit kinda hate feel
TF-IDF representation: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

## N-grams

In [14]:
def model_ngram(corpus, ngram_range = (1,2)):
    vectorizer_bigram = CountVectorizer(ngram_range = ngram_range)
    x = vectorizer_bigram.fit_transform(corpus).toarray()
    return x

def train_ngram(n = 5, corpus = []):

    # Create a placeholder for the model
    n_model = defaultdict(lambda: defaultdict(lambda: 0))

    # Count the frequency of each ngram
    for sentence in corpus:
        for w in range(2, n+1):
            for ngram in ngrams(sentence, n, pad_right=True, pad_left=True, left_pad_symbol='<s>', right_pad_symbol='</s>'):
                n_model[ngram[:w-1]][ngram[w-1]] += 1
                
    # Let's transform the counts to probabilities
    for ngram in n_model:
        total_count = float(sum(n_model[ngram].values()))
        for w in n_model[ngram]:
            n_model[ngram][w] = n_model[ngram][w] / total_count

    return n_model

def ngram_guess_next(n_model, text):
    max(n_model[text], key=n_model[text].get)
                
def ngram_complete_sentence(n_model, text):
    while text[-1] != "</s>":
            
            # select a random probability threshold
            r = random.random()
            
            # select word above the probability threshold, conditioned to the previous word text[-1]
            accumulator = .0
            for word in n_model[(text[-3], text[-2], text[-1])]:
                accumulator += n_model[(text[-3], text[-2], text[-1])][word]
                if accumulator >= r:
                    text.append(word)
                    break
                
    print (' '.join([t for t in text if t]))

In [13]:
# TODO: Apply the n-gram model to the text column 

# existem maneiras built in de fazer ngrams como vimos nos notebooks das aulas, talvez fosse mais inteligente do que usar estas funcoes a la pata

## Word Embeddings (Custom)

In [3]:
df_og = pd.read_pickle("data/data_processed.pkl")

df_og.head()

Unnamed: 0,text,emotions
19132,feel irrit kinda hate feel,anger
51533,id rather home feel violent lone im not_tri so...,anger
44351,suggest wait discuss feel less resent,anger
51299,wrong feel royal piss,anger
55778,im tierd talk like there hope hell care unders...,anger


In [4]:
import gensim

# Apply gensim.utils.simple_preprocess(line) to each line in the text column
df_og['text'] = df_og['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

df_og.head()

Unnamed: 0,text,emotions
19132,"[feel, irrit, kinda, hate, feel]",anger
51533,"[id, rather, home, feel, violent, lone, im, no...",anger
44351,"[suggest, wait, discuss, feel, less, resent]",anger
51299,"[wrong, feel, royal, piss]",anger
55778,"[im, tierd, talk, like, there, hope, hell, car...",anger


In [5]:
documents = df_og['text'].tolist()

print(documents[0])

def train_embeddings(documents, vec_size=150):
    model = gensim.models.Word2Vec(documents, vector_size=vec_size, window=10, min_count=2, workers=10, sg=1)

    model.wv.save_word2vec_format('data/reps/custom_embedding.bin')
    
def load_embedding():
    wv = gensim.models.KeyedVectors.load_word2vec_format("data/reps/custom_embedding.bin")
    return wv

['feel', 'irrit', 'kinda', 'hate', 'feel']


Training a custom Word2Vec model using the data we have. For that, we will use the `gensim` library and we'll use the original data with fewer pre-processing steps.

In [7]:
train_embeddings(df_og)

In [8]:
wv = load_embedding()

In [9]:
# print the vocabulary of the model
print(wv.key_to_index)

{'t': 0, 'o': 1, 'e': 2}


## Word Embeddings (Word2Vec)



We can make use of pretrained word embeddings to represent our input text in a classification problem. Let's try it out with the embeddings we've trained in the word embeddings notebook, which have the advantage of having been trained on data that is similar to our classification task's data (reviews). You could try other embeddings (such as those available in [Gensim](https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html)).

In [None]:
# try using a embedding representation trained specifically for emotions and tweets:
#  FastText by Facebook Research
# Tweet2Vec



In [11]:
# train the custom embeddings using gensim
# train_embeddings(df_og_proc)

### Explore the custom embeddings

In [10]:
# Load the model

# wv = load_embedding()

In [None]:
wv.most_similar(positive=["polite"], topn=5)

In [None]:
wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1) 

In [None]:
wv.similarity(w1="dirty", w2="smelly")   

In [None]:
wv.doesnt_match(["cat", "dog", "france"]) 

In [None]:
"""
def visualize_embedding(wv, word_list):

    def reduce_dimensions(model, num_dimensions=2, words=[]):

        vectors = [] # positions in vector space
        labels = [] # keep track of words to label our data again later
        
        # if no word list is given, assume we want to use the whole data in the model
        if(words == []):
            words = model.index_to_key

        for word in words:
            vectors.append(model[word])
            labels.append(word)

        # convert both lists into numpy vectors for reduction
        vectors = np.asarray(vectors)
        labels = np.asarray(labels)

        # reduce using t-SNE
        tsne = TSNE(n_components=num_dimensions, random_state=0, perplexity=2)
        vectors = tsne.fit_transform(vectors)

        return vectors, labels

    def plot_with_matplotlib(x_vals, y_vals, labels, words=[]):

        random.seed(0)
        
        x_vals_new = np.array([])
        y_vals_new = np.array([])
        labels_new = np.array([])
        if(words == []):
            # if no word list is given, assume we want to plot the whole data
            x_vals_new = x_vals
            y_vals_new = y_vals
            labels_new = labels
        else:
            for i in range(len(labels)):
                if(labels[i] in words):
                    x_vals_new = np.append(x_vals_new,x_vals[i])
                    y_vals_new = np.append(y_vals_new,y_vals[i])
                    labels_new = np.append(labels_new,labels[i])
        
        plt.figure(figsize=(12, 12))
        plt.scatter(x_vals_new, y_vals_new)

        # apply labels
        for i in range(len(labels_new)):
            plt.annotate(labels_new[i], (x_vals_new[i], y_vals_new[i]))
        
        plt.show()

    vectors, labels = reduce_dimensions(wv, 2, word_list)
    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
            
    plot_with_matplotlib(x_vals, y_vals, labels, word_list)
""";

## Vader Sentiment Representation

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def model_vader(corpus):
    # VADER is a robust rule-based lexicon tool tuned to assess social media sentiment 
    # Returns a binary result for each phrase in the corpus where 1 is positive
    analyzer = SentimentIntensityAnalyzer()
    x = []
    for rev in corpus:
        x.append(1 if analyzer.polarity_scores(rev)['compound'] > 0 else 0)
    return x

# TODO

# POS Tagging Representation

In [1]:
# TODO

#  How to train with rep of POS --> temos de definir as nossas proprias features e ter cuidado com a quantidade de tags q podem dar mto sparse (não é uma prioridade fazer POS pq é mais complicado de lidar)