## Imports

In [1]:
import os
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

# import GloVe
from torchtext.vocab import GloVe
global_vectors = GloVe(name='840B', dim=300)

#tokenizer
import torchtext
from torchtext.data import get_tokenizer

# stop word removal
nltk.download('stopwords')
from nltk.corpus import stopwords

# lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nunomachado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nunomachado/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data pre-processing

In [2]:
# Training labels arrive as a single fused tag (e.g. TRUTHFULPOSITIVE): the
# last 8 characters hold the sentiment, everything before them the realness.
# The fused column is dropped once it has been split.
df = (
    pd.read_csv('train.txt', delimiter='\t', header=None, names=["Label", "Review"])
    .assign(
        Realness=lambda frame: frame['Label'].str[:-8],
        Sentiment=lambda frame: frame['Label'].str[-8:],
    )
    .drop(columns='Label')
)
df_test = pd.read_csv('test_just_reviews.txt', delimiter='\t', header=None, names=["Review"])

# The review text is left untouched here; tokenizing happens inside the
# pre-processing functions defined in this cell.
data = df.to_numpy()

# Column slices of `data`: review text, realness label, sentiment label.
X, yr, ys = data[:, 0], data[:, 1], data[:, 2]

# Test reviews (single column).
Xt = df_test.to_numpy()[:, 0]

print(df)

def pre_process_x(X, max_words=864):
    """Tokenize reviews, pad/truncate them to a fixed length and embed them.

    Parameters
    ----------
    X : iterable of str
        Raw review texts.
    max_words : int or None, optional
        Number of tokens kept per review. Shorter reviews are right-padded
        with empty-string tokens and longer ones are truncated. The default
        (864) is the value that used to be hard-coded, so existing calls
        keep producing the same sequence length for every data split.
        Pass ``None`` to use the length of the longest review in ``X``.

    Returns
    -------
    numpy.ndarray
        One ``global_vectors.get_vecs_by_tokens`` result per review, stacked
        along the first axis, i.e. ``(len(X), max_words, embedding_dim)`` for
        non-empty input.

    Raises
    ------
    ValueError
        If ``max_words`` is negative.
    """
    # Tokenizing
    tokenizer = get_tokenizer("basic_english")
    tokenized = [tokenizer(review) for review in X]

    # Only measure the data when the caller explicitly asks for it; otherwise
    # the requested fixed length wins.
    if max_words is None:
        max_words = max((len(tokens) for tokens in tokenized), default=0)
    elif max_words < 0:
        raise ValueError(f"max_words must be non-negative, got {max_words}")

    print(f"max_words-->{max_words}")

    # Padding / truncation: slicing caps long reviews, and multiplying a list
    # by a non-positive count yields [], so one expression covers both cases.
    padded = [
        tokens[:max_words] + [""] * (max_words - len(tokens))
        for tokens in tokenized
    ]

    # Embeddings: one lookup per review; the vector assigned to the ""
    # padding token is whatever `global_vectors` returns for it.
    X_emb = np.array(
        [np.array(global_vectors.get_vecs_by_tokens(tokens)) for tokens in padded]
    )
    print(f"X_emb.shape-->{X_emb.shape}")

    return X_emb


def pp_x_w_lemma_and_stop_word(X, max_words=359):
    """Tokenize, drop stop words/punctuation, lemmatize, pad and embed reviews.

    Parameters
    ----------
    X : iterable of str
        Raw review texts.
    max_words : int or None, optional
        Number of tokens kept per review after filtering. Shorter reviews are
        right-padded with empty-string tokens and longer ones are truncated.
        The default (359) is the value that used to be hard-coded. Pass
        ``None`` to use the length of the longest filtered review in ``X``.

    Returns
    -------
    numpy.ndarray
        One ``global_vectors.get_vecs_by_tokens`` result per review, stacked
        along the first axis, i.e. ``(len(X), max_words, embedding_dim)`` for
        non-empty input.

    Raises
    ------
    ValueError
        If ``max_words`` is negative.
    """
    # Tokenizing
    tokenizer = get_tokenizer("basic_english")
    tokenized = [tokenizer(review) for review in X]

    # The progress prints below peek at the first review, which only exists
    # for non-empty input.
    has_reviews = bool(tokenized)

    if has_reviews:
        print(f"after tokenization: X[0]-->{tokenized[0]}")

    # Lemmatization
    wnl = WordNetLemmatizer()

    # Stop word removal (and punctuation)
    stop_words = set(stopwords.words('english'))

    # Apply both in a single pass. `in string.punctuation` is a substring test
    # on a str, so it drops any token that occurs inside that string (single
    # punctuation marks in practice).
    X_stop = [
        [
            wnl.lemmatize(tok)
            for tok in tokens
            if tok not in stop_words and tok not in string.punctuation
        ]
        for tokens in tokenized
    ]

    if has_reviews:
        print(f"after stop word and lemma: X_stop[0]-->{X_stop[0]}")

    # Only measure the data when the caller explicitly asks for it.
    if max_words is None:
        max_words = max((len(tokens) for tokens in X_stop), default=0)
    elif max_words < 0:
        raise ValueError(f"max_words must be non-negative, got {max_words}")

    print(f"max_words-->{max_words}")

    if has_reviews:
        print(f"type(X_stop[0])-->{type(X_stop[0])}")

    # Padding / truncation: slicing caps long reviews, and multiplying a list
    # by a non-positive count yields [], so one expression covers both cases.
    X_stop = [
        tokens[:max_words] + [""] * (max_words - len(tokens))
        for tokens in X_stop
    ]

    if has_reviews:
        print(f"after padding: X_stop[0]-->{X_stop[0]}")

    # Embeddings: one lookup per review.
    X_emb = np.array(
        [np.array(global_vectors.get_vecs_by_tokens(tokens)) for tokens in X_stop]
    )

    if has_reviews:
        print(f"after embedding: X_emb[0]-->{X_emb[0]}")
    print(f"X_emb.shape-->{X_emb.shape}")

    return X_emb

                                                 Review   Realness Sentiment
0     The sheraton was a wonderful hotel! When me an...   TRUTHFUL  POSITIVE
1     We stayed at the Omni between Christmas and Ne...   TRUTHFUL  POSITIVE
2     I was REALLY looking forward to a nice relaxin...  DECEPTIVE  NEGATIVE
3     First let me say, I try not to be too critical...   TRUTHFUL  NEGATIVE
4     The Ambassador East Hotel is a terrible place ...  DECEPTIVE  NEGATIVE
...                                                 ...        ...       ...
1395  I stayed here for 5 nights last summer. I book...   TRUTHFUL  NEGATIVE
1396  Stayed here for 3 nights for a Bridgestone/Fir...   TRUTHFUL  POSITIVE
1397  I am staying here now and actually am compelle...   TRUTHFUL  NEGATIVE
1398  We stayed at this hotel with our two teenage d...   TRUTHFUL  NEGATIVE
1399  The rooms were beautiful! The staff was friend...  DECEPTIVE  POSITIVE

[1400 rows x 3 columns]


In [3]:
# Stop-word-free, lemmatized embeddings: train reviews first, then test reviews.
X_pp, Xt_pp = (pp_x_w_lemma_and_stop_word(reviews) for reviews in (X, Xt))

after tokenization: X[0]-->['the', 'sheraton', 'was', 'a', 'wonderful', 'hotel', '!', 'when', 'me', 'and', 'my', 'mom', 'flew', 'in', 'we', 'were', 'really', 'tired', 'so', 'we', 'decided', 'to', 'take', 'a', 'quick', 'nap', '.', 'we', 'didnt', 'want', 'to', 'get', 'up', '!', 'the', 'beds', 'are', 'absolutely', 'to', 'die', 'for', '.', 'i', 'wanted', 'to', 'take', 'it', 'home', 'with', 'me', '.', 'the', 'service', 'was', 'great', 'and', 'this', 'was', 'probably', 'one', 'of', 'the', 'biggest', 'if', 'not', 'the', 'biggest', 'hotel', 'ive', 'ever', 'stayed', 'in', '.', 'they', 'had', 'a', 'really', 'nice', 'restaurant', 'inside', 'with', 'excellent', 'food', '.']
after stop word and lemma: X_stop[0]-->['sheraton', 'wonderful', 'hotel', 'mom', 'flew', 'really', 'tired', 'decided', 'take', 'quick', 'nap', 'didnt', 'want', 'get', 'bed', 'absolutely', 'die', 'wanted', 'take', 'home', 'service', 'great', 'probably', 'one', 'biggest', 'biggest', 'hotel', 'ive', 'ever', 'stayed', 'really', 'ni

In [4]:
# Embeddings of the tokenized, padded reviews (no stop-word removal).
X_emb = pre_process_x(X)
Xt_emb = pre_process_x(Xt)

######### y's pre processing
# Encode the labels as integers (POSITIVE -> 1, TRUTHFUL -> 1, anything
# else -> 0), reading from the DataFrame columns instead of rewriting the
# `ys`/`yr` arrays element by element. Those arrays are slices of `data`, and
# overwriting them in place made the cell non-repeatable: on a second run the
# stored values were already 1/0, matched neither string, and every label
# became 0. Reading from `df` gives the same result on every run and yields
# int64 arrays rather than object arrays.
ys = (df["Sentiment"] == "POSITIVE").astype(np.int64).to_numpy()
yr = (df["Realness"] == "TRUTHFUL").astype(np.int64).to_numpy()

max_words-->864
X_emb.shape-->(1400, 864, 300)
max_words-->864
X_emb.shape-->(200, 864, 300)


In [5]:
# Sanity check: container types of the processed arrays, then a blank line
# (from the trailing "\n" on the last entry) ...
print(
    f"type(X_emb)-->{type(X_emb)}",
    f"type(Xt_emb)-->{type(Xt_emb)}",
    f"type(ys)-->{type(ys)}",
    f"type(yr)-->{type(yr)}\n",
    sep="\n",
)

# ... followed by their shapes, one per line.
print(
    f"X_emb.shape-->{X_emb.shape}",
    f"Xt_emb.shape-->{Xt_emb.shape}",
    f"ys.shape-->{ys.shape}",
    f"yr.shape-->{yr.shape}",
    sep="\n",
)

type(X_emb)--><class 'numpy.ndarray'>
type(Xt_emb)--><class 'numpy.ndarray'>
type(ys)--><class 'numpy.ndarray'>
type(yr)--><class 'numpy.ndarray'>

X_emb.shape-->(1400, 864, 300)
Xt_emb.shape-->(200, 864, 300)
ys.shape-->(1400,)
yr.shape-->(1400,)


In [6]:
# Disabled exploration of the per-review token-count distribution. The code is
# wrapped in a string literal so it never executes; because that literal is the
# cell's last expression, Jupyter echoes it back as the cell output.
# NOTE(review): `tam` is only assigned inside the pre-processing functions, so
# it would not be defined at this level if the code were re-enabled as is.
# NOTE(review): the "tam_median" line calls np.average, not np.median.
'''
print(f"tam_min-->{np.min(tam)}")
print(f"tam_MAX-->{np.max(tam)}")
print(f"tam_avg-->{np.average(tam)}")
print(f"tam_median-->{np.average(tam)}")

# distribuição do numero de palavras por review
plt.hist(tam)
plt.show()
'''

'\nprint(f"tam_min-->{np.min(tam)}")\nprint(f"tam_MAX-->{np.max(tam)}")\nprint(f"tam_avg-->{np.average(tam)}")\nprint(f"tam_median-->{np.average(tam)}")\n\n# distribuição do numero de palavras por review\nplt.hist(tam)\nplt.show()\n'

In [7]:
# delete cache
del(global_vectors)
#save processed numpy arrays
np.save("./processed_train/X_emb.npy", X_emb)
np.save("./processed_train/Xt_emb.npy", Xt_emb)
np.save("./processed_train/ys.npy", ys)
np.save("./processed_train/yr.npy", yr)

## more pre processed, lemmazation and stop word removal

np.save("./processed_train/X_pp.npy", X_pp)
np.save("./processed_train/Xt_pp.npy", Xt_pp)