In [1]:
import numpy as np
import pandas as pd

# Processed review data with descriptive features; path is relative to this notebook.
file_path = "../../data/processed/hi_rws_0001_0256_descriptive.csv"
# Only the first 1000 rows are loaded for these experiments.
df = pd.read_csv(file_path, nrows=1000)

## Previous Preprocessing Steps

1. Sentence tokenization
2. Word tokenization
3. Remove punctuation
4. Remove stopwords (NLTK list)
5. Lowercase words
6. Lemmatize
7. Stemming

In [2]:
# Summarize the loaded sample: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
alias               1000 non-null object
ratingValue         1000 non-null int64
dataPublished       1000 non-null object
description         1000 non-null object
author              1000 non-null object
sentiment           1000 non-null int64
word_count          1000 non-null int64
sent_count          1000 non-null int64
chr_count           1000 non-null int64
avg_word_len        1000 non-null float64
avg_sent_len        1000 non-null float64
num_of_stopwords    1000 non-null int64
num_of_modals       1000 non-null int64
hashtags            1000 non-null int64
mentions            1000 non-null int64
numerics            1000 non-null int64
uppercase_cnt       1000 non-null int64
punctuation_cnt     1000 non-null int64
vocab_cnt           1000 non-null int64
ratio_lexical       1000 non-null float64
ratio_content       1000 non-null float64
dtypes: float64(4), int64(13), object(4)
m

In [3]:
import spacy
# Load the large English model with the named-entity recognizer disabled.
nlp = spacy.load("en_core_web_lg", disable=['ner'])
print(spacy.__version__)
# NOTE(review): only the last expression of a cell is displayed, so the value of
# nlp.pipe_names is discarded here; wrap it in print() if it should be shown.
nlp.pipe_names
nlp

2.0.18


<spacy.lang.en.English at 0x11775cf98>

## Testing with a Text

In [4]:
# Parse a single review (row 5) to explore the spaCy Doc API.
text = nlp(df.description[5])
# First sentence of the parsed review.
sent = [i for i in text.sents][0]
# List the attributes available on the parsed document.
print(dir(text))
# Cell result: the type of the first token's lemma.
type(text[0].lemma_)

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_bulk_merge', '_py_tokens', '_realloc', '_vector', '_vector_norm', 'cats', 'char_span', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_disk', 'get_extension', 'get_lca_matrix', 'has_extension', 'has_vector', 'is_parsed', 'is_sentenced', 'is_tagged', 'mem', 'merge', 'noun_chunks', 'noun_chunks_iterator', 'print_tree', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_extension', 'similarity', 'tensor', 'text', 'text_with_ws', 'to_array', 'to_bytes', 'to_disk', 'user_data', 'user_hooks', 'user_span_hooks', 'user_token_hooks', 'vector', 'vector_

str

In [None]:
# Solution one (not preferred): for every sentence, scan all tokens of the
# document and append the cleaned sentence for each token found in that sentence.
# NOTE(review): the same cleaned sentence is appended once per matching token,
# and clean_up() re-parses the sentence text with nlp on every call.
# NOTE(review): clean_up is defined in a later cell and must be run first.
texts = []
for sent_token in text.sents:

    for token in text:
        if token in sent_token:
            # do cleaning in here
            texts.append(clean_up(sent_token.text))
# Cell result: each cleaned sentence as a tuple.
[tuple(i) for i in texts]

In [None]:
# Solution two (acceptable): walk the tokens once and group the cleaned tokens
# into one list per sentence.
# NOTE(review): token_clean_up is defined in a later cell and must be run first.
# NOTE(review): a sentence change is detected by comparing sentence *text*, so two
# adjacent sentences with identical text are grouped into a single list.
# NOTE(review): `sent` was bound to a sentence Span in an earlier cell; it is
# rebound to a plain list here.
texts = []
sent_current = ""
for token in text:
    # Does this token belong to the sentence currently being collected?
    if sent_current == token.sent.text:
        # Same sentence: keep the token if it survives the clean-up filter.
        token_clean = token_clean_up(token)
        if token_clean is not None:
            sent.append(token_clean)
            # Debug output: prints the growing sentence list after every kept token.
            print(sent)
    else:
        # A new sentence starts: store the finished one, unless this is the very first token.
        if sent_current != "":
            texts.append(sent)
        # Remember the text of the sentence now being collected.
        sent_current = token.sent.text
        # Start a fresh list for the new sentence and process its first token.
        sent = []
        token_clean = token_clean_up(token)
        if token_clean is not None:
            sent.append(token_clean)
# Store the last sentence, which the loop never flushes.
texts.append(sent)
# Compare the number of spaCy sentences with the number of collected lists.
print(len(list(text.sents)), len(texts))
# [print(i) for i in text.sents]
# [tuple(i) for i in texts]
# [print(i, '\n\n', y) for i,y in zip(text.sents, texts)]

In [None]:
# Collect the noun chunks of `text` whose root token passes the token filter.
# Noun chunks are Span objects, which do not expose the token-level attributes
# is_stop / is_alpha / pos_, so the filter is applied to the chunk's root token.
# The exclusion list is defined here because the functions that use the same
# list only define it as a local variable.
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE']
noun_chunk_list = []
for chunk in text.noun_chunks:
    root = chunk.root
    # Keep the whole chunk when its root is a non-stopword alphabetic token
    # longer than three characters with an allowed part of speech.
    if not root.is_stop and root.is_alpha and len(root) > 3 and root.pos_ not in removal:
        noun_chunk_list.append(chunk)

In [None]:
# Verbatim text of the first ten tokens of the parsed review.
[tok.orth_ for tok in text[:10]]

## Testing in Review DataFrame

In [26]:
def token_clean_up(token):
    """Filter a single token and return its lemma.

    Returns ``token.lemma_`` when the token is not a stopword, is alphabetic,
    is longer than three characters and its part of speech is not excluded;
    returns None otherwise.
    """
    excluded_pos = ('ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE')

    # Reject stopwords and tokens containing non-alphabetic characters.
    if token.is_stop or not token.is_alpha:
        return None
    # Reject short tokens (three characters or fewer).
    if len(token) <= 3:
        return None
    # Reject excluded parts of speech.
    if token.pos_ in excluded_pos:
        return None
    return token.lemma_

In [22]:
def clean_up(text):
    """Clean a whole document and return the lemmas of the tokens that are kept.

    The text is parsed with the notebook-level ``nlp`` pipeline. A token is kept
    when it is not a stopword, is alphabetic, is longer than three characters
    and its part of speech is not in the exclusion list.
    """
    excluded_pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE']
    return [
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop
        and tok.is_alpha
        and len(tok) > 3
        and tok.pos_ not in excluded_pos
    ]

In [27]:
def clean_up2(text, clean_up=False):
    """Split a document into sentences and return one list per sentence.

    Parameters
    ----------
    text : str
        Raw document text; it is parsed with the notebook-level ``nlp`` pipeline.
    clean_up : bool, default False
        If True, every token is passed through ``token_clean_up`` and only the
        non-None results are kept. If False, the tokens are kept as they are.
        Note that inside this function the parameter shadows the ``clean_up``
        function defined in this notebook.

    Returns
    -------
    list of list
        One inner list per sentence, in document order. Sentences that end up
        with no items are omitted, so an empty document yields ``[]``.
    """
    doc = nlp(text)

    texts = []
    # Iterate over the sentence spans directly: this keeps adjacent sentences
    # with identical text apart and avoids looking up token.sent for every token.
    for sentence in doc.sents:
        if clean_up:
            cleaned = (token_clean_up(token) for token in sentence)
            sent = [item for item in cleaned if item is not None]
        else:
            sent = list(sentence)
        # Drop sentences with nothing left, including the last one.
        if sent:
            texts.append(sent)

    return texts

In [28]:
# Clean every review sentence by sentence (clean_up=True keeps only the filtered lemmas).
df_test = df.description.apply(lambda x: clean_up2(x, True))
# Spot check: type of the fifth kept item in the first sentence of the first review.
type(df_test[0][0][4])

str

In [29]:
# Number of reviews vs. total number of per-sentence lists across all reviews.
print(len(df.description), sum(len(i) for i in df_test))

1000 8088


In [26]:
# testing with gensim
from itertools import chain
from gensim.corpora import Dictionary

# Flatten the per-review lists of sentences into one stream of token lists
# and build a gensim Dictionary from it.
dictionary = Dictionary(chain.from_iterable(df_test))
# Cell result: number of unique tokens in the dictionary.
len(dictionary)

3210

### Testing New Feature Creation

In [48]:
# NOTE(review): these attribute IDs are referenced only by the commented-out line below.
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
# Parse the first five reviews.
texts = df.description[:5].apply(lambda x: nlp(x))
for doc in texts:
    # Print the full review, then each noun chunk followed by its root token.
    print(doc)
    for chunk in doc.noun_chunks:
        # print(chunk.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]))
        print(chunk)
        print(chunk.root)

I stumbled across this great restaurant overlooking the ocean for lunch during my vacation to Maui. I did not have high expectations for this place, but boy did it blow me out of the water. 

The fish and chips is some of the best I've ever had (and I've had lots, including from London). I highly recommend it. Also, the turkey bacon sandwich was SO good. 

In terms of drinks, I highly recommend the Pacific Paradise drink! So delicious and tropical! I also really enjoyed the Lahaina Lemonade. 

Service was really great! I wish I remembered the waitresses name because she was truly awesome and recommend the best stuff. She was blonde and had cute sunglasses.
I
I
this great restaurant
restaurant
the ocean
ocean
lunch
lunch
my vacation
vacation
Maui
Maui
I
I
high expectations
expectations
this place
place
it
it
me
me
the water
water
The fish
fish
chips
chips
I
I
I
I
lots
lots
London
London
I
I
it
it
the turkey bacon sandwich
sandwich
terms
terms
drinks
drinks
I
I
I
I
the Lahaina Lemonade. 

In [35]:
def noun_notnoun(phrase):
    """Pair every kept non-noun lemma of a phrase with the phrase's noun.

    The phrase is parsed with the notebook-level ``nlp`` pipeline. Tokens whose
    part of speech is not NOUN are passed through ``token_clean_up`` and the
    non-None results are collected. Each collected lemma is then joined with
    the text of the LAST NOUN token in the phrase, so the function is only
    meaningful for short phrases containing a single noun; on longer text every
    lemma is paired with the final noun.

    Parameters
    ----------
    phrase : str
        Text to analyse.

    Returns
    -------
    list of str
        Strings of the form ``"<lemma> <noun>"`` in token order. An empty list
        is returned when the phrase contains no NOUN token.
    """
    doc = nlp(phrase)  # create spacy object
    token_not_noun = []
    noun = None  # text of the most recent NOUN token seen so far

    for item in doc:
        if item.pos_ == "NOUN":
            noun = item.text
        else:
            token_clean = token_clean_up(item)
            if token_clean is not None:
                token_not_noun.append(token_clean)

    # Without a noun there is nothing to pair with; previously this case raised
    # UnboundLocalError.
    if noun is None:
        return []

    return [notnoun + " " + noun for notnoun in token_not_noun]

In [36]:
# Show one review next to the pairs produced for it.
# NOTE(review): the whole review is passed in, so every kept lemma is paired with
# the last noun of the review rather than with the noun it modifies.
print(df.description[1])
print(noun_notnoun(df.description[1]))

Excellent view on the ocean at sunset.
Excellent food. We had the fresh fish : coconut for me and the yuzu for my husband. We loved it!
Waitress are super nice.
['excellent husband', 'excellent husband', 'fresh husband', 'love husband', 'waitress husband', 'nice husband']
