In [2]:
import re
import nltk
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from nltk.corpus import stopwords
from collections import defaultdict
import spacy
import textacy
# Load the small English spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Read in data
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('data/reviews_clean.pkl', 'rb') as f:
    reviews = pickle.load(f)

In [None]:
# Collect the text of every one-star review and preview the first one.
onestar = reviews.loc[reviews['star_rating'] == 1, 'review_text'].tolist()
print(onestar[0])

### Rules for Text Cleaning:
- Shorthand abbreviations:
    + bc -> because
- Elongated words (e.g. "sooo"):
    + collapse the repeated letters with a regex? (TODO: confirm approach)

In [3]:
# add special case rule
from spacy.attrs import ORTH, LEMMA, POS

# Tokenizer exception: keep "bc" as a single token whose lemma is "because".
# NOTE(review): setting LEMMA/POS inside a tokenizer special case is, as far as
# I know, only accepted by older spaCy releases — confirm against the pinned version.
special_case = [{ORTH: u"bc", LEMMA: u"because", POS: u"CONJ"}]
nlp.tokenizer.add_special_case(u"bc", special_case)

# Extend the default suffix patterns with "lb", "cm" and a double quote, then
# install the recompiled regex as the tokenizer's suffix matcher.
# This mutates the shared `nlp` object, so it must run before any text is parsed.
suffixes = nlp.Defaults.suffixes + (r'''lb|cm|\"''',)
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

In [None]:
# Sample review used to eyeball what textacy's preprocessing does to real text.
text = """I'm 5"5' and 125 lbs. i ordered the s petite to make sure the length wasn't too long. i typically wear an xs regular in retailer dresses. if you're less busty (34b cup or smaller), a s petite will fit you perfectly (snug, but not tight). i love that i could dress it up for a party, or down for work. i love that the tulle is longer then the fabric underneath.
"""
# The bare expression is left last so the notebook displays the cleaned string.
textacy.preprocess.preprocess_text(
    text,
    False,
    lowercase=True,
    no_urls=True,
    no_emails=True,
    no_phone_numbers=True,
    no_numbers=True,
    no_currency_symbols=True,
    no_punct=True,
    no_contractions=False,
    no_accents=True,
)

In [None]:
# Quick check of the repeated-letter regex: a run of the same lowercase letter
# followed by a space is reduced to a single letter plus the space.
test_str = re.compile(r"([a-z])\1{1,} ").sub(r"\1 ", "it is soooo pretty")
print(test_str)

In [None]:
def clean_text(text):
    """Normalise the body of a review before vectorisation.

    Steps, in order:
      1. drop weight units ("lb"/"lbs") and any token that starts with a digit;
      2. collapse word-final elongations ("soooo" -> "so");
      3. expand the abbreviations "imo" and "bc" and the slang "tad";
      4. map size codes such as "xs", "xxs", "xl", "xxl" to the word "size";
      5. run textacy's ``preprocess_text`` (lowercasing, URL/e-mail/phone/
         number/currency replacement, punctuation removal, contraction
         unpacking, accent stripping);
      6. remove the literal string "number" from the result.

    All word-level substitutions are anchored with ``\\b`` so they only fire on
    whole words; without the anchors "kimono", "elbow" or "stadium" would be
    corrupted.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        yields an empty string, matching ``clean_title``.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ''

    # remove unit of measurement (whole word only, so "elbow" is untouched)
    text = re.sub(r'\blbs?\b', '', text)
    text = re.sub(r'[0-9]{1,2}[\w]*', '', text)
    # elongated words: only runs of 3+ identical letters at the end of a word
    # are collapsed, so legitimate doubles ("dress", "too", "small") survive
    text = re.sub(r'([a-z])\1{2,}\b', r'\1', text)
    # abbreviations
    text = re.sub(r'\bimo\b', 'in my opinion', text)
    text = re.sub(r'\bbc\b', 'because', text)
    # slang
    text = re.sub(r'\btad\b', 'little', text)
    # size codes: xs, xxs, xl, xxl, ...
    text = re.sub(r'\bx{1,3}[sl]\b', 'size', text)

    text = textacy.preprocess.preprocess_text(text, False,
                                              lowercase=True, no_urls=True,
                                              no_emails=True, no_phone_numbers=True,
                                              no_numbers=True, no_currency_symbols=True,
                                              no_punct=True, no_contractions=True,
                                              no_accents=True)
    # presumably strips the placeholder textacy substitutes for numbers — TODO confirm
    text = re.sub('number', '', text)
    return text

In [None]:
# Clean every review body (progress bar via tqdm's pandas integration) and store the result in a new column.
reviews['clean_text'] = reviews['review_text'].progress_apply(clean_text)

In [None]:
def clean_title(text):
    """Normalise a review title before vectorisation.

    Collapses word-final elongations ("soooo" -> "so"), expands the
    abbreviations "imo" and "bc" and the slang "tad", then runs textacy's
    ``preprocess_text`` (lowercasing, URL/e-mail/phone/number/currency
    replacement, punctuation removal, contraction unpacking, accent stripping).

    Word-level substitutions are anchored with ``\\b`` so they only fire on
    whole words; without the anchors "kimono" or "stadium" would be corrupted.

    Parameters
    ----------
    text : str or any
        Raw title. Non-string values (e.g. NaN for a missing title) are
        treated as empty.

    Returns
    -------
    str
        The cleaned title, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # elongated words: only runs of 3+ identical letters at the end of a word
    # are collapsed, so legitimate doubles ("dress", "too", "small") survive
    text = re.sub(r'([a-z])\1{2,}\b', r'\1', text)
    # abbreviations
    text = re.sub(r'\bimo\b', 'in my opinion', text)
    text = re.sub(r'\bbc\b', 'because', text)
    # slang
    text = re.sub(r'\btad\b', 'little', text)
    text = textacy.preprocess.preprocess_text(text, False,
                                              lowercase=True, no_urls=True,
                                              no_emails=True, no_phone_numbers=True,
                                              no_numbers=True, no_currency_symbols=True,
                                              no_punct=True, no_contractions=True,
                                              no_accents=True)
    return text

# Clean every review title (progress bar via tqdm's pandas integration) and store the result in a new column.
reviews['clean_title'] = reviews['review_title'].progress_apply(clean_title)

In [None]:
# Cache the frame, including the cleaned columns, so later cells can reload it from disk.
reviews.to_pickle('data/reviews_processed.pkl')

In [None]:
# Reload the processed reviews written by the previous cell.
reviews = pd.read_pickle('data/reviews_processed.pkl')

### Review Text Term Matrix Generation

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Analyzer of a default-configured TfidfVectorizer, paired with a Porter stemmer.
analyzer = TfidfVectorizer().build_analyzer()
stemmer = nltk.PorterStemmer()


def stemmed_words(doc):
    """Return the Porter stem of every token the default analyzer yields for ``doc``."""
    return [stemmer.stem(token) for token in analyzer(doc)]

pos_interest = ['ADJ','NOUN', 'VERB', 'ADV']


def trim_features(doc):
    """Return Porter stems of the content words in ``doc``.

    A token is kept when it is not a stop word, is purely alphabetic and its
    coarse part-of-speech tag is listed in ``pos_interest``. Its surface text
    is then stemmed, and stems of two characters or fewer are discarded.
    """
    candidate_stems = (
        stemmer.stem(tok.text)
        for tok in nlp(doc)
        if not tok.is_stop and tok.is_alpha and tok.pos_ in pos_interest
    )
    return [stem for stem in candidate_stems if len(stem) > 2]

In [None]:
#12613 unique lemmas
#9694 unique Porter stems
#8190 unique Lancaster stems

We will go with Porter stemming: it yields only somewhat more unique stems than Lancaster (9,694 vs. 8,190) while retaining more of the words' meaning.

Only the top 3000 stems are considered.

In [1]:
# Read in data
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('data/reviews_processed.pkl', 'rb') as f:
    reviews = pickle.load(f)

In [13]:
# TF-IDF over unigrams and bigrams of the cleaned review text: English stop
# words removed, terms must occur in at least 5 documents, vocabulary capped at
# 2000 features, rows L1-normalised.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=2000,
    min_df=5,
    norm='l1',
)
corpus = reviews['clean_text'].tolist()

# Wrap the corpus in tqdm so fitting shows a progress bar.
X = vectorizer.fit_transform(
    tqdm(corpus, desc='Calculating TF-iDF', total=len(corpus))
)

HBox(children=(IntProgress(value=0, description='Calculating TF-iDF', max=22641, style=ProgressStyle(descripti…




In [14]:
# Persist the review-text TF-IDF matrix; the `with` block closes the file on exit.
with open('data/ngram_matrix.pkl', 'wb') as f:
    pickle.dump(X, f)

In [15]:
# Persist the fitted term -> column-index mapping; the `with` block closes the file on exit.
with open('data/ngram_vocab.pkl', 'wb') as f:
    pickle.dump(vectorizer.vocabulary_, f)

### Generate term matrix for title

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): these three definitions repeat the ones in the review-text
# section verbatim; they are redefined here, presumably so this section can be
# run on its own — confirm before de-duplicating.
analyzer = TfidfVectorizer().build_analyzer()
stemmer = nltk.PorterStemmer()
def stemmed_words(doc):
    """Return the Porter stem of every token the default analyzer yields for ``doc``."""
    return list(map(stemmer.stem, analyzer(doc)) )

# TF-IDF over Porter-stemmed title tokens, rows L1-normalised.
# NOTE(review): because `analyzer` is a callable, scikit-learn does not apply
# `ngram_range` or `stop_words` here; tokenisation is entirely whatever
# `stemmed_words` returns. If bigrams / stop-word removal are wanted for
# titles, they have to be implemented inside the analyzer callable.
vectorizer = TfidfVectorizer(ngram_range=(1,2),
                             stop_words='english',
                             analyzer=stemmed_words,
                             norm='l1')  # fixed: the closing quote was missing (syntax error)
corpus = list(reviews['clean_title'])

# Wrap the corpus in tqdm so fitting shows a progress bar.
X_title = vectorizer.fit_transform(tqdm(corpus, desc='Calculating TF-iDF', total=len(reviews)))

In [None]:
# Persist the title TF-IDF matrix and its term -> column-index mapping.
# Each `with` block closes its file on exit, so no explicit f.close() is needed.
with open('data/title_matrix.pkl', 'wb') as f:
    pickle.dump(X_title, f)

with open('data/title_key.pkl', 'wb') as f:
    pickle.dump(vectorizer.vocabulary_, f)

In [None]:
import matplotlib.pyplot as plt

# Box plot of the 'upvotes' column, grouped by 'star_rating' (one box per rating).
reviews.boxplot(column='upvotes', by='star_rating')
plt.show()