In [1]:
# Dependencies used by the cells below.
import re
import nltk
import pandas as pd
from tqdm.auto import tqdm
# Register tqdm with pandas so `progress_apply` is available on Series/DataFrames.
tqdm.pandas()
from nltk.corpus import stopwords
from collections import defaultdict
from spellchecker import SpellChecker
import emoji
import spacy
import textacy
# Load the small English spaCy pipeline with the "parser" and "ner" components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [2]:
# Read in data
import pickle

# NOTE(security): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/reviews_clean.pkl', 'rb') as f:
    reviews = pickle.load(f)

In [21]:
# Print the first review whose text contains "lb", then stop scanning.
for review in reviews['review_text']:
    if 'lb' not in review:
        continue
    print(review)
    break

I'm 5"5' and 125 lbs. i ordered the s petite to make sure the length wasn't too long. i typically wear an xs regular in retailer dresses. if you're less busty (34b cup or smaller), a s petite will fit you perfectly (snug, but not tight). i love that i could dress it up for a party, or down for work. i love that the tulle is longer then the fabric underneath.


### Rules for Text Cleaning:
- Shorthand abbreviations:
    + bc -> because
- Elongated words with repeated letters (e.g. "sooo"):
    + TODO: collapse the repeated letters with a regex?

In [23]:
# add special case rule
from spacy.attrs import ORTH, LEMMA, POS

# Treat the shorthand "bc" as a single token whose lemma is "because".
special_case = [{ORTH: u"bc", LEMMA: u"because", POS: u"CONJ"}]
nlp.tokenizer.add_special_case(u"bc", special_case)

# Extra suffix rules so that units glued to a number ("125lbs", "170cm") and a
# trailing double quote are split off as their own tokens.
# Each pattern is a separate entry because compile_suffix_regex appends "$" to
# every entry: a single 'lb|cm|\"' entry would anchor only the last
# alternative, letting "lb"/"cm" match anywhere inside a token and making the
# tokenizer strip the wrong trailing characters.
# The digit lookbehind keeps ordinary words that merely end in "lb"/"cm"
# (e.g. "bulb") intact.
unit_suffixes = [r'(?<=[0-9])lbs?', r'(?<=[0-9])cm', r'\"']
# list(...) works whether Defaults.suffixes is a tuple or a list.
suffixes = list(nlp.Defaults.suffixes) + unit_suffixes
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

In [26]:
# Sample review used to inspect what textacy's preprocessing produces.
text = """I'm 5"5' and 125 lbs. i ordered the s petite to make sure the length wasn't too long. i typically wear an xs regular in retailer dresses. if you're less busty (34b cup or smaller), a s petite will fit you perfectly (snug, but not tight). i love that i could dress it up for a party, or down for work. i love that the tulle is longer then the fabric underneath.
"""
textacy.preprocess.preprocess_text(
    text,
    False,
    lowercase=True,
    no_urls=True,
    no_emails=True,
    no_phone_numbers=True,
    no_numbers=True,
    no_currency_symbols=True,
    no_punct=True,
    no_contractions=False,
    no_accents=True,
)

'i am number number and number lbs i ordered the s petite to make sure the length wasn t too long i typically wear an xs regular in retailer dresses if you are less busty 34b cup or smaller a s petite will fit you perfectly snug but not tight i love that i could dress it up for a party or down for work i love that the tulle is longer then the fabric underneath'

In [27]:
def clean_text(text):
    """Normalize a single review string before vectorization.

    Steps, in order:
      1. Convert emoji to their text aliases with ``emoji.demojize``.
      2. Run ``textacy.preprocess.preprocess_text`` with lowercasing and the
         url / email / phone-number / number / punctuation / accent options
         enabled (currency symbols and contractions are left alone).
      3. Remove the standalone weight unit ``lb`` / ``lbs``.
      4. Remove every run of word characters that starts with a digit
         (e.g. ``34b``).

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned text. Removed tokens are deleted in place, so the result
        may contain consecutive spaces.
    """
    text = emoji.demojize(text)
    text = textacy.preprocess.preprocess_text(text, False,
                                              lowercase=True, no_urls=True,
                                              no_emails=True, no_phone_numbers=True,
                                              no_numbers=True, no_currency_symbols=False,
                                              no_punct=True, no_contractions=False,
                                              no_accents=True)
    # remove unit of measurement
    # Word boundaries keep "lb" inside ordinary words (e.g. "bulb") untouched.
    text = re.sub(r'\blbs?\b', '', text)
    # Raw string avoids the invalid "\w" escape in a normal string literal.
    text = re.sub(r'[0-9]{1,2}[\w]*', '', text)
    return text

In [28]:
# Clean every review (with a progress bar) and store the result in a new column.
cleaned_reviews = reviews['review_text'].progress_apply(clean_text)
reviews['clean_text'] = cleaned_reviews

HBox(children=(IntProgress(value=0, max=22641), HTML(value='')))




In [29]:
# Persist the reviews frame, including the new `clean_text` column.
reviews.to_pickle(path='data/reviews_processed.pkl')

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Analyzer built from a TfidfVectorizer with all-default settings.
analyzer = TfidfVectorizer().build_analyzer()
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = nltk.stem.WordNetLemmatizer()


def stemmed_words(doc):
    """Run `doc` through `analyzer` and return the lemmatized tokens as a list."""
    return [stemmer.lemmatize(token) for token in analyzer(doc)]

# vectorizer = TfidfVectorizer(ngram_range=(1,2), 
#                              stop_words='english',
#                              analyzer=stemmed_words,
#                              norm='l1')
# corpus = list(reviews['clean_text'])

# X = vectorizer.fit_transform(tqdm(corpus, desc='Calculating TF-iDF', total=len(reviews)))

In [10]:
# Vocabulary sizes recorded for the three normalizers that were compared:
#12613 unique lemmas
#9694 unique Porter stems
#8190 unique Lancaster stems
# NOTE(review): in the visible notebook, `X` is only assigned in a later cell
# (the earlier fit_transform is commented out), so this cell depends on
# out-of-order execution -- confirm before relying on Restart & Run All.
print(X.shape)

(22641, 9694)


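We will go with Porter stemming: its vocabulary is not much larger than Lancaster's, and Porter stems retain more of each word's meaning. (Note: the `stemmed_words` cell above currently uses `WordNetLemmatizer` rather than a Porter stemmer.)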

Only the top stems are considered (this note says 3000, but the next cell sets `max_features=1000` — confirm which is intended).

In [35]:
# `stemmed_words` is passed as `tokenizer`, not `analyzer`: when `analyzer` is a
# callable, scikit-learn ignores `ngram_range` and `stop_words`, so the bigram
# and stop-word settings below would silently have no effect.
# With `tokenizer=`, the default word analyzer filters stop words and builds the
# 1- and 2-grams from the tokens that `stemmed_words` returns.
vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             stop_words='english',
                             tokenizer=stemmed_words,
                             max_features=1000,
                             norm='l1')
corpus = list(reviews['clean_text'])

# Wrap the corpus in tqdm to get a progress bar while fitting.
X = vectorizer.fit_transform(tqdm(corpus, desc='Calculating TF-iDF', total=len(reviews)))

HBox(children=(IntProgress(value=0, description='Calculating TF-iDF', max=22641, style=ProgressStyle(descripti…




In [36]:
# Persist the TF-IDF matrix; the `with` block closes the file on exit,
# so no explicit close() is needed.
with open('data/term_matrix.pkl', 'wb') as f:
    pickle.dump(X, f)

In [37]:
# Persist the fitted vectorizer's `vocabulary_` mapping; the `with` block
# closes the file on exit, so no explicit close() is needed.
with open('data/vocab_key.pkl', 'wb') as f:
    pickle.dump(vectorizer.vocabulary_, f)