In [286]:
import re
import nltk
import pandas as pd
import numpy as np
import sklearn
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from spellchecker import SpellChecker
from kedro.pipeline.decorators import log_time
from matplotlib import pyplot as plt

In [2]:
# Load the `reviews_master` dataset.
# NOTE(review): `io` is never imported or assigned in this notebook; judging by the
# kedro.io.data_catalog log line below, it is presumably the data catalog injected by
# the Kedro Jupyter session -- confirm, since the cell fails on a plain kernel otherwise.
reviews = io.load('reviews_master')

2019-06-18 09:45:45,829 - kedro.io.data_catalog - INFO - Loading data from `reviews_master` (PickleLocalDataSet)...


In [318]:
# Strip punctuation

def _remove_punc(text):
    return re.sub(r'[^\w\s]','', text)

# Remove repeated letters 

def _remove_repeats(lst):
    pattern = re.compile(r"(.)\1{2,}")
    return [pattern.sub(r"\1", w) for w in lst]

# Fix spelling

# Shared spell checker instance (constructed once, reused for every call).
spell = SpellChecker()

def _fix_spelling(lst):
    """Return the list of words with each one replaced by its spelling correction.

    `spell.correction` may return ``None`` when it has no candidate for a word
    (current pyspellchecker releases do this); in that case the original word
    is kept so that no ``None`` token leaks into the downstream steps.
    """
    corrected = []
    for word in lst:
        suggestion = spell.correction(word)
        corrected.append(word if suggestion is None else suggestion)
    return corrected

# Stem words

# Shared Porter stemmer instance.
porter = nltk.PorterStemmer()

def _stem(lst):
    """Return a new list holding the Porter stem of every word in `lst`."""
    stems = []
    for word in lst:
        stems.append(porter.stem(word))
    return stems

# Lemmatisation

# Shared WordNet lemmatizer instance used by _lemmatize below.
wnl = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with `nltk.pos_tag`, and the first letter of
    the resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.

    Source: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    by_first_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if first_letter in by_first_letter:
        return by_first_letter[first_letter]
    return wordnet.NOUN

def _lemmatize(lst):
    """Return a new list with every word in `lst` lemmatized.

    Each word is lemmatized with the part of speech that `get_wordnet_pos`
    reports for it.
    """
    lemmas = []
    for word in lst:
        pos = get_wordnet_pos(word)
        lemmas.append(wnl.lemmatize(word, pos))
    return lemmas

# Remove stop words

# English stop words with punctuation stripped, so they match tokens that have
# already been through _remove_punc (e.g. "don't" is stored as "dont").
# Kept as a set: _remove_stops tests membership once per token, and a list
# would turn each of those tests into a linear scan.
stop_words = {_remove_punc(x) for x in stopwords.words('english')}

def _remove_stops(lst):
    """Return the words of `lst` that are not in `stop_words`, in their original order."""
    return [x for x in lst if x not in stop_words]

# Count words 

def _count_words(lst):
    word_freq = defaultdict(int)
    for w in lst:
        word_freq[w] += 1
    return dict(word_freq)

def create_bag(text):
    """Turn a raw review string into a ``{stem: count}`` bag of words.

    Steps: strip punctuation, lowercase, tokenize with NLTK, Porter-stem every
    token, then count. Stop words are NOT removed in this variant, and no
    emoticon extraction, spelling correction or lemmatisation is performed.

    NOTE: the next cell defines `create_bag` again; once that cell has run,
    the later definition is the one in effect.
    """
    cleaned = _remove_punc(text).lower()
    tokens = nltk.word_tokenize(cleaned)
    stems = _stem(tokens)
    return _count_words(stems)

In [319]:
def create_bag(text, remove_repeats=False, fix_spelling=False,
               remove_stops=True, lemmatize=False, stem=False):
    """Turn a raw review string into a ``{token: count}`` bag of words.

    The text always has punctuation stripped, is lowercased and is tokenized
    with NLTK. The remaining steps are optional and are applied in this order;
    the defaults reproduce the previous hard-coded behaviour (stop-word removal
    only), so ``corpus.apply(create_bag)`` is unchanged.

    Parameters
    ----------
    text : str
        Raw review text.
    remove_repeats : bool
        Collapse runs of 3+ identical characters via `_remove_repeats`
        (presumably what the formerly commented-out `_remove_extensions`
        step referred to -- that name is not defined in this notebook).
    fix_spelling : bool
        Spell-correct each token via `_fix_spelling`.
    remove_stops : bool
        Drop English stop words via `_remove_stops`.
    lemmatize : bool
        Lemmatize each token via `_lemmatize`.
    stem : bool
        Porter-stem each token via `_stem`.

    Returns
    -------
    dict
        Mapping of token to its number of occurrences in `text`.
    """
    text = _remove_punc(text)
    text = text.lower()
    words = nltk.word_tokenize(text)
    if remove_repeats:
        words = _remove_repeats(words)
    if fix_spelling:
        words = _fix_spelling(words)
    if remove_stops:
        words = _remove_stops(words)
    if lemmatize:
        words = _lemmatize(words)
    if stem:
        words = _stem(words)
    return _count_words(words)

In [320]:
# Build one bag-of-words dict per review from the `review_text` column;
# `bags` is a Series of {token: count} dicts and is reused by the cells below.
corpus = reviews['review_text']
bags = corpus.apply(create_bag)
bags

id
0        {'absolutely': 1, 'wonderful': 1, 'silky': 1, ...
1        {'love': 2, 'dress': 1, 'sooo': 1, 'pretty': 1...
2        {'high': 1, 'hopes': 1, 'dress': 1, 'really': ...
3        {'love': 3, 'jumpsuit': 1, 'fun': 1, 'flirty':...
4        {'shirt': 2, 'flattering': 1, 'due': 1, 'adjus...
5        {'love': 2, 'tracy': 1, 'reese': 1, 'dresses':...
6        {'aded': 1, 'basket': 1, 'hte': 4, 'last': 1, ...
7        {'ordered': 1, 'carbon': 1, 'store': 1, 'pick'...
8        {'love': 1, 'dress': 1, 'usually': 1, 'get': 1...
9        {'im': 1, '55': 1, '125': 1, 'lbs': 1, 'ordere...
10       {'dress': 2, 'runs': 2, 'small': 1, 'esp': 1, ...
11       {'dress': 1, 'perfection': 1, 'pretty': 1, 'fl...
12       {'find': 1, 'reliant': 1, 'reviews': 1, 'writt...
13       {'bought': 1, 'black': 1, 'xs': 2, 'go': 1, 'l...
14       {'nice': 1, 'choice': 1, 'holiday': 1, 'gather...
15       {'took': 1, 'package': 1, 'wanted': 1, 'fit': ...
16       {'material': 1, 'color': 1, 'nice': 1, 'leg'

In [196]:
import math
def _intersect_terms(doc1, doc2):
    return  list(set(list(doc1.keys()) + list(doc2.keys())))

def term_vector(doc, terms = []):
    local_terms = list(doc.keys())
    global_terms = list(set(local_terms + terms))
    return [doc[term] if term in local_terms else 0 for term in global_terms]
   
def cosine_sim(doc1, doc2):
    terms = _intersect_terms(doc1, doc2)
    X = [term_vector(doc1, terms), term_vector(doc2, terms)]
    X = sklearn.preprocessing.normalize(X) # normalize the rows of X
    return np.dot(X[0], X[1])

In [139]:
# Sanity check: cosine similarity between the bags stored under index labels 0 and 1
# (`.loc` selects by label, not by position).
doc1 = bags.loc[0]
doc2 = bags.loc[1]
cosine_sim(doc1, doc2)

0.34099716973523675

In [271]:
def get_document_frequency(docs, cutoff = 0):
    """Count, for every term, the number of documents it appears in.

    Parameters
    ----------
    docs : iterable of dict
        One ``{token: freq}`` bag per document (e.g. the bags built with
        `create_bag`). Only the keys are used; the per-document counts are
        ignored.
    cutoff : int
        Terms are kept only if they appear in strictly more than `cutoff`
        documents.

    Returns
    -------
    pandas.Series
        Document frequency indexed by term.
    """
    doc_counts = defaultdict(int)
    for bag in docs:
        for token in bag:
            doc_counts[token] += 1

    kept = {}
    for token, n_docs in doc_counts.items():
        if n_docs > cutoff:
            kept[token] = n_docs

    return pd.Series(kept)

In [294]:
# Document frequency of every term across all bags; `doc_freqs` is read as a
# global by _idf and get_avg_tfidf. The second line only displays it, most frequent first.
doc_freqs = get_document_frequency(bags)
doc_freqs.sort_values(ascending = False)

love              7402
size              6539
fit               6139
dress             6039
like              5725
wear              5520
top               5264
great             5175
im                4897
would             4290
fabric            4119
color             4018
small             3648
look              3502
ordered           3455
really            3380
perfect           3347
flattering        3312
little            3283
soft              3151
one               3071
comfortable       2926
well              2926
back              2811
cute              2784
beautiful         2750
fits              2690
nice              2676
bought            2669
looks             2588
                  ... 
bummers              1
verticallyand        1
resent               1
growths              1
jodphurs             1
rackerback           1
382738               1
5ft8inches           1
softi                1
sunning              1
2100                 1
2999                 1
incidents  

In [280]:
def _idf(term, n_docs=None):
    """Return the inverse document frequency ``log2(n_docs / df(term))``.

    Parameters
    ----------
    term : str
        Term to look up in the global `doc_freqs` series.
    n_docs : int, optional
        Corpus size. Defaults to ``len(bags)``, the collection `doc_freqs`
        was computed from. (The original code read a global `N_docs` that is
        never assigned anywhere in this notebook, so the cell only worked
        with leftover kernel state.)

    Raises
    ------
    KeyError
        If `term` is not present in `doc_freqs`.
    """
    if n_docs is None:
        n_docs = len(bags)
    return math.log2(n_docs / doc_freqs[term])

def tfidf(dct):
    """Return ``{term: log2(1 + count) * idf(term)}`` for one bag of words.

    `dct` is a ``{term: count}`` bag; every term must be present in the global
    `doc_freqs`.
    """
    n_docs = len(bags)  # corpus size, looked up once per document
    return {term: math.log2(1 + freq) * _idf(term, n_docs) for term, freq in dct.items()}

In [321]:
# Preview the tf-idf weights of every bag (the result is displayed, not stored).
bags.apply(tfidf)

id
0        {'absolutely': 4.831839009419443, 'wonderful':...
1        {'love': 2.556466278790357, 'dress': 1.9065561...
2        {'high': 4.544809122516671, 'hopes': 7.9120612...
3        {'love': 3.2259012785818815, 'jumpsuit': 7.040...
4        {'shirt': 5.720837490467469, 'flattering': 2.7...
5        {'love': 2.556466278790357, 'tracy': 9.5597594...
6        {'aded': 14.466650059591162, 'basket': 10.7662...
7        {'ordered': 2.7121800643155405, 'carbon': 10.7...
8        {'love': 1.6129506392909407, 'dress': 1.906556...
9        {'im': 2.2089675788823953, '55': 5.55975946398...
10       {'dress': 3.021819927109736, 'runs': 6.0165078...
11       {'dress': 1.9065561019486652, 'perfection': 8....
12       {'find': 4.664133694469938, 'reliant': 14.4666...
13       {'bought': 3.0845664694491948, 'black': 3.9618...
14       {'nice': 3.0807876589497, 'choice': 8.00721844...
15       {'took': 5.665750159670856, 'package': 7.79422...
16       {'material': 3.1628693114140582, 'color': 2.

In [326]:
def get_avg_tfidf(bags):
    """Return each term's mean tf-idf weight over the documents that contain it.

    Parameters
    ----------
    bags : pandas.Series
        Series of ``{term: count}`` bags; `tfidf` is applied to each one.

    Returns
    -------
    pandas.Series
        Mean tf-idf weight indexed by term.

    The denominator is the number of documents in `bags` that contain the
    term. It used to be read from the global `doc_freqs`, which gives the same
    value for the full corpus but a wrong mean for any other collection.
    """
    totals = defaultdict(float)
    n_containing = defaultdict(int)

    for weights in bags.apply(tfidf):
        for term, weight in weights.items():
            totals[term] += weight
            n_containing[term] += 1

    means = {term: total / n_containing[term] for term, total in totals.items()}

    return pd.Series(means)

In [327]:
# Mean tf-idf weight per term over the whole corpus, displayed highest first.
avg_tfidf = get_avg_tfidf(bags)
avg_tfidf.sort_values(ascending = False)

tule                28.933300
â                   28.933300
evanthe             28.933300
reflected           22.929098
roaming             22.929098
27r                 22.929098
champion            22.929098
wobbly              22.929098
eu                  22.929098
boxing              22.929098
cohs                22.929098
moors               22.929098
orangeblue          22.929098
librarian           22.929098
corodorys           22.929098
llama               22.929098
seea                22.929098
bluepurplesilver    22.929098
gripper             22.929098
riverdeck           22.929098
p2                  22.929098
haute               22.929098
stetson             22.929098
rona                22.929098
allison             22.929098
mui                 22.929098
whoop               22.929098
spokane             22.929098
coveralls           22.929098
marroon             22.929098
                      ...    
looks                3.316904
nice                 3.299070
bought    

In [338]:
def doc_search(term):
    """Return the position of the document in `bags` that uses `term` most often.

    Parameters
    ----------
    term : str
        Token to look for in the global `bags` series.

    Returns
    -------
    int
        Positional index (usable with ``.iloc``) of the bag with the highest
        count for `term`. When several bags tie, the one with the largest
        position wins.

    Raises
    ------
    IndexError
        If no document contains `term`.

    NOTE(review): the output recorded under the ``doc_search("tule")`` cell is
    a full review row rather than an integer, so that cell was presumably run
    against a different version of this function -- confirm which is intended.
    """
    # One pass over the bags: max over (count, position) pairs picks the same
    # winner as sorting them in descending order and taking the first.
    best = max(
        ((bag[term], position) for position, bag in enumerate(bags) if term in bag),
        default=None,
    )
    if best is None:
        raise IndexError("no document contains the term {!r}".format(term))
    return best[1]

In [358]:
# Look up the review that uses "tule" (one of the top average tf-idf terms) most often.
# NOTE(review): doc_search as defined above returns an integer position, while the
# output recorded below is a review row -- the output is likely stale; re-run to confirm.
doc_search("tule")

product_id                                                                  1056
author_age                                                                    54
review_title                                          Most stylish chinos around
review_text                    I bought these in pink and added taupe to my w...
star_rating                                                                    5
recommend_flag                                                                 1
upvotes                                                                        1
product_category_division                                                general
product_category_department                                              bottoms
product_category_class                                                     pants
Name: 9734, dtype: object