In [1]:
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer

import spacy
from spacy.symbols import amod
nlp = spacy.load('en_core_web_sm')

from pprint import pprint

from collections import Counter

In [2]:
df_jackets = pd.read_pickle("df_jackets_final.pkl")

In [3]:
df_jackets.head()

Unnamed: 0,url,jacket_name,manufacturer,jacket_price,total_rating,num_reviews,review_titles,review_ratings,review_text
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Super cool and warm,5,Great looking cool jacket waterproof and warm ...
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Dry and warm,5,Great coat for fall and probably into winter a...
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Stylish and functional,5,"Love this jacket, stylish and functional. It k..."
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Love it,5,"Love this jacket, roomy fit, glad I didn't go ..."
1,https://www.evo.com/insulated-jackets/l1-fairb...,L1 Fairbanks Jacket - Women's,L1,$298.95,5.0,1 Review,Love this,5,"So warm and comfortable. Easy to move in, just..."


In [4]:
# Lemmatize corpus and remove stopwords

stop_words = STOPWORDS.union(["jacket", "jackets","coat","product","layer","day","winter", "color","colors",
                              "return", "time", "year"])

def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text)
                        
def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stop_words and len(token) > 3 or (token == 'enough' or 
                                                         token == 'few' or token == 'front' or 
                                                         token == 'full' or token == 'more'):
            result.append(lemmatize(token))
    return result

In [5]:
processed_docs = df_jackets['review_text'].map(preprocess)
df_jackets["processed_docs"] = processed_docs

In [6]:
processed_docs_list = [' '.join(doc) for doc in processed_docs]

In [7]:
spacy_processed_docs = list(nlp.pipe(processed_docs_list))
df_jackets["spacy_processed_docs"] = spacy_processed_docs

In [8]:
# Segregate positive and negative reviews. Only include 1, 2 and 5 star ratings b/c 3 and 4 star ratings are...
# ... more likely to have indifferent sentiment

df_jackets["review_ratings"] = df_jackets["review_ratings"].astype(int)

positive_reviews = df_jackets[df_jackets["review_ratings"] == 5]
negative_reviews = df_jackets[(df_jackets["review_ratings"] == 1) | (df_jackets["review_ratings"] == 2)]

In [9]:
# Separate positive and negative adjectives 

pos_adj = [token.text for doc in positive_reviews["spacy_processed_docs"] for token in doc if token.pos_=='ADJ']
neg_adj = [token.text for doc in negative_reviews["spacy_processed_docs"] for token in doc if token.pos_=='ADJ']

pos_noun = [token.text for doc in positive_reviews["spacy_processed_docs"] for token in doc if token.pos_=='NOUN']
neg_noun = [token.text for doc in negative_reviews["spacy_processed_docs"] for token in doc if token.pos_=='NOUN']

In [10]:
# Positive reviews talk most about warmth, comfort and fit

Counter(pos_adj).most_common(10)

[('great', 1574),
 ('warm', 1385),
 ('perfect', 714),
 ('comfortable', 449),
 ('good', 413),
 ('cold', 381),
 ('fit', 379),
 ('nice', 375),
 ('small', 346),
 ('large', 331)]

In [11]:
# Negative reviews talk most about fit

Counter(neg_adj).most_common(10)

[('small', 54),
 ('great', 52),
 ('warm', 45),
 ('large', 44),
 ('good', 38),
 ('disappointed', 35),
 ('few', 28),
 ('long', 26),
 ('more', 26),
 ('short', 22)]

In [12]:
# Happy customers talk about pockets most often. Hood is another ski jacket feature that happy customers write about.

Counter(pos_noun).most_common(10)

[('pocket', 825),
 ('love', 784),
 ('size', 612),
 ('shell', 363),
 ('day', 340),
 ('hood', 331),
 ('quality', 327),
 ('weather', 302),
 ('snow', 289),
 ('skiing', 274)]

In [13]:
# Unhappy customers complain about pockets and also zippers

Counter(neg_noun).most_common(10)

[('pocket', 114),
 ('size', 79),
 ('quality', 78),
 ('zipper', 77),
 ('material', 45),
 ('chest', 44),
 ('sleeve', 39),
 ('love', 36),
 ('year', 36),
 ('warranty', 34)]

In [14]:
# Investigate adjectival modifiers to understand dependency for nouns with the written reviews

def get_amods(noun, ser):
    amod_list = []
    for doc in ser:
        for token in doc:
            if (token.text) == noun:
                for child in token.children:
                    if child.dep == amod:
                        amod_list.append(child.text.lower())
    return Counter(amod_list).most_common(10)

def amods_by_sentiment(noun):
    print(f"Adjectives describing {str.upper(noun)}:\n")
    
    print("POSITIVE:")
    pprint(get_amods(noun, positive_reviews.spacy_processed_docs))
    
    print("\nNEGATIVE:")
    pprint(get_amods(noun, negative_reviews.spacy_processed_docs))

In [15]:
amods_by_sentiment("pocket")

Adjectives describing POCKET:

POSITIVE:
[('great', 33),
 ('plenty', 33),
 ('warm', 33),
 ('good', 15),
 ('front', 14),
 ('nice', 11),
 ('comfortable', 10),
 ('large', 10),
 ('enough', 9),
 ('perfect', 9)]

NEGATIVE:
[('front', 4),
 ('close', 4),
 ('left', 4),
 ('cute', 2),
 ('wrong', 2),
 ('small', 2),
 ('more', 2),
 ('enough', 2),
 ('multiple', 2),
 ('slow', 1)]


In [16]:
amods_by_sentiment("hood")

Adjectives describing HOOD:

POSITIVE:
[('great', 13),
 ('compatible', 9),
 ('warm', 7),
 ('adjustable', 5),
 ('large', 5),
 ('removable', 4),
 ('nice', 4),
 ('perfect', 3),
 ('good', 3),
 ('open', 3)]

NEGATIVE:
[('uncomfortable', 2),
 ('larger', 2),
 ('enough', 2),
 ('expensive', 1),
 ('curated', 1),
 ('stiff', 1),
 ('huge', 1),
 ('overall', 1),
 ('nice', 1),
 ('generous', 1)]


In [17]:
amods_by_sentiment("zipper")

Adjectives describing ZIPPER:

POSITIVE:
[('front', 12),
 ('warm', 8),
 ('main', 7),
 ('great', 7),
 ('comfortable', 5),
 ('waterproof', 4),
 ('defective', 4),
 ('durable', 4),
 ('double', 4),
 ('deep', 3)]

NEGATIVE:
[('main', 6),
 ('developed', 4),
 ('tight', 2),
 ('certain', 2),
 ('second', 2),
 ('sloppy', 2),
 ('enough', 1),
 ('upstate', 1),
 ('defective', 1),
 ('positive', 1)]


In [18]:
amods_by_sentiment("vent")

Adjectives describing VENT:

POSITIVE:
[('great', 9),
 ('warm', 7),
 ('open', 7),
 ('huge', 6),
 ('comfortable', 4),
 ('cool', 4),
 ('taped', 2),
 ('large', 2),
 ('front', 2),
 ('super', 2)]

NEGATIVE:
[('warm', 2)]


In [19]:
amods_by_sentiment("sleeve")

Adjectives describing SLEEVE:

POSITIVE:
[('long', 25),
 ('medium', 8),
 ('perfect', 7),
 ('great', 7),
 ('short', 6),
 ('large', 5),
 ('nice', 4),
 ('warm', 4),
 ('fit', 4),
 ('enough', 4)]

NEGATIVE:
[('long', 5),
 ('short', 4),
 ('great', 3),
 ('returned', 2),
 ('medium', 2),
 ('normal', 1),
 ('heavy', 1),
 ('floppy', 1),
 ('constricted', 1),
 ('tight', 1)]


In [20]:
amods_by_sentiment("collar")

Adjectives describing COLLAR:

POSITIVE:
[('high', 8),
 ('compatible', 2),
 ('nice', 1),
 ('awesome', 1),
 ('noticed', 1),
 ('separate', 1),
 ('half', 1),
 ('similiar', 1),
 ('tactical', 1),
 ('soft', 1)]

NEGATIVE:
[('zipped', 1), ('dirty', 1), ('disappointed', 1)]
