In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF

import spacy
# Small English spaCy pipeline; used further down for tokenising and lemmatising reviews.
nlp = spacy.load('en_core_web_sm')

import warnings
# Hide deprecation chatter from the libraries so the printed topics stay readable.
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [2]:
# Load the jackets DataFrame from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df_jackets = pd.read_pickle("df_jackets_final.pkl")

In [3]:
# Preview the first rows to check which columns were loaded.
df_jackets.head()

Unnamed: 0,url,jacket_name,manufacturer,jacket_price,total_rating,num_reviews,review_titles,review_ratings,review_text
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Super cool and warm,5,Great looking cool jacket waterproof and warm ...
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Dry and warm,5,Great coat for fall and probably into winter a...
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Stylish and functional,5,"Love this jacket, stylish and functional. It k..."
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Love it,5,"Love this jacket, roomy fit, glad I didn't go ..."
1,https://www.evo.com/insulated-jackets/l1-fairb...,L1 Fairbanks Jacket - Women's,L1,$298.95,5.0,1 Review,Love this,5,"So warm and comfortable. Easy to move in, just..."


In [4]:
# Preprocess text: parse every review with spaCy, lowercase the tokens and drop
# stop words, punctuation, whitespace and number-like tokens.
# A few stop words are kept on purpose because they are meaningful in this domain.

KEPT_STOP_WORDS = {"not", "enough", "few", "front", "full", "more"}


def _keep_token(word):
    """Return True when a spaCy token should survive preprocessing."""
    if word.lower_ in KEPT_STOP_WORDS:
        return True
    return not (word.is_stop or word.is_punct or word.is_space or word.like_num)


review_documents = list(nlp.pipe(df_jackets["review_text"]))

docs_clean = [
    [word.lower_ for word in doc if _keep_token(word)]
    for doc in review_documents
]

In [5]:
# Space-joined version of each cleaned review (the form passed to the vectorizers),
# plus the raw token lists stored next to the original review text.
docs_clean_list = list(map(' '.join, docs_clean))
df_jackets["review_docs_cleaned"] = docs_clean

In [6]:
# Create function to perform NMF topic modeling

def topic_modeling(documents, vectorizer, topic_modeler, words_per_topic = 15):
    """Fit a vectorizer and a topic model on `documents` and print each topic's top words.

    Parameters
    ----------
    documents : iterable
        Passed unchanged to ``vectorizer.fit_transform``.
    vectorizer : object
        Must provide ``fit_transform(documents)`` and either
        ``get_feature_names_out()`` or the legacy ``get_feature_names()``.
    topic_modeler : object
        Must provide ``fit_transform(matrix)`` and expose ``components_``
        (one row of term weights per topic) after fitting.
    words_per_topic : int, default 15
        Number of highest-weighted vocabulary terms printed per topic.

    Returns
    -------
    None
        The topics are printed; nothing is returned.
    """
    documents_vectorized = vectorizer.fit_transform(documents)

    # Called for its fitting side effect (it populates `components_`);
    # the returned document-topic matrix is not needed for the printout.
    topic_modeler.fit_transform(documents_vectorized)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new name and fall back for old versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    for idx, topic in enumerate(topic_modeler.components_):
        # argsort is ascending, so walk it backwards to take the
        # `words_per_topic` largest weights first.
        top_words = [str(vocab[i]).upper() for i in topic.argsort()[:-words_per_topic-1:-1]]
        print(f"Topic {idx}:\n", ", ".join(top_words), "\n")

In [7]:
# Baseline NMF model: TF-IDF with default settings feeding a 10-topic NMF.

documents = docs_clean_list
vectorizer = TfidfVectorizer()
topic_modeler = NMF(n_components=10, random_state=10, max_iter=1000)

topic_modeling(documents=documents, vectorizer=vectorizer, topic_modeler=topic_modeler)

Topic 0:
 JACKET, NOT, LAYER, POCKETS, SKI, LIKE, SKIING, WEAR, SHELL, POCKET, USE, COLD, DAYS, DAY, MORE 

Topic 1:
 GREAT, JACKET, FITS, FIT, QUALITY, PRICE, LOOKS, PRODUCT, SNOWBOARDING, SKIING, POCKETS, COLOR, WATERPROOF, FEATURES, LOOK 

Topic 2:
 WARM, SUPER, LIGHT, KEEPS, LIGHTWEIGHT, WEIGHT, DRY, COMFY, KEPT, CUTE, LOOKS, BULKY, COZY, ENOUGH, DAYS 

Topic 3:
 GOOD, QUALITY, LOOKS, LOOKING, PRICE, FEATURES, FIT, HIGH, SLOPES, EXCELLENT, INSANELY, DESIGN, FAR, BAGGY, WRIST 

Topic 4:
 LOVE, COLOR, JACKET, POCKETS, QUALITY, ABSOLUTELY, FIT, STYLE, LENGTH, AWESOME, FITS, BLUE, COLORS, LOOK, STRETCH 

Topic 5:
 SIZE, FIT, MEDIUM, SMALL, LARGE, LONG, BIG, LITTLE, TRUE, FITS, RUNS, ORDERED, LIKE, SLEEVES, ARMS 

Topic 6:
 PERFECT, FIT, FITS, SKIING, WEATHER, WEIGHT, SNOWBOARDING, COLD, WINTER, LIGHT, LENGTH, LAYER, SPRING, SHELL, LAYERING 

Topic 7:
 NICE, JACKET, FIT, COLOR, LONG, LOOKING, ARTICULATED, QUALITY, SICK, COLORS, SLEEVES, LONGER, HOOD, PRICE, POCKETS 

Topic 8:
 COAT, LOV

In [8]:
# Apply document-frequency limits: skip terms found in more than 75% of the
# documents or in fewer than 5 documents.

documents = docs_clean_list
vectorizer = TfidfVectorizer(min_df=5, max_df=.75)
topic_modeler = NMF(n_components=10, random_state=10, max_iter=1000)

topic_modeling(documents=documents, vectorizer=vectorizer, topic_modeler=topic_modeler)

Topic 0:
 NOT, LAYER, POCKETS, COLD, SHELL, WEAR, USE, POCKET, LIKE, DAYS, ENOUGH, COAT, SKI, WEATHER, SKIING 

Topic 1:
 GREAT, FITS, FIT, LOOKS, QUALITY, PRICE, PRODUCT, SKIING, SNOWBOARDING, POCKETS, COLOR, LOOK, FEATURES, LIGHT, WATERPROOF 

Topic 2:
 WARM, SUPER, LIGHT, KEEPS, LIGHTWEIGHT, WEIGHT, DRY, COMFY, KEPT, COAT, CUTE, LOOKS, COZY, BULKY, STYLISH 

Topic 3:
 GOOD, QUALITY, LOOKS, LOOKING, COAT, PRICE, FIT, FEATURES, HIGH, DESIGN, SLOPES, EXCELLENT, INSANELY, STYLE, FAR 

Topic 4:
 LOVE, COLOR, POCKETS, LENGTH, FIT, STYLE, ABSOLUTELY, COAT, QUALITY, BLUE, RED, AWESOME, FITS, LOOK, COLORS 

Topic 5:
 SIZE, FIT, MEDIUM, SMALL, LARGE, LONG, BIG, LITTLE, TRUE, RUNS, FITS, ORDERED, LIKE, SLEEVES, ARMS 

Topic 6:
 PERFECT, FIT, FITS, SKIING, WEIGHT, WINTER, LIGHT, SNOWBOARDING, WEATHER, COLD, LENGTH, SPRING, LAYER, LAYERING, SHELL 

Topic 7:
 JACKET, AMAZING, LOOKING, SNOWBOARDING, SKI, AWESOME, RECOMMEND, WATERPROOF, LOVES, PRICE, SKIING, BEST, BUY, LOVED, WINTER 

Topic 8:
 NIC

In [9]:
# Add bi-grams to the vocabulary and tighten the document-frequency limits
# (max_df 0.5, min_df 10).

documents = docs_clean_list
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_df=.5)
topic_modeler = NMF(n_components=10, random_state=10, max_iter=1000)

topic_modeling(documents=documents, vectorizer=vectorizer, topic_modeler=topic_modeler)

Topic 0:
 NOT, LAYER, POCKETS, SHELL, LIKE, POCKET, USE, WEAR, SKI, COLD, SKIING, DAYS, MORE, ZIPPER, HOOD 

Topic 1:
 GREAT, GREAT JACKET, GREAT FIT, JACKET GREAT, FITS GREAT, FITS, FIT, PRICE, LOOKS GREAT, LOOKS, GREAT PRICE, QUALITY, JACKET FITS, GREAT PRODUCT, PRODUCT 

Topic 2:
 WARM, COMFORTABLE, SUPER, KEEPS, SUPER WARM, KEEPS WARM, LIGHTWEIGHT, DRY, WARM COMFORTABLE, COMFORTABLE WARM, WARM DRY, KEPT, LIGHTWEIGHT WARM, KEPT WARM, JACKET WARM 

Topic 3:
 LOVE, LOVE JACKET, COLOR, LOVE COLOR, JACKET LOVE, POCKETS, LOVE FIT, ABSOLUTELY LOVE, ABSOLUTELY, LOVE LOVE, FIT, STYLE, LENGTH, AWESOME, COLOR LOVE 

Topic 4:
 GOOD, QUALITY, GOOD QUALITY, LOOKS, LOOKS GOOD, GOOD FIT, GOOD JACKET, JACKET GOOD, GOOD PRICE, GOOD LOOKING, LOOKING, FIT, PRICE, FEATURES, FAR GOOD 

Topic 5:
 SIZE, FIT, MEDIUM, SMALL, LARGE, LONG, FITS, BIG, TRUE, SLEEVES, LITTLE, ORDERED, TRUE SIZE, LIKE, ARMS 

Topic 6:
 PERFECT, JACKET PERFECT, PERFECT FIT, FITS PERFECT, FIT, FIT PERFECT, FITS, PERFECT JACKET, WIN

In [10]:
# Remove additional irrelevant domain tokens: product nouns, generic praise,
# sizing terms and family words.

stop_words = {"snowboard","skiing","snowboarding","jacket","coat","ski","winter","good",
              "great", "perfect", "perfectly","nice", "love", "loves", "looks", "looking", "fit", 
              "like","small","medium","large","warm","warmth","size","fits","comfortable","cute", 
              "cozy", "comfy","color","wife","daughter","son","husband"}

documents = docs_clean_list
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list
# or None -- a set raises InvalidParameterError -- so pass a (sorted) list.
vectorizer = TfidfVectorizer(max_df = .5, min_df = 10, stop_words = sorted(stop_words))
topic_modeler = NMF(10, random_state=10, max_iter=1000)

topic_modeling(documents, vectorizer, topic_modeler)

Topic 0:
 POCKET, MORE, BOUGHT, HOOD, GOT, ZIPPER, JACKETS, SNOW, HELMET, MATERIAL, TIME, PRICE, SHELL, BIT, FABRIC 

Topic 1:
 SUPER, STYLISH, CONDITIONS, SOFT, BEAUTIFUL, STOKED, EASY, RECOMMEND, HAPPY, HOODIE, SHARP, LOVED, WORE, FLATTERING, BULKY 

Topic 2:
 LIGHT, WEIGHT, WATERPROOF, TRUE, WIND, RAIN, DAY, DEFINITELY, NEED, TEMPS, EXCELLENT, WEATHER, HOOD, SURPRISINGLY, VERSATILE 

Topic 3:
 QUALITY, HIGH, PRICE, EXCELLENT, TRUE, PRODUCT, STYLE, AMAZING, STYLISH, DESIGN, PATAGONIA, MATERIALS, FEATURES, DISAPPOINTED, RECOMMEND 

Topic 4:
 LONG, LITTLE, BIG, SLEEVES, ARMS, RUNS, LENGTH, BIT, WAY, ORDERED, ENOUGH, OVERALL, LAYERS, TIGHT, RUN 

Topic 5:
 POCKETS, LOTS, PLENTY, ENOUGH, STYLISH, ZIPPERS, BIG, INSIDE, HELMET, VENTS, HOOD, NEED, DAYS, PASS, FUNCTIONAL 

Topic 6:
 LIGHTWEIGHT, EASY, TRUE, EVERYDAY, PACKS, OUTDOOR, FEEL, VERSATILE, HIKING, SPRING, COOL, PACKABLE, USE, ACTIVITIES, FEELS 

Topic 7:
 KEEPS, DRY, KEPT, SNOW, AMAZING, DAY, WET, RECOMMEND, DAYS, CONDITIONS, RIDIN

In [11]:
# Lemmatize corpus: parse the cleaned, space-joined reviews with spaCy again
# and replace every token with its lemma.

spacy_docs_lemmatize = list(nlp.pipe(docs_clean_list))

docs_lemmatize = [
    [token.lemma_ for token in parsed_review]
    for parsed_review in spacy_docs_lemmatize
]

documents_list_lemmatize = list(map(' '.join, docs_lemmatize))

In [12]:
# Model lemmatized corpus and also remove additional irrelevant domain tokens.
# NOTE(review): unlike the stop-word list used on the non-lemmatized corpus,
# this one has no "husband" entry -- confirm whether that omission is intended.

stop_words = {"snowboard","skiing","snowboarding","jacket","coat","ski","winter","good",
              "great", "perfect", "perfectly","nice", "love", "loves", "looks", "looking", "fit", 
              "like","small","medium","large","warm","warmth","size","fits","comfortable","cute", 
              "cozy", "comfy","color","wife","daughter","son","flattering","bought","product","cool","slope",
              "day","go","get"}

documents = documents_list_lemmatize
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list
# or None -- a set raises InvalidParameterError -- so pass a (sorted) list.
vectorizer = TfidfVectorizer(max_df = .75, min_df = 10, stop_words = sorted(stop_words))
topic_modeler = NMF(10, random_state=10, max_iter=1000)

topic_modeling(documents, vectorizer, topic_modeler)

Topic 0:
 LAYER, WEAR, COLD, SHELL, WEATHER, UNDERNEATH, USE, BASE, MID, NEED, ENOUGH, ROOM, WORK, OUTER, TIME 

Topic 1:
 SUPER, STYLISH, SOFT, RECOMMEND, HAPPY, BEAUTIFUL, EASY, CONDITION, STOKE, BULKY, HOODIE, SWEET, HIGHLY, SHARP, BREATHABLE 

Topic 2:
 POCKET, LOT, ZIPPER, PASS, HOOD, INSIDE, PHONE, HELMET, ZIP, VENT, CHEST, NEED, USE, WISH, PLACE 

Topic 3:
 LOOK, WATERPROOF, EXACTLY, WAIT, SEASON, STYLE, MORE, FORWARD, AWESOME, FUNCTIONAL, BLACK, ATHLETIC, ORDER, DEFINITELY, MAKE 

Topic 4:
 NOT, BUY, YEAR, ZIPPER, MORE, BULKY, PURCHASE, MATERIAL, PRICE, TIGHT, FEEL, WAY, RETURN, HEAVY, OLD 

Topic 5:
 LIGHT, WEIGHT, TRUE, WATERPROOF, HOOD, WIND, WEATHER, FEEL, RAIN, TEMP, USE, CONDITION, PATAGONIA, VERSATILE, PIECE 

Topic 6:
 QUALITY, HIGH, PRICE, EXCELLENT, TRUE, MATERIAL, DESIGN, STYLE, AMAZING, PATAGONIA, STYLISH, PURCHASE, FEATURE, BRAND, EXPECT 

Topic 7:
 KEEP, DRY, SNOW, WET, RECOMMEND, AMAZING, STAY, CONDITION, DEFINITELY, HOOD, RAIN, FAR, AWESOME, DEGREE, WIND 

Topic