In [1]:
import pandas as pd

import gensim
from gensim import corpora, matutils, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import SnowballStemmer, WordNetLemmatizer

import spacy
# Load spaCy's small English pipeline once for the session.
# NOTE(review): `nlp` is not referenced by any cell visible in this part of the
# notebook — confirm it is still needed before keeping this (slow) load.
nlp = spacy.load('en_core_web_sm')

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Turn on pyLDAvis's notebook integration so a prepared visualisation can be
# shown as a cell output.
pyLDAvis.enable_notebook()

import warnings
# Silence DeprecationWarning and FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

In [2]:
# Load the jacket review table from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
df_jackets = pd.read_pickle("df_jackets_final.pkl")

In [3]:
# Preview the first rows to check the loaded columns (one row per review).
df_jackets.head()

Unnamed: 0,url,jacket_name,manufacturer,jacket_price,total_rating,num_reviews,review_titles,review_ratings,review_text
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Super cool and warm,5,Great looking cool jacket waterproof and warm ...
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Dry and warm,5,Great coat for fall and probably into winter a...
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Stylish and functional,5,"Love this jacket, stylish and functional. It k..."
0,https://www.evo.com/outlet/shell-jackets/thirt...,thirtytwo Light Anorak,thirtytwo,$134.99 SaleOrig: $199.95,5.0,4 Reviews,Love it,5,"Love this jacket, roomy fit, glad I didn't go ..."
1,https://www.evo.com/insulated-jackets/l1-fairb...,L1 Fairbanks Jacket - Women's,L1,$298.95,5.0,1 Review,Love this,5,"So warm and comfortable. Easy to move in, just..."


In [4]:
# Lemmatize corpus and remove stopwords

def lemmatize(text):
    """Return the WordNet lemma of a single token.

    A new ``WordNetLemmatizer`` is created on each call and its ``lemmatize``
    method is invoked with the token only, so NLTK's default part-of-speech
    setting applies.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(text)
    return lemma
                        
def preprocess(text):
    """Tokenize a review and return its filtered, lemmatized tokens.

    ``simple_preprocess`` turns ``text`` into tokens. A token is kept when it
    is longer than three characters and is not in gensim's ``STOPWORDS``.
    The tokens 'enough', 'few', 'front', 'full' and 'more' are always kept,
    regardless of length or stop-word status. Kept tokens are passed through
    ``lemmatize`` and returned in their original order.
    """
    always_keep = {'enough', 'few', 'front', 'full', 'more'}

    def is_kept(word):
        # Whitelisted words bypass both the stop-word and the length filter.
        return word in always_keep or (len(word) > 3 and word not in STOPWORDS)

    return [lemmatize(word) for word in simple_preprocess(text) if is_kept(word)]

In [5]:
# Run every review through `preprocess`; keep the token lists both as a
# standalone Series (input to the topic models) and as a DataFrame column.
df_jackets["processed_docs"] = processed_docs = df_jackets["review_text"].map(preprocess)

In [6]:
# Create function to perform LDA topic modeling on bag-of-words term counts

def lda_bow_topic_modeling(documents, num_topics = 15, passes = 15):
    """Fit an LDA model on bag-of-words counts and print every topic.

    Parameters
    ----------
    documents : collection of list of str
        Tokenized documents. It is iterated twice (once to build the
        dictionary, once to build the corpus), so it must be re-iterable.
    num_topics : int, default 15
        Number of topics passed to ``LdaModel``.
    passes : int, default 15
        Number of training passes passed to ``LdaModel``.

    Returns
    -------
    None
        The topics are printed, not returned.
    """
    docs_dict = corpora.Dictionary(documents)
    # Prune the vocabulary: drop tokens found in fewer than 10 documents or in
    # more than 50% of them.
    docs_dict.filter_extremes(no_below=10, no_above=0.5)
    doc_term_matrix = [docs_dict.doc2bow(rev) for rev in documents]

    # Fixed random_state so repeated runs give the same topics.
    lda_model = models.LdaModel(corpus = doc_term_matrix, id2word = docs_dict, num_topics = num_topics,
                                passes = passes, random_state = 42)

    # print_topics() shows at most 20 topics by default; request all of them
    # explicitly so models with more than 20 topics are not silently truncated.
    for idx, topic in lda_model.print_topics(num_topics = num_topics):
        print(f'Topic: {idx} \n {topic}')

In [7]:
# Baseline bag-of-words run with the function defaults (15 topics, 15 passes)
# on the reviews filtered with gensim's stock STOPWORDS only.
lda_bow_topic_modeling(processed_docs)

Topic: 0 
 0.102*"color" + 0.087*"love" + 0.040*"great" + 0.024*"long" + 0.023*"perfect" + 0.022*"length" + 0.022*"blue" + 0.020*"warm" + 0.016*"awesome" + 0.016*"like"
Topic: 1 
 0.045*"like" + 0.037*"coat" + 0.021*"nice" + 0.018*"more" + 0.018*"feel" + 0.017*"little" + 0.014*"small" + 0.014*"hip" + 0.014*"tight" + 0.013*"longer"
Topic: 2 
 0.066*"warm" + 0.050*"love" + 0.045*"perfect" + 0.039*"comfortable" + 0.031*"great" + 0.029*"lightweight" + 0.028*"super" + 0.025*"recommend" + 0.019*"highly" + 0.019*"easy"
Topic: 3 
 0.050*"quality" + 0.046*"size" + 0.030*"small" + 0.022*"pullover" + 0.021*"return" + 0.021*"version" + 0.019*"patagonia" + 0.016*"customer" + 0.015*"went" + 0.014*"disappointed"
Topic: 4 
 0.087*"warm" + 0.060*"light" + 0.059*"great" + 0.033*"cold" + 0.031*"wind" + 0.030*"weather" + 0.030*"weight" + 0.025*"rain" + 0.023*"super" + 0.021*"love"
Topic: 5 
 0.054*"year" + 0.040*"bought" + 0.036*"product" + 0.031*"wear" + 0.024*"time" + 0.022*"loved" + 0.022*"arcteryx" + 

In [8]:
# Create function to perform LDA topic modeling on TF-IDF weighted bag-of-words

def lda_tfidf_topic_modeling(documents, num_topics = 15, passes = 15):
    """Fit an LDA model on a TF-IDF weighted corpus and print every topic.

    Parameters
    ----------
    documents : collection of list of str
        Tokenized documents. It is iterated twice (once to build the
        dictionary, once to build the corpus), so it must be re-iterable.
    num_topics : int, default 15
        Number of topics passed to ``LdaModel``.
    passes : int, default 15
        Number of training passes passed to ``LdaModel``.

    Returns
    -------
    None
        The topics are printed, not returned.
    """
    docs_dict = corpora.Dictionary(documents)
    # Prune the vocabulary: drop tokens found in fewer than 10 documents or in
    # more than 50% of them.
    docs_dict.filter_extremes(no_below=10, no_above=0.5)
    doc_term_matrix = [docs_dict.doc2bow(rev) for rev in documents]

    # Re-weight the raw counts with TF-IDF before training.
    tfidf = models.TfidfModel(doc_term_matrix)
    corpus_tfidf = tfidf[doc_term_matrix]

    # Fixed random_state so repeated runs give the same topics.
    lda_model = models.LdaModel(corpus = corpus_tfidf, id2word = docs_dict, num_topics = num_topics,
                                passes = passes, random_state = 42)

    # print_topics() shows at most 20 topics by default; request all of them
    # explicitly so models with more than 20 topics are not silently truncated.
    for idx, topic in lda_model.print_topics(num_topics = num_topics):
        print(f'Topic: {idx} \n {topic}')

In [9]:
# Baseline TF-IDF run with the function defaults (15 topics, 15 passes) on the
# reviews filtered with gensim's stock STOPWORDS only.
lda_tfidf_topic_modeling(processed_docs)

Topic: 0 
 0.018*"color" + 0.013*"blue" + 0.010*"black" + 0.008*"worn" + 0.008*"zipper" + 0.008*"look" + 0.008*"trail" + 0.008*"like" + 0.007*"light" + 0.007*"comfortable"
Topic: 1 
 0.020*"long" + 0.019*"nice" + 0.016*"sleeve" + 0.011*"little" + 0.010*"like" + 0.010*"coat" + 0.010*"especially" + 0.009*"fitting" + 0.009*"longer" + 0.009*"good"
Topic: 2 
 0.025*"climbing" + 0.023*"excellent" + 0.020*"north" + 0.019*"face" + 0.015*"teryx" + 0.015*"product" + 0.015*"quality" + 0.015*"piece" + 0.014*"fabric" + 0.013*"year"
Topic: 3 
 0.035*"light" + 0.033*"love" + 0.032*"warm" + 0.031*"perfect" + 0.023*"weight" + 0.022*"great" + 0.021*"coat" + 0.021*"layer" + 0.020*"comfy" + 0.019*"weather"
Topic: 4 
 0.041*"loved" + 0.034*"gift" + 0.033*"product" + 0.031*"versatile" + 0.026*"outdoors" + 0.023*"bought" + 0.022*"husband" + 0.020*"great" + 0.019*"item" + 0.019*"travel"
Topic: 5 
 0.032*"activity" + 0.022*"proton" + 0.022*"fall" + 0.018*"running" + 0.016*"daily" + 0.016*"casual" + 0.016*"stok

In [10]:
# Remove irrelevant domain tokens

# gensim's stock stop words extended with review-domain filler, grouped by theme.
stop_words = STOPWORDS.union({
    # sport / garment vocabulary shared by nearly every review
    "snowboard", "snowboarding", "ski", "skiing", "jacket", "coat", "winter",
    # generic praise and intensifiers
    "good", "great", "perfect", "perfectly", "nice", "love", "loves", "like",
    "amazing", "super", "beautiful", "cute", "stylish", "flattering",
    "definitely", "absolutely", "exactly",
    # fit, sizing and appearance
    "looks", "looking", "fit", "fits", "size", "small", "medium", "large",
    # warmth, comfort and weight
    "warm", "warmth", "comfortable", "cozy", "comfy", "layer", "lightweight",
    "light", "weight", "wear",
    # people the jacket was bought for
    "wife", "daughter", "son", "husband",
    # purchase process
    "bought", "expected", "exchange", "return", "product",
    # colours
    "color", "blue", "orange", "black",
})

def preprocess(text):
    """Tokenize a review and return its filtered, lemmatized tokens.

    This definition replaces the earlier ``preprocess`` and filters against
    the extended module-level ``stop_words`` set instead of gensim's stock
    ``STOPWORDS``. A token from ``simple_preprocess`` is kept when it is
    longer than three characters and is not in ``stop_words``. The tokens
    'enough', 'few', 'front', 'full' and 'more' are always kept. Kept tokens
    are passed through ``lemmatize`` and returned in their original order.
    """
    always_keep = {'enough', 'few', 'front', 'full', 'more'}

    def is_kept(word):
        # Whitelisted words bypass both the stop-word and the length filter.
        return word in always_keep or (len(word) > 3 and word not in stop_words)

    return [lemmatize(word) for word in simple_preprocess(text) if is_kept(word)]

# Re-tokenize every review with the redefined `preprocess`, overwriting both
# the standalone Series and the DataFrame column built earlier.
df_jackets["processed_docs"] = processed_docs = df_jackets["review_text"].map(preprocess)

In [11]:
# Re-run the bag-of-words model (defaults: 15 topics, 15 passes) on the corpus
# rebuilt with the extended stop-word list.
lda_bow_topic_modeling(processed_docs)

Topic: 0 
 0.054*"keep" + 0.037*"enough" + 0.029*"pullover" + 0.029*"spring" + 0.028*"fall" + 0.020*"cool" + 0.020*"summer" + 0.018*"versatile" + 0.017*"look" + 0.017*"trip"
Topic: 1 
 0.036*"quality" + 0.034*"true" + 0.033*"color" + 0.033*"price" + 0.028*"run" + 0.024*"style" + 0.023*"ordered" + 0.023*"more" + 0.021*"excellent" + 0.020*"design"
Topic: 2 
 0.067*"cold" + 0.064*"day" + 0.041*"weather" + 0.031*"shell" + 0.027*"snow" + 0.024*"warmer" + 0.024*"time" + 0.020*"layering" + 0.018*"condition" + 0.016*"worn"
Topic: 3 
 0.025*"shell" + 0.024*"activity" + 0.024*"hiking" + 0.022*"outer" + 0.019*"more" + 0.019*"outdoor" + 0.017*"wind" + 0.016*"easily" + 0.016*"casual" + 0.015*"pack"
Topic: 4 
 0.052*"hood" + 0.037*"pocket" + 0.030*"helmet" + 0.025*"zipper" + 0.020*"shell" + 0.016*"easy" + 0.014*"zip" + 0.013*"bulky" + 0.013*"snow" + 0.013*"feel"
Topic: 5 
 0.040*"recommend" + 0.030*"climbing" + 0.030*"year" + 0.027*"soft" + 0.026*"wind" + 0.024*"highly" + 0.020*"feel" + 0.020*"few" 

In [12]:
# Re-run the TF-IDF model (defaults: 15 topics, 15 passes) on the corpus
# rebuilt with the extended stop-word list.
lda_tfidf_topic_modeling(processed_docs)

Topic: 0 
 0.042*"pullover" + 0.037*"true" + 0.031*"versatile" + 0.029*"active" + 0.023*"patagonia" + 0.020*"fall" + 0.020*"style" + 0.018*"nano" + 0.018*"spring" + 0.017*"puff"
Topic: 1 
 0.029*"house" + 0.028*"wonderful" + 0.025*"item" + 0.025*"waterproof" + 0.021*"bulky" + 0.020*"yellow" + 0.020*"stoked" + 0.017*"round" + 0.017*"gear" + 0.017*"brand"
Topic: 2 
 0.032*"design" + 0.032*"best" + 0.027*"favorite" + 0.024*"goretex" + 0.021*"uphill" + 0.020*"elastic" + 0.019*"person" + 0.018*"mind" + 0.018*"satisfied" + 0.018*"classic"
Topic: 3 
 0.030*"north" + 0.028*"face" + 0.028*"wait" + 0.025*"purchase" + 0.020*"happy" + 0.019*"cold" + 0.018*"hiking" + 0.017*"warmer" + 0.014*"heavy" + 0.014*"packable"
Topic: 4 
 0.026*"logo" + 0.026*"hoody" + 0.023*"functional" + 0.019*"hoodie" + 0.017*"burton" + 0.017*"solid" + 0.016*"boyfriend" + 0.015*"said" + 0.015*"usually" + 0.014*"received"
Topic: 5 
 0.024*"recommend" + 0.020*"soft" + 0.019*"highly" + 0.016*"kept" + 0.015*"second" + 0.014*"fe

In [13]:
# Use pyLDAvis to visualize the bag-of-words (term count) LDA topic clusters

# Rebuild the bag-of-words model at cell level, because pyLDAvis needs the
# model, corpus and dictionary objects and lda_bow_topic_modeling returns none
# of them.
docs_dict = corpora.Dictionary(processed_docs)
# Apply the same vocabulary pruning as lda_bow_topic_modeling; without it the
# visualised model is trained on a different vocabulary from the model whose
# topics were printed.
docs_dict.filter_extremes(no_below=10, no_above=0.5)
doc_term_matrix = [docs_dict.doc2bow(rev) for rev in processed_docs]

# Same hyper-parameters and seed as the lda_bow_topic_modeling defaults.
lda_model = models.LdaModel(corpus = doc_term_matrix, id2word = docs_dict, num_topics = 15,
                            passes = 15, random_state = 42)

lda_vis = gensimvis.prepare(lda_model, doc_term_matrix, docs_dict)

In [14]:
# Last expression of the cell: display the prepared pyLDAvis object.
lda_vis