In [1]:
# !pip install keybert
# !pip install transformers
# !pip install bertopic
# !pip install -q transformers einops accelerate langchain bitsandbytes
# !pip install trl
# !pip install xformers

In [2]:
#!pip install bertopic

In [3]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from datasets import load_dataset
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
import spacy
import gensim
import pandas as pd
import numpy as np
from bertopic.representation import ZeroShotClassification
from bertopic.representation import TextGeneration
import en_core_web_sm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Load the labeled reviews CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv('labeled_final.csv')

In [5]:
# !pip install hdbscan
# !sudo yum install -y gcc
# !pip install hdbscan
#!python -m spacy download en_core_web_sm

In [6]:
# Preview the first rows of the raw frame to check which columns are available.
df.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,reviewType,mostReviewed_rank,avg_star_rating
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0,1.0
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0,1.0
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},Design/Functionality,48.0,3.0
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Quality,47.0,2.0
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Design/Functionality,47.0,2.0


In [7]:
# Narrow the raw frame to the three columns used below:
# the product, the review text and the review-type label.
df_simple = df.loc[:, ['product', 'reviewText', 'reviewType']]

In [8]:
# Preview the reduced frame (product, reviewText, reviewType).
df_simple.head()

Unnamed: 0,product,reviewText,reviewType
0,ladder,Arrived with a defective lock assembly.,Delivery/ Packaging
1,ladder,The ladder quality was very good but trying to...,Delivery/ Packaging
2,ladder,Used mine for the first time today. Very happ...,Design/Functionality
3,ladder,This is not the original telesteps made in Swe...,Quality
4,ladder,"I was excited about this product at first, but...",Design/Functionality


In [9]:
#df_sub = df_simple[(df_simple['product'] == 'mower') & (df_simple['reviewType'] == 'Quality')]

In [54]:
#df_sub.head()

Unnamed: 0,product,reviewText,reviewType
2933,mower,Ordered and paid for the 1815-18 18 Inch 5 Bla...,Quality
2938,mower,Was great while it lasted. Handle broke off an...,Quality
2946,mower,The item arrived without the handle rods. So b...,Quality
2952,mower,Broke within a couple month of use and really ...,Quality
2960,mower,Item was flimsy and did not cut grass. The de...,Quality


In [80]:
# Topic keyword representation: BERTopic receives both representation models
# as a list (KeyBERTInspired limited to 30 words, then
# MaximalMarginalRelevance with diversity 0.3).
representation_model = [
    KeyBERTInspired(top_n_words=30),
    MaximalMarginalRelevance(diversity=0.3),
]
# Shared model instance; get_topic_df refits it on every call.
topic_model = BERTopic(representation_model=representation_model)

In [81]:
def topic_token(data, by_sent):
    """Return the review texts in ``data`` as a flat list of strings.

    Parameters
    ----------
    data : DataFrame or mapping
        Must provide an iterable ``'reviewText'`` entry.
    by_sent : bool
        If True, every review is split with ``sent_tokenize`` and the
        sentences of all reviews are returned in a single flat list
        (original order preserved). If False, each review is returned whole.

    Returns
    -------
    list of str
        One string per review, or one string per sentence when ``by_sent``
        is True.

    Notes
    -----
    Missing values (``None`` / ``NaN``) are skipped. Casting them with
    ``str`` would otherwise add the literal documents ``'None'`` / ``'nan'``
    to the corpus handed to the topic model.
    """
    reviews = [str(review) for review in data['reviewText'] if not pd.isna(review)]
    if not by_sent:
        return reviews
    # A one-sentence review contributes exactly one entry, so no special
    # case is needed when flattening.
    return [sentence for review in reviews for sentence in sent_tokenize(review)]


In [108]:
def get_topic_df(df_sub, topic_num, product, reviewType):
    """Fit the shared ``topic_model`` on ``df_sub`` and summarise one topic.

    Parameters
    ----------
    df_sub : DataFrame
        Reviews to model; its ``'reviewText'`` column is split into
        sentences via ``topic_token(df_sub, True)``.
    topic_num : int
        Index of the topic to report.
    product, reviewType
        Labels copied verbatim into the ``product`` / ``reviewType`` columns
        of the result so rows from different subsets can be told apart.

    Returns
    -------
    DataFrame
        Up to five rows (highest ``relevance_score`` first) with columns
        ``keyword, relevance_score, rep_doc, freq, topic_index, product,
        reviewType``. If the fitted model has no topic ``topic_num`` an
        empty frame with these columns is returned instead of raising.

    Notes
    -----
    The module-level ``topic_model`` is refit on every call, so any
    previously fitted state is replaced.
    NOTE(review): calling this repeatedly for several ``topic_num`` values on
    the same ``df_sub`` repeats the fit each time; confirm whether topics
    from separate fits are meant to be compared.
    """
    output_columns = ['keyword', 'relevance_score', 'rep_doc', 'freq',
                      'topic_index', 'product', 'reviewType']

    sample_sent = topic_token(df_sub, True)
    # Only the fitted state of topic_model is needed; the returned
    # per-document topics/probabilities are not used here.
    topic_model.fit_transform(sample_sent)

    # BERTopic signals an unknown topic with a falsy return value rather than
    # an exception, which happens when the fit finds fewer topics than asked for.
    keywords = topic_model.get_topic(topic_num)
    if not keywords:
        return pd.DataFrame(columns=output_columns)

    # Representative docs can be missing for a topic; guard before indexing.
    rep_docs = topic_model.get_representative_docs(topic=topic_num)
    rep_doc = rep_docs[0] if rep_docs else None

    df1 = (
        pd.DataFrame(keywords, columns=['keyword', 'relevance_score'])
        .sort_values(by='relevance_score', ascending=False)
        .head(5)
    )
    df1['rep_doc'] = rep_doc
    df1['freq'] = topic_model.get_topic_freq(topic=topic_num)
    df1['topic_index'] = topic_num
    df1['product'] = product
    df1['reviewType'] = reviewType
    return df1

In [115]:
# Distinct products and review types, used as the two outer loop axes.
prod_list = pd.unique(df_simple['product'])
review_list = pd.unique(df_simple['reviewType'])

In [116]:
from tqdm import tqdm
# Progress-bar total: one step per (product, review type) pair for each of the 3 topics reported.
total_iterations = len(prod_list) * len(review_list) * 3


In [121]:
# Build one keyword table per (product, review type, topic) combination.
# NOTE(review): get_topic_df refits topic_model on every call, so each subset
# is fitted once per topic_num; consider fitting once per subset.
topic_frames = []

# The context manager closes the bar even if an iteration raises.
with tqdm(total=total_iterations, desc='Processing') as progress_bar:
    for product in prod_list:
        for reviewType in review_list:
            df_sub = df_simple[(df_simple['product'] == product) & (df_simple['reviewType'] == reviewType)]
            for topic_num in range(3):
                topic_frames.append(get_topic_df(df_sub, topic_num, product, reviewType))
                progress_bar.update(1)

# Concatenate once at the end instead of growing a DataFrame inside the loop
# (repeated pd.concat copies all accumulated rows on every iteration).
joined_df = pd.concat(topic_frames) if topic_frames else pd.DataFrame()

Processing:  33%|█████████▋                   | 12/36 [03:12<05:09, 12.90s/it]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processing: 100%|█████████████████████████████| 36/36 [15:11<00:00, 25.33s/it]


In [122]:
# Display the combined per-topic keyword table assembled by the loop above.
joined_df

Unnamed: 0,keyword,relevance_score,rep_doc,freq,topic_index,product,reviewType
0,ladder,0.781868,Ladder looks like it was a used ladder.,552,0,ladder,Delivery/ Packaging
1,ladders,0.720694,Ladder looks like it was a used ladder.,552,0,ladder,Delivery/ Packaging
2,step,0.328507,Ladder looks like it was a used ladder.,552,0,ladder,Delivery/ Packaging
3,sturdy,0.294912,Ladder looks like it was a used ladder.,552,0,ladder,Delivery/ Packaging
4,plastic,0.191308,Ladder looks like it was a used ladder.,552,0,ladder,Delivery/ Packaging
...,...,...,...,...,...,...,...
0,keyboard,0.563985,It has a drawer instead of the keyboard tray.,102,2,desk,Quality
1,tray,0.550895,It has a drawer instead of the keyboard tray.,102,2,desk,Quality
2,drawer,0.416834,It has a drawer instead of the keyboard tray.,102,2,desk,Quality
3,mouse,0.252429,It has a drawer instead of the keyboard tray.,102,2,desk,Quality


In [123]:
# Export the combined table to Excel; index=False leaves out the row labels,
# which just repeat 0-4 for every topic.
joined_df.to_excel("topics_final.xlsx", index=False)

In [None]:
#df_sub = df_simple[(df_simple['product'] == 'mower') & (df_simple['reviewType'] == 'Quality')]

In [110]:
# Spot-check a single combination (mower / Quality, topic 0). The inputs are
# spelled out here instead of relying on whatever values df_sub, topic_num,
# product and reviewType were left holding by previously executed cells.
df_mower_quality = df_simple[(df_simple['product'] == 'mower') & (df_simple['reviewType'] == 'Quality')]
get_topic_df(df_mower_quality, 0, 'mower', 'Quality')

Unnamed: 0,keyword,relevance_score,rep_doc,freq,topic_index,product,reviewType
0,charging,0.53417,Battery would not fully charge.,1275,0,mower,Quality
1,recharge,0.444163,Battery would not fully charge.,1275,0,mower,Quality
2,batteries,0.438387,Battery would not fully charge.,1275,0,mower,Quality
3,charger,0.435587,Battery would not fully charge.,1275,0,mower,Quality
4,plugged,0.398141,Battery would not fully charge.,1275,0,mower,Quality


In [82]:
#sample_sent = topic_token(df_sub, True)

In [83]:
# topics, probs = topic_model.fit_transform(sample_sent)
# topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4680,-1_mower_mowing_lawn_mow,"[mower, mowing, lawn, mow, worked, working, st...","[While it was working, it was nice and quiet, ..."
1,0,1290,0_charging_recharge_charger_batteries,"[charging, recharge, charger, batteries, plugg...",[Fully charged battery (took 5 hours to turn r...
2,1,541,1_lawnmower_mowing_lawn_charging,"[lawnmower, mowing, lawn, charging, batteries,...",[After fully charging I cut my front lawn with...
3,2,491,2_wheel_broke_axles_broken,"[wheel, broke, axles, broken, dropped, steerin...","[Then the front left wheel came off., The righ..."
4,3,251,3_starts_start_starting_startup,"[starts, start, starting, startup, restart, ca...","[Will not start now., It will not start., Wont..."
...,...,...,...,...,...
229,228,10,228_quality_control_manufacturer_issues,"[quality, control, manufacturer, issues, dept,...","[Very poor quality control., Quality control i..."
230,229,10,229_lawn_jammed_lifespan_cutting,"[lawn, jammed, lifespan, cutting, plowed, trim...",[It gets jammed on almost every stick in the y...
231,230,10,230_locally_walmart_purchase_shop,"[locally, walmart, purchase, shop, store, shop...","[Buy from a local store., I will purchase loca..."
232,231,10,231_mower_lawnmower_cordless_lawn,"[mower, lawnmower, cordless, lawn, cord, corde...",[I will still likely stick with a corded elect...


In [95]:
# topic_num = 0
# rep_doc = topic_model.get_representative_docs(topic= topic_num)[0]