# Comment text cleaning 
1. Break each comment down into as a individual row 
2. Preprocess each row
3. Export cleaned comment data 

In [45]:

import pandas as pd
import os

import re
import ast

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
output_path = "../../../data/alignment_analysis/"
os.makedirs(output_path, exist_ok=True)


### Load Subreddit data

In [51]:
subreddit_df = pd.read_csv(output_path + "subreddit_data.csv")
subreddit_df.head()

Unnamed: 0,subreddit_name,post_id,title,description,score,num_comments,top_comments,upvote_ratio,brand
0,SkincareAddiction,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Anyone have idea what could be best use of thi...,3,4,"['Return it? Give it to someone?', 'You can tr...",1.0,Estée Lauder
1,SkincareAddiction,psd79w,[Product Question] Estée Lauder Advanced Night...,So I have started my first bottle of this seru...,7,24,"[""I've used this serum for years and it's one ...",1.0,Estée Lauder
2,SkincareAddiction,wly2yu,[Product Question] Estée Lauder night repair s...,I have a coupon for a free sample and I’m thin...,3,4,"[""I absolutely love it and I've used it for ye...",1.0,Estée Lauder
3,SkincareAddiction,lx8cl1,[product question] Estée Lauder ANR smell,I have a couple of sample bottles of Estée Lau...,4,5,"['A few *years* old?\n\nYeah, dump that.', 'Yo...",1.0,Estée Lauder
4,SkincareAddiction,1i5q8th,[Product question] Dupe for Estée Lauder Advan...,I got this half off at Ulta based on the esthe...,8,7,['Missha time revolution night repair ampoule'...,1.0,Estée Lauder


In [47]:
all_brand_df = pd.read_csv(f"../../../data/all_brands.csv")
all_brands = set(all_brand_df["brand_name"].dropna().str.lower().unique())

### Break each comment into a row

In [52]:
subreddit_df["top_comments"] = subreddit_df["top_comments"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
def mentions_other_brand(comment, current_brand):
    comment = str(comment).lower()
    current_brand = str(current_brand).lower()
    for brand in all_brands:
        if brand != current_brand and brand in comment:
            return True
    return False

rows = []
for _, row in subreddit_df.iterrows():
    for comment in row["top_comments"]:
        rows.append({
            "post_id": row["post_id"],
            "title": row["title"],
            "brand": row["brand"],
            "subreddit_name": row["subreddit_name"],
            "comment": comment
        })
        
comments_df = pd.DataFrame(rows)
print(comments_df.shape)
# filter out comments that mention other brands
comments_df = comments_df[~comments_df["comment"].apply(lambda x: mentions_other_brand(x, comments_df["brand"]))]
print(comments_df.shape)
comments_df.head()



(3196, 5)
(1304, 5)


Unnamed: 0,post_id,title,brand,subreddit_name,comment
0,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,Return it? Give it to someone?
1,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,You can try to use it on your chest area to se...
2,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,I've used this serum for years and it's one of...
3,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,"The more you use, the faster you use it up and..."
5,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,Hi am I the only one who feel like my serum bo...


### Preprocess comment

In [53]:
custom_stopwords = [
    # Brand names (removed from analysis)
    'estee', 'lauder', 'tarte', 'fenty', 'glossier', 'cosrx', 'etude',
    'sulwhasoo', 'laneige', 'innisfree', 'elf',

    # Platform-related
    'video', 'youtube', 'tiktok', 'instagram', 'reel', 'feed',
    'post', 'stories', 'caption', 'social', 'media',

    # Engagement / action words
    'like', 'likes', 'comment', 'comments', 'share', 'save', 'follow', 'subscribe',
    'tag', 'click', 'link', 'bio', 'visit', 'dm', 'available', 'check',

    # Time / filler
    'today', 'now', 'new', 'soon', 'launch', 'launching', 'stay', 'tune', 'coming', 'back',

    # General beauty-related terms
    'beauty', 'skin', 'skincare', 'routine', 'makeup', 'product', 'products',
    'face', 'body', 'glow', 'look', 'formula', 'texture', 'result',

    # Emoji / symbols
    '✨', '🔥', '💧', '💫', '😍', '💖', '🌟', '💥', '🧴', '📦', '🛍️',

    # Overused positive adjectives
    'feel', 'love', 'use', 'try', 'amazing', 'favorite', 'best', 'perfect', 'must', 'obsessed',

    # Promotional terms
    'shop', 'buy', 'discount', 'deal', 'sale', 'off', 'gift', 'giveaway', 'free', 'offer',

    # Conversation filler
    'hey', 'hello', 'welcome', 'thank', 'you', 'everyone', 'guys', 'hi', 'omg', 'pls', 'yay', 'get', 'got', 'let', 'us',
    "follow", "like", "likes", "just", "really", "thanks", "omg",
    "hi", "hey", "pls", "please", "gonna", "tbh", "honestly",
    "lol", "lmao", "idk", "link", "click", "watch", "dm", "recommend",
]

stopword_set = set(nltk_stopwords.words('english')).union(custom_stopwords)

def preprocess_comment(comment):
    if not isinstance(comment, str):
        return ""
    # lower case 
    comment= comment.lower()
    comment = re.sub(r"http\S+|www\S+|@\w+|#\w+|u/\w+", "", comment)
    comment = re.sub(r"[^a-z\s]", " ",comment)
    comment = re.sub(r"\s+", " ", comment).strip()
    tokens = word_tokenize(comment)
    
    # remove stopwords 
    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stopword_set and len(t) > 2] 
    return " ".join(filtered_tokens)

comments_df["cleaned_comment"] = comments_df["comment"].apply(preprocess_comment)

comments_df.to_csv(output_path + "subreddit_comment_data.csv", index=False)
comments_df.head()
comments_df.shape

(1304, 6)