## Reddit Scraping
Documentation reference: https://praw.readthedocs.io/en/stable/

In [14]:
import requests
import pandas as pd
import praw
from praw.models import MoreComments

import os
import zipfile
import ast
import re
# Tools for text analysis
# We can use nltk to extract adjective and verbs related to the product/brand 
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
# Vader sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Make sure the ../output directory exists before any cell writes or reads
# CSV files there; exist_ok=True keeps re-running this cell harmless.
os.makedirs("../output", exist_ok=True)
# Load variables from a .env file into the process environment; the
# REDDIT_* credentials are later read with os.getenv.
from dotenv import load_dotenv
load_dotenv()


True

# Scrape posts from related subreddits
Usage: consumer sentiment analysis

In [20]:

# Brand names used both as search queries and, later, as the terms looked for
# inside individual comments.
all_brands = [
    "Estée Lauder",
    "Fenty Beauty (by Rihanna)",
    "e.l.f. Cosmetics",
    "Tarte Cosmetics",
    "Glossier",
    "Laneige",
    "Sulwhasoo",
    "Etude House",
    "Innisfree",
    "COSRX",
]

# Subreddits to search. Other candidates that are currently not enabled:
# "Skincare_Addiction", "asianskincare", "Blackskincare", "SkincareAddicts"
subreddit_list = ["SkincareAddiction"]

# Accumulator for one dict per scraped post; filled by the search loop.
# The search loop asks for at most 20 posts per brand query (the rate limit is
# unknown, so 20 for now).
# Documentation: https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html
all_posts = list()

# Reddit API client. Every credential comes from the environment, so nothing
# secret is stored in the notebook itself.
reddit = praw.Reddit(
    username=os.getenv("REDDIT_USERNAME"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    client_id=os.getenv("REDDIT_CLIENT_ID"),
)

def is_bot(author):
    """Heuristically decide whether a comment author should be treated as a bot.

    Args:
        author: Object exposing a ``name`` string attribute, or ``None``.

    Returns:
        ``True`` when ``author`` is ``None``, when the lower-cased name is
        exactly "automoderator", or when it contains the substring "bot";
        ``False`` otherwise.
    """
    if author is not None:
        handle = author.name.lower()
        return handle == "automoderator" or "bot" in handle
    # A missing author is treated the same way as a bot.
    return True

def get_top_comments(post, max_comments=10):
    """Collect the first usable top-level comment bodies of a post.

    Comments are visited in the order ``post.comments`` yields them. A comment
    is skipped when it is a ``MoreComments`` placeholder, when ``is_bot``
    flags its author (this includes a missing author), or when its stripped
    body is exactly "[deleted]" or "[removed]" (case-insensitive).

    Args:
        post: A PRAW submission. Its comment forest is modified in place by
            ``replace_more(limit=0)``.
        max_comments: Maximum number of comment bodies to return. Defaults to
            10, the cap that was previously hard-coded.

    Returns:
        A list of stripped comment body strings, at most ``max_comments`` long.
    """
    if max_comments <= 0:
        return []

    # limit=0 drops the "load more comments" placeholders from the forest
    # (see the PRAW replace_more documentation).
    post.comments.replace_more(limit=0)

    placeholder_bodies = {"[deleted]", "[removed]"}
    top_comments = []
    for comment in post.comments:
        # The isinstance check must come first: only real comments are passed
        # on to the author/body checks.
        if isinstance(comment, MoreComments) or is_bot(comment.author):
            continue
        body = comment.body.strip()
        if body.lower() in placeholder_bodies:
            continue
        top_comments.append(body)
        if len(top_comments) >= max_comments:
            break
    return top_comments
        
# Search every subreddit for every brand and record one dict per returned post.
for sub in subreddit_list:
    for brand in all_brands:
        # The brand is wrapped in double quotes, presumably so that multi-word
        # names are searched as a phrase -- confirm against Reddit search syntax.
        query = f'"{brand}"'
        # Errors are handled per brand: a failing query must not discard the
        # remaining brands of this subreddit. Posts appended before the
        # failure are kept.
        try:
            for post in reddit.subreddit(sub).search(query, limit=20):
                all_posts.append({
                    "subreddit_name": sub,
                    "post_id": post.id,
                    "title": post.title,
                    "description": post.selftext,
                    "score": post.score,
                    "num_comments": post.num_comments,
                    "top_comments": get_top_comments(post),
                    "upvote_ratio": post.upvote_ratio,
                    # The search term; comments inside the post may mention
                    # other brands as well.
                    "brand": brand,
                })
        except Exception as e:
            print(f"Error scraping {sub} for brand {brand!r}: {e}")
            continue

# Persist the raw scrape so later cells can reload it without hitting the API.
subreddit_df = pd.DataFrame(all_posts)
subreddit_df.to_csv("../output/subreddit_data.csv", index=False)



KeyboardInterrupt: 

# Comment text cleaning 

In [21]:
# Reload the scraped posts from disk. The top_comments column comes back from
# the CSV as text, so each cell is parsed back into a Python list.
skincare_df = pd.read_csv("../output/subreddit_data.csv")
skincare_df["top_comments"] = skincare_df["top_comments"].map(ast.literal_eval)
# Look at the first post's comments; more cleaning is needed before
# sentiment analysis.
skincare_df["top_comments"][0]

# Stop words: NLTK's English list plus any project-specific additions.
custom_stopwords = []
stopword_set = {*nltk_stopwords.words('english'), *custom_stopwords}
# Stemming is currently disabled:
# stemmer = SnowballStemmer("english")

def preprocess_comment(comment):
    """Normalize one comment for text analysis.

    The comment is lower-cased and tokenized with NLTK's ``word_tokenize``.
    Only purely alphabetic tokens that are not in ``stopword_set`` are kept.

    Args:
        comment: Raw comment text.

    Returns:
        The kept tokens joined by single spaces (an empty string when nothing
        survives the filter).
    """
    tokens = word_tokenize(comment.lower())
    # Filter against the notebook's stopword_set. The bare name `stopwords`
    # is not defined anywhere here (the NLTK corpus is imported as
    # nltk_stopwords), so using it raises NameError on a fresh kernel.
    kept_tokens = [t for t in tokens if t.isalpha() and t not in stopword_set]
    return " ".join(kept_tokens)

# Clean every comment of every post. Each row receives a list of cleaned
# strings in the same order as its top_comments list.
skincare_df["cleaned_comments"] = skincare_df["top_comments"].map(
    lambda post_comments: list(map(preprocess_comment, post_comments))
)

skincare_df

Unnamed: 0,subreddit_name,post_id,title,description,score,num_comments,top_comments,upvote_ratio,brand,cleaned_comments
0,SkincareAddiction,8qkw0m,[B&A] FIRST TIME POSTER!!! Only 2 months of us...,,5077,268,[Your skin looks great but the best part of th...,0.98,Estée Lauder,"[skin looks great best part picture smile, dai..."
1,SkincareAddiction,8fci2y,[MISC] Brandon Truaxe sent a series of emails ...,,179,152,[AMA Request: Someone who works for Deciem. I ...,0.93,Estée Lauder,[ama request someone works deciem ca imagine l...
2,SkincareAddiction,1i5q8th,[Product question] Dupe for Estée Lauder Advan...,I got this half off at Ulta based on the esthe...,9,7,"[Missha time revolution night repair ampoule, ...",1.00,Estée Lauder,"[missha time revolution night repair ampoule, ..."
3,SkincareAddiction,ttbud1,[Misc] Anyone got any tips/product recommendat...,,35,54,[I would use incredible basic ingredients and ...,0.91,Estée Lauder,[would use incredible basic ingredients stay a...
4,SkincareAddiction,xyfyv9,[Product Question] Is there an eye repair crea...,I’m required to wear makeup five days a week f...,8,45,[I’m required to wear makeup five days a week ...,0.79,Estée Lauder,[required wear makeup five days week hours day...
...,...,...,...,...,...,...,...,...,...,...
155,SkincareAddiction,1jtorfw,[Product Question] COSRX Snail Mucin Alternative,COSRX Snail Mucin Alternative\n\nCOSRX Snail M...,2,5,[I have been using rice products recently and ...,0.75,COSRX,[using rice products recently milky goo rice p...
156,SkincareAddiction,1kapv9w,[Product Question] Is the Cosrx snail mucin co...,\nI am currently using salicylic acid (stridex...,2,2,[Yes generally the Cosrx snail mucin essence i...,1.00,COSRX,[yes generally cosrx snail mucin essence gentl...
157,SkincareAddiction,1hzmf86,When to use the cosrx good morning low ph clea...,Hi! I’m new to skincare and I was wondering if...,8,15,"[You can use this daily, retinol or no retinol...",1.00,COSRX,[use daily retinol retinol best thing listen s...
158,SkincareAddiction,1jl202e,[Routine Help] La roche posay gentle face wash...,As the title says. I use all of these hydratin...,1,6,[Is it only during the day time and not after ...,0.57,COSRX,"[day time night routine, use sort occlusive ni..."


# Comment analysis

Posts are found by searching for a brand keyword; however, the comments within a single post often mention several different brands.
Plan: break the comments out into one row per comment, tag each row with the brand(s) it directly mentions and with its sentiment score, and keep `search_term` as a reference to the brand that was searched for.

In [None]:
analyzer = SentimentIntensityAnalyzer()
comment_rows = []

# One compiled, lower-cased pattern per brand, built once instead of once per
# comment. (?<!\w) and (?!\w) replace \b because \b only matches next to a
# word character: a brand ending in ")" such as "Fenty Beauty (by Rihanna)"
# could otherwise only match when immediately followed by a letter or digit.
# NOTE(review): accents are not folded, so "estee lauder" does not match
# "Estée Lauder".
brand_patterns = [
    (brand, re.compile(rf"(?<!\w){re.escape(brand.lower())}(?!\w)"))
    for brand in all_brands
]

# Tag each comment with the brands it mentions and with its sentiment score.
# A comment mentioning several brands yields one row per brand; a comment
# mentioning none yields a single row whose brand is the string "None".
for _, row in skincare_df.iterrows():
    for comment in row["top_comments"]:
        comment_lower = comment.lower()
        mentioned_brands = [
            brand for brand, pattern in brand_patterns
            if pattern.search(comment_lower)
        ]
        if not mentioned_brands:
            mentioned_brands = ["None"]

        # The sentiment depends only on the comment, so it is scored once
        # rather than once per mentioned brand.
        sentiment_score = analyzer.polarity_scores(comment)["compound"]
        # Inclusive cut-offs follow the thresholds documented for VADER's
        # compound score (>= 0.05 positive, <= -0.05 negative).
        if sentiment_score >= 0.05:
            sentiment_label = "positive"
        elif sentiment_score <= -0.05:
            sentiment_label = "negative"
        else:
            sentiment_label = "neutral"

        for brand in mentioned_brands:
            comment_rows.append({
                "post_id": row["post_id"],
                "comment": comment,
                "brand_mentioned": brand,
                # The brand whose search returned the post.
                "search_term": row["brand"],
                "subreddit": row["subreddit_name"],
                "sentiment_score": sentiment_score,
                "sentiment_label": sentiment_label
            })
comment_df = pd.DataFrame(comment_rows)



<bound method DataFrame.value_counts of      post_id                                            comment  \
0     8qkw0m  Your skin looks great but the best part of thi...   
1     8qkw0m  Here is my Daily Routine:\n\nIn the AM/PM\n1.)...   
2     8qkw0m                                   Looks SO pretty!   
3     8qkw0m  Gorgeous! You also have such a stunning smile,...   
4     8qkw0m  You are so beautiful and happy and bubbly in t...   
..       ...                                                ...   
507  1hzmf86  I love this cleanser. Very gentle and super mo...   
508  1jl202e  Is it only during the day time and not after y...   
509  1jl202e  Do you use any sort of occlusive at night? I a...   
510  1iytc6q  No one will be able to tell you for sure, it's...   
511  1iytc6q  https://preview.redd.it/yafvwf56tile1.jpeg?wid...   

    brand_mentioned   search_term          subreddit  sentiment_score  \
0              None  Estée Lauder  SkincareAddiction           0.9168   
1      Es