## Reddit Scraping
Documentation reference: https://praw.readthedocs.io/en/stable/

In [14]:
import requests
import pandas as pd
import praw
from praw.models import MoreComments

import os
import zipfile
import ast
import re
# Tools for text analysis
# NLTK can be used to extract the adjectives and verbs related to a product/brand
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
# Vader sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


from dotenv import load_dotenv

# Make sure the output folder exists; exist_ok keeps re-runs from failing.
os.makedirs("../output", exist_ok=True)

# Load variables from a .env file into the process environment; the
# credentials are read later with os.getenv.
load_dotenv()


True

# Scrape posts from related subreddits
Usage: consumer sentiment analysis

In [7]:

# Reddit API client. Every credential comes from the environment, so
# nothing secret is stored in the notebook itself.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
    username=os.getenv("REDDIT_USERNAME"),
)

# Subreddits to search. Other candidates that are currently not enabled:
# "Skincare_Addiction", "asianskincare", "Blackskincare", "SkincareAddicts"
subreddit_list = ["SkincareAddiction"]

# Brand names used as search terms. The scraping loop requests at most 20
# posts per query (rate limit unknown, so 20 for now).
# Documentation: https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html
# NOTE(review): "Fenty Beauty (by Rihanna)" is unlikely to appear verbatim in
# posts or comments -- confirm whether a shorter search term is intended.
all_brands = [
    "Estée Lauder",
    "Fenty Beauty (by Rihanna)",
    "e.l.f. Cosmetics",
    "Tarte Cosmetics",
    "Glossier",
    "Laneige",
    "Sulwhasoo",
    "Etude House",
    "Innisfree",
    "COSRX",
]

# One dict per scraped post; filled in by the scraping loop.
all_posts = []

def is_bot(author):
    """Decide whether a comment author should be treated as a bot.

    Args:
        author: Object with a ``name`` attribute, or ``None`` when the
            author is missing.

    Returns:
        bool: ``True`` when ``author`` is ``None``, when the lower-cased name
        equals "automoderator", or when it contains the substring "bot";
        ``False`` otherwise.
    """
    if author is not None:
        lowered = author.name.lower()
        # Plain substring check, so a name such as "abbott" is flagged too.
        return lowered == "automoderator" or "bot" in lowered
    return True

def get_top_comments(post, max_comments=5):
    """Collect the text of the first usable comments on a post.

    Comments are taken in the order produced by iterating ``post.comments``.
    Entries written by bots (see ``is_bot``) and entries whose body is
    "[deleted]" or "[removed]" are skipped.

    Args:
        post: Object exposing ``comments`` with a ``replace_more`` method
            (a PRAW submission in this notebook).
        max_comments: Maximum number of comment bodies to return.
            Defaults to 5; a value of 0 or less yields an empty list.

    Returns:
        list[str]: Stripped comment bodies, at most ``max_comments`` long.
    """
    # limit=0 asks PRAW to drop the "load more comments" placeholders
    # instead of fetching them, which avoids extra API requests.
    post.comments.replace_more(limit=0)

    top_comments = []
    for comment in post.comments:
        # Check the cap first so max_comments=0 returns nothing.
        if len(top_comments) >= max_comments:
            break
        # Defensive: placeholders should already be gone after replace_more.
        if isinstance(comment, MoreComments):
            continue
        if is_bot(comment.author):
            continue
        body = comment.body.strip()
        if body.lower() in ("[deleted]", "[removed]"):
            continue
        top_comments.append(body)
    return top_comments
        
# Search every subreddit once per brand and record one row per returned post.
for sub in subreddit_list:
    for brand in all_brands:
        # The brand is wrapped in double quotes (presumably so Reddit treats
        # it as an exact phrase -- TODO confirm against the search syntax).
        query = f'"{brand}"'
        # The try block is scoped to a single brand query so that one failing
        # request does not discard the remaining brands for this subreddit.
        try:
            post_collection = reddit.subreddit(sub).search(query, limit=20)
            for post in post_collection:
                top_comments = get_top_comments(post)
                all_posts.append({
                    "subreddit_name": sub,
                    "post_id": post.id,
                    "title": post.title,
                    "description": post.selftext,
                    "score": post.score,
                    "num_comments": post.num_comments,
                    "top_comments": top_comments,
                    "upvote_ratio": post.upvote_ratio,
                    "brand": brand,
                })
        except Exception as e:
            # Best-effort scrape: report the failure and move on. Posts that
            # were appended before the error are kept.
            print(f"Error scraping {sub} ({brand}): {e}")

# Persist the results so later cells can reload them without re-scraping.
subreddit_df = pd.DataFrame(all_posts)
subreddit_df.to_csv("../output/subreddit_data.csv", index=False)



# Comment text cleaning 

In [16]:
# Reload the scraped posts from disk.
skincare_df = pd.read_csv("../output/subreddit_data.csv")

# The list column comes back from the CSV as text, so parse each cell back
# into a Python list.
skincare_df["top_comments"] = skincare_df["top_comments"].map(ast.literal_eval)

# A bit more readable now; more cleaning is still needed before sentiment analysis.
skincare_df["top_comments"][0]

# English stop words from NLTK plus any project-specific additions.
custom_stopwords = []
stopword_set = set(nltk_stopwords.words('english')) | set(custom_stopwords)
# stemmer = SnowballStemmer("english")

def preprocess_comment(comment):
    """Normalize a single comment for text analysis.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    Tokens that are not purely alphabetic, or that appear in the
    module-level ``stopword_set``, are dropped.

    Args:
        comment: Raw comment text (str).

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(comment.lower())
    # Filter against stopword_set: the name `stopwords` is never defined in
    # this notebook (the NLTK corpus is imported as `nltk_stopwords`), so the
    # previous lookup raised NameError on a fresh kernel.
    kept_tokens = [t for t in tokens if t.isalpha() and t not in stopword_set]
    return " ".join(kept_tokens)

# Clean every comment of every post; each row keeps a list that parallels
# its "top_comments" list.
skincare_df["cleaned_comments"] = skincare_df["top_comments"].map(
    lambda post_comments: list(map(preprocess_comment, post_comments))
)

# Preview the cleaned comments of the first post.
skincare_df["cleaned_comments"][0]

['skin looks great best part picture smile',
 'daily routine use pumps clinique take day cleansing oil purple bottle rub face product used skin types oily friends need worry wash oil face warm wash cloth wash face caress daily silk beauty bar regular ol bar soap really cleanses skin gentle dry face towel put two drops estée lauder advanced night repair face little bit goes longgggg way plus try conserve due priciness product rubbed let sit dry kinda tacky feel put dime half dry skin feel skin needs dime lol sunday riley tidal cream repeat step night well skin type since using products mostly normal dry patches appearing exfoliated exfoliate every days using skin medica exfoliating cleanser questions let know love answer edit use kiehl super fluid daily uv defense sunscreen spf face advanced night repair serum applying tidal cream sunscreen bit ashy skin complexion great suggestions sunscreens could use would awesome sorry left',
 'looks pretty',
 'gorgeous also stunning smile girl',
 '

# Comment analysis

The posts are found by searching for brand keywords; however, the comments within each post may mention various brands.
The plan is to break the comments out into rows, tag each row with the brand it directly mentions and its sentiment score, and keep `search_term` as a reference to the brand that was searched for.

In [None]:
analyzer = SentimentIntensityAnalyzer()
comment_rows = []

# Compile one pattern per brand up front instead of rebuilding the regex for
# every comment. Lookarounds replace \b so that brands starting or ending with
# a non-word character (e.g. a closing parenthesis) can still match; for
# brands bounded by word characters the two forms are equivalent.
brand_patterns = [
    (brand, re.compile(rf"(?<!\w){re.escape(brand.lower())}(?!\w)"))
    for brand in all_brands
]

# Tag each comment with the brands it mentions and attach its sentiment.
# A comment mentioning several brands produces one row per brand; a comment
# mentioning none produces a single row tagged "None".
for _, row in skincare_df.iterrows():
    for comment in row["top_comments"]:
        comment_lower = comment.lower()
        mentioned_brands = [
            brand for brand, pattern in brand_patterns
            if pattern.search(comment_lower)
        ] or ["None"]

        # The score depends only on the comment, so compute it once rather
        # than once per mentioned brand.
        sentiment_score = analyzer.polarity_scores(comment)["compound"]
        if sentiment_score > 0.05:
            sentiment_label = "positive"
        elif sentiment_score < -0.05:
            sentiment_label = "negative"
        else:
            sentiment_label = "neutral"

        for brand in mentioned_brands:
            comment_rows.append({
                "post_id": row["post_id"],
                "comment": comment,
                "brand_mentioned": brand,
                "search_term": row["brand"],
                "subreddit": row["subreddit_name"],
                "sentiment_score": sentiment_score,
                "sentiment_label": sentiment_label,
            })
comment_df = pd.DataFrame(comment_rows)



<bound method DataFrame.value_counts of      post_id                                            comment  \
0     8qkw0m  Your skin looks great but the best part of thi...   
1     8qkw0m  Here is my Daily Routine:\n\nIn the AM/PM\n1.)...   
2     8qkw0m                                   Looks SO pretty!   
3     8qkw0m  Gorgeous! You also have such a stunning smile,...   
4     8qkw0m  You are so beautiful and happy and bubbly in t...   
..       ...                                                ...   
507  1hzmf86  I love this cleanser. Very gentle and super mo...   
508  1jl202e  Is it only during the day time and not after y...   
509  1jl202e  Do you use any sort of occlusive at night? I a...   
510  1iytc6q  No one will be able to tell you for sure, it's...   
511  1iytc6q  https://preview.redd.it/yafvwf56tile1.jpeg?wid...   

    brand_mentioned   search_term          subreddit  sentiment_score  \
0              None  Estée Lauder  SkincareAddiction           0.9168   
1      Es