## Reddit Scraping
Documentation reference: https://praw.readthedocs.io/en/stable/

In [106]:
import requests
import pandas as pd
import praw
from praw.models import MoreComments

import os
import zipfile
import ast
import re
# Tools for text analysis
# We can use nltk to extract adjective and verbs related to the product/brand 
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
# Vader sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


os.makedirs("../output", exist_ok=True)
from dotenv import load_dotenv
load_dotenv()


True

# Scrape post from related subreddits
Usage: Consumer sentiment analysis 

In [98]:

reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
    username=os.getenv("REDDIT_USERNAME"),
)

# "Skincare_Addiction", "asianskincare", "Blackskincare","SkincareAddicts"
subreddit_list = ["SkincareAddiction",]

all_posts = []

# get the top 20 post from each subreddit (don't know rate limit so 20 for now)
# Documentation: https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html
all_brands = ['Estée Lauder',
'Fenty Beauty (by Rihanna)',
'e.l.f. Cosmetics',
'Tarte Cosmetics',
'Glossier',
'Laneige',
'Sulwhasoo',
'Etude House',
'Innisfree',
'COSRX',
]

def is_bot(author):
    if author is None:
        return True
    name = author.name.lower()
    return "bot" in name or name == "automoderator"

def get_top_comments(post):

    post.comments.replace_more(limit=0)  

    top_comments = []
    for comment in post.comments:
        if isinstance(comment, MoreComments):
            continue 
        if is_bot(comment.author):
            continue
        if comment.body.strip().lower() in ["[deleted]", "[removed]"]:
            continue 
        top_comments.append(comment.body.strip())
        if len(top_comments) == 5:
            break
    return top_comments
        
for sub in subreddit_list:
    try:
        for brand in all_brands:
            query = f'"{brand}"'
            post_collection = reddit.subreddit(sub).search(query, limit=20)  
            for post in post_collection: 
                top_comments = get_top_comments(post)
                all_posts.append({
                "subreddit_name": sub,
                "post_id": post.id,
                "title": post.title,
                "description": post.selftext,
                "score": post.score,
                "num_comments": post.num_comments,
                "top_comments": top_comments,
                "upvote_ratio": post.upvote_ratio,
                "brand": brand
            })
    except Exception as e:
        print(f"Error scraping {sub}: {e}")
        continue
    
subreddit_df = pd.DataFrame(all_posts)
subreddit_df
subreddit_df.to_csv("../output/subreddit_data.csv", index=False)



In [97]:
skincare_df = pd.read_csv("../output/subreddit_data.csv")
skincare_df["top_comments"] = skincare_df["top_comments"].apply(ast.literal_eval)
# make it a bit more readable, need do More cleaning for sentiment analysis 
skincare_df["top_comments"][0]
# skincare_df.to_csv("../output/subreddit_data.csv", index=False)



['Your skin looks great but the best part of this picture is your smile!',
 'Here is my Daily Routine:\n\nIn the AM/PM\n1.) I use 3 pumps of the Clinique Take Off the Day Cleansing Oil (purple bottle) and rub it all over my face. This product can be used on all skin types so my oily friends don’t need to worry! \n2.) I wash the oil off my face with a warm wash cloth\n3.) I then wash my face with a Caress Daily Silk Beauty Bar, for regular ol bar soap really cleanses my skin and is so gentle! \n3.) I dry my face with a towel and put on two drops of my Estée Lauder Advanced Night Repair all over my face! A little bit goes a longgggg way! Plus I try to conserve it due to the priciness of the product! \n4.) Once that’s rubbed in I let it sit until it’s dry (has a kinda tacky feel to it) and then I put on a dime and a half (I have dry skin so I feel as if my skin needs more than just a dime lol) of the SUNDAY RILEY Tidal Cream! \n\nThat’s it!! I repeat the same step at night as well! \n\nMy

# Comment analysis

The post are searched by keywords, however within each post various brands are mentioned. 
Thinking of breaking comment into rows and tag by directly related brands and its sentiment score and keep search_term as brand search reference

In [113]:
analyzer = SentimentIntensityAnalyzer() 
comment_rows = []

# tagging brand to the comment and getting the sentiment score
for _,row in skincare_df.iterrows():
    for comment in row["top_comments"]:
        comment_lower = comment.lower()
        mentioned_brands = [b for b in all_brands if re.search(rf"\b{re.escape(b.lower())}\b", comment_lower)]
        if not mentioned_brands:
            mentioned_brands = ["None"]
        for brand in mentioned_brands:
            sentiment_score = analyzer.polarity_scores(comment)["compound"]
            sentiment_label = (
                "positive" if sentiment_score > 0.05
                else "negative" if sentiment_score < -0.05
                else "neutral"
            )
            comment_rows.append({
                "post_id": row["post_id"],
                "comment": comment,
                "brand_mentioned": brand,
                "search_term": row["brand"],
                "subreddit": row["subreddit_name"],
                "sentiment_score": sentiment_score,
                "sentiment_label": sentiment_label
            })
comment_df = pd.DataFrame(comment_rows)
comment_df

Unnamed: 0,post_id,comment,brand_mentioned,search_term,subreddit,sentiment_score,sentiment_label
0,8qkw0m,[,,Estée Lauder,SkincareAddiction,0.0000,neutral
1,8qkw0m,',,Estée Lauder,SkincareAddiction,0.0000,neutral
2,8qkw0m,Y,,Estée Lauder,SkincareAddiction,0.0000,neutral
3,8qkw0m,o,,Estée Lauder,SkincareAddiction,0.0000,neutral
4,8qkw0m,u,,Estée Lauder,SkincareAddiction,0.0000,neutral
...,...,...,...,...,...,...,...
58191,11fam7e,u,,Glossier,SkincareAddiction,0.0000,neutral
58192,11fam7e,l,,Glossier,SkincareAddiction,0.4588,positive
58193,11fam7e,a,,Glossier,SkincareAddiction,0.0000,neutral
58194,11fam7e,',,Glossier,SkincareAddiction,0.0000,neutral
