## Reddit Scraping
Documentation reference: https://praw.readthedocs.io/en/stable/

In [1]:
import requests
import pandas as pd
import praw
from praw.models import MoreComments

import os
import zipfile
import ast
import re
# Tools for text analysis
# nltk can be used to extract the adjectives and verbs related to a product/brand
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
# Vader sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from rapidfuzz import process, fuzz

os.makedirs("../output", exist_ok=True)
from dotenv import load_dotenv
load_dotenv()


True

# Scrape posts from related subreddits
Focus: collect the raw, brand-related post data from several subreddits

In [34]:

# Brands whose posts we want to collect.
# TODO: add more name variations to widen the search queries.
target_brands = [
    "Estée Lauder",
    "Fenty Beauty",
    "e.l.f. Cosmetics",
    "Tarte Cosmetics",
    "Tarte",
    "Glossier",
    "Laneige",
    "Sulwhasoo",
    "Etude House",
    "Innisfree",
    "COSRX",
]

# Subreddits to scrape. Other candidates, not enabled yet:
# "Skincare_Addiction", "asianskincare", "Blackskincare", "SkincareAddicts"
subreddit_list = ["SkincareAddiction", "Sephora"]

# Accumulates one dict per collected post.
all_posts = []

# Searches are capped at 20 posts each for now because the rate limit is unknown.
# Documentation: https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html

# Reddit API client; every credential is read from an environment variable.
reddit_credentials = {
    "client_id": os.getenv("REDDIT_CLIENT_ID"),
    "client_secret": os.getenv("REDDIT_CLIENT_SECRET"),
    "user_agent": os.getenv("REDDIT_USER_AGENT"),
    "username": os.getenv("REDDIT_USERNAME"),
}
reddit = praw.Reddit(**reddit_credentials)

def is_bot(author):
    """Return True when a comment author should be treated as a bot.

    A missing author (``None``) counts as a bot. Otherwise the decision is
    made on the lower-cased ``author.name``: it is flagged when it equals
    "automoderator" or contains the substring "bot" anywhere.
    """
    if author is None:
        return True

    lowered_name = author.name.lower()
    if lowered_name == "automoderator":
        return True
    # Plain substring test, so names such as "robotlover" are flagged too.
    return "bot" in lowered_name

# Match by header title (open question: should the post body be matched too?)
def brand_word_match(text, brand_list, threshold=85):
    """Return the brand in *brand_list* that best fuzzy-matches *text*.

    The comparison uses ``fuzz.partial_ratio`` and is case-insensitive: both
    the text and every brand are lower-cased before scoring, so a title such
    as "cosrx snail mucin" still matches the brand "COSRX".

    Args:
        text: Text to search for a brand mention (a post title here).
        brand_list: Candidate brand names.
        threshold: The best score must be strictly greater than this value
            for the match to count.

    Returns:
        The matching brand exactly as written in *brand_list*, or ``None``
        when the text or the brand list is empty or no brand scores above
        *threshold*.
    """
    if not text or not brand_list:
        return None

    # extractOne returns a tuple whose first item is the original
    # (unprocessed) choice and whose second item is the score.
    match = process.extractOne(
        text,
        brand_list,
        scorer=fuzz.partial_ratio,
        processor=str.lower,
    )
    if match and match[1] > threshold:
        return match[0]
    return None


def get_top_comments(post, max_comments=10):
    """Collect the text of the first usable top-level comments of *post*.

    Comments are taken in the order ``post.comments`` yields them. A comment
    is skipped when ``is_bot`` flags its author (this includes a missing
    author) or when its body is the "[deleted]" / "[removed]" placeholder.

    Args:
        post: Submission object exposing ``comments``.
        max_comments: Maximum number of comment bodies to return.

    Returns:
        A list of at most *max_comments* whitespace-stripped comment bodies.
    """
    # limit=0: discard the "load more comments" placeholders instead of
    # fetching them (see the PRAW replace_more documentation).
    post.comments.replace_more(limit=0)

    placeholder_bodies = {"[deleted]", "[removed]"}
    top_comments = []
    for comment in post.comments:
        if len(top_comments) >= max_comments:
            break
        # Defensive: skip any placeholder that is still present.
        if isinstance(comment, MoreComments):
            continue
        if is_bot(comment.author):
            continue
        body = comment.body.strip()
        if body.lower() in placeholder_bodies:
            continue
        top_comments.append(body)
    return top_comments


reddit_tags = ['[Product Question]', '[Review]']

# (post_id, brand) pairs already stored. The two tag queries for one brand can
# return the same post, which would otherwise be stored (and have its comments
# fetched) twice.
seen_posts = set()

for sub in subreddit_list:
    for brand in target_brands:
        for tag in reddit_tags:
            # Wrapped in double quotes, presumably so that tag + brand are
            # searched as one phrase.
            query = f'"{tag} {brand}"'.lower()
            # The try block covers a single query, so a failure only loses
            # that query instead of every remaining brand/tag combination for
            # this subreddit. The loop is inside the try because results are
            # fetched while iterating (see the PRAW listing documentation).
            try:
                post_collection = reddit.subreddit(sub).search(
                    query, limit=20, sort="relevance"
                )
                for post in post_collection:
                    post_key = (post.id, brand)
                    if post_key in seen_posts:
                        continue
                    # Keep only posts whose title fuzzy-matches a target brand.
                    # NOTE(review): the title is checked against every brand
                    # in target_brands, while the stored "brand" is the brand
                    # that was searched for. Confirm this is intended.
                    if not brand_word_match(post.title, target_brands):
                        continue
                    seen_posts.add(post_key)
                    all_posts.append({
                        "subreddit_name": sub,
                        "post_id": post.id,
                        "title": post.title,
                        "description": post.selftext,
                        "score": post.score,
                        "num_comments": post.num_comments,
                        "top_comments": get_top_comments(post),
                        "upvote_ratio": post.upvote_ratio,
                        "brand": brand,
                    })
            except Exception as e:
                # Best-effort scrape: report the failed query and move on.
                print(f"Error scraping {sub} ({query}): {e}")

subreddit_df = pd.DataFrame(all_posts)

# subreddit_df.to_csv("../output/subreddit_data.csv", index=False)



Unnamed: 0,subreddit_name,post_id,title,description,score,num_comments,top_comments,upvote_ratio,brand
0,SkincareAddiction,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Anyone have idea what could be best use of thi...,3,4,"[Return it? Give it to someone?, You can try t...",1.00,Estée Lauder
1,SkincareAddiction,psd79w,[Product Question] Estée Lauder Advanced Night...,So I have started my first bottle of this seru...,7,24,[I've used this serum for years and it's one o...,1.00,Estée Lauder
2,SkincareAddiction,wly2yu,[Product Question] Estée Lauder night repair s...,I have a coupon for a free sample and I’m thin...,3,4,[I absolutely love it and I've used it for yea...,1.00,Estée Lauder
3,SkincareAddiction,cszttj,[product question] Estée Lauder Advanced Night...,alot good Reviews is worth HYPE ?\n\n4.9/5 292...,13,12,[Nope! I am lucky that my sister works in the ...,1.00,Estée Lauder
4,SkincareAddiction,lx8cl1,[product question] Estée Lauder ANR smell,I have a couple of sample bottles of Estée Lau...,5,5,"[A few *years* old?\n\nYeah, dump that., You d...",1.00,Estée Lauder
...,...,...,...,...,...,...,...,...,...
132,SkincareAddiction,mhmqkd,[Acne]REVIEWING COSRX Acne Pimple Master Patch,I did receive these products for FREE in excha...,0,1,[],0.20,COSRX
133,Sephora,mmh3si,[Review] Fenty Beauty Eaze Drop Blurring Skin ...,"**Skin type:** combo (oily in the t-zone, norm...",65,20,[Oily and currently dehydrated here! I have MA...,0.99,Fenty Beauty
134,Sephora,1fvkbfc,Review: Glossier Rêve and Doux Perfumes,I went in-store today to try the new Glossier ...,55,28,[I am so tempted to buy both but I'm trying to...,0.93,Glossier
135,Sephora,1fx72ac,My review: Glossier Reve perfume,I am a huge gourmand perfume person. I wanted ...,23,6,[I notice my designer fragrances last longer t...,0.93,Glossier


In [40]:
subreddit_df[subreddit_df['brand'] == 'Tarte']

Unnamed: 0,subreddit_name,post_id,title,description,score,num_comments,top_comments,upvote_ratio,brand
9,SkincareAddiction,hvtpmx,[Product Question] Tarte Maracuja Neck Treatment,So this neck cream is absolutely frustrating. ...,1,3,[Can you just use the neck cream at night when...,1.0,Tarte
10,SkincareAddiction,eycxf2,[Product Question] Tarte Tingle Treatment - Is...,I used the Tarte tingling treatment toner for ...,2,4,[I used the Tarte Tingling Knockout Treatment ...,1.0,Tarte
11,SkincareAddiction,8dbkgp,[Product question] Tarte Knockout Reaction,Found several post about tingling sensation. I...,2,9,[You burned and turned red - that is a reactio...,0.75,Tarte
12,SkincareAddiction,gtyath,[Product Question] Tarte Collagen Super Serum,So I'm new to getting into skin care. My siste...,1,1,[],1.0,Tarte
13,SkincareAddiction,cp1cyp,[product question] Tarte Amaxonian Clay 24 hr ...,,0,2,[The [CosDNA page](http://www.cosdna.com/eng/c...,0.5,Tarte
14,SkincareAddiction,ce00ou,[Product Question] Tarte Knockout on sensitive...,I’ve seen great things on here and online abou...,0,1,[It's not a toner. It's a chemical exfoliator ...,0.4,Tarte
15,SkincareAddiction,8acbqb,"[Product Question] Tarte blemish bully, has an...",I’m wondering if anyone has tried Tarte Blemis...,3,4,[I just bought this because I also thought it ...,0.81,Tarte
16,SkincareAddiction,atkgwq,[Product Question] Tarte Rainforest of the Sea...,Anyone use the deep sea collagen super serum f...,3,1,"[I'm also very curious, and I'm curious to kno...",1.0,Tarte
17,SkincareAddiction,9reawm,[Product Question] Tarte drink of H2O hydratin...,I got a sample of [tarte drink of h2o moistur...,2,2,[In skincare I wouldn’t necessarily say anythi...,0.75,Tarte
18,SkincareAddiction,4dsk11,[Product Question] Tarte Rainforest of the Sea...,I got a sample of this from Sephora a couple d...,1,7,"[Sorry , but I don't rate this.It contains irr...",0.66,Tarte


# Comment text cleaning 

Before: each post row stores its comments together in a single list

After: each comment in that list gets its own row

In [41]:
# Build one output row per (post, comment) pair. A post whose comment list is
# empty contributes no rows.
rows = []
for record in subreddit_df.itertuples(index=False):
    rows.extend(
        {
            "post_id": record.post_id,
            "title": record.title,
            "brand": record.brand,
            "subreddit_name": record.subreddit_name,
            "comment": comment_text,
        }
        for comment_text in record.top_comments
    )

comments_df = pd.DataFrame(rows)
comments_df.head()


Unnamed: 0,post_id,title,brand,subreddit_name,comment
0,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,Return it? Give it to someone?
1,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,You can try to use it on your chest area to se...
2,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,I've used this serum for years and it's one of...
3,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,"The more you use, the faster you use it up and..."
4,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,Most serums just need a few drops to cover the...


In [42]:
# Extra words to drop on top of nltk's English stopword list (none yet).
custom_stopwords = []
stopword_set = {*nltk_stopwords.words('english'), *custom_stopwords}

import unicodedata

# Compiled once; preprocess_comment runs once per comment row.
_NOISE_PATTERN = re.compile(r"http\S+|www\S+|@\w+|#\w+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")


def preprocess_comment(comment):
    """Normalise one comment into a space-separated string of content words.

    Steps, in order:
      1. fold accented letters to their ASCII base letter and lower-case;
      2. remove URLs, @mentions and #hashtags;
      3. replace every character that is not a-z or whitespace with a space;
      4. tokenize with nltk and drop the tokens listed in ``stopword_set``.

    Args:
        comment: Raw comment text.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty
        string).
    """
    # Fold accents first ("Estée" -> "estee"). Otherwise the a-z filter below
    # would split accented words into fragments ("est e").
    decomposed = unicodedata.normalize("NFKD", comment)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()

    # Remove URLs, @mentions and #hashtags.
    text = _NOISE_PATTERN.sub("", text)
    # Remove non-alphabetic characters.
    text = _NON_LETTER_PATTERN.sub(" ", text)

    tokens = word_tokenize(text)
    # Remove stopwords.
    kept_tokens = [t for t in tokens if t.isalpha() and t not in stopword_set]
    return " ".join(kept_tokens)

# Clean every comment, store the result next to the raw text, then save the
# table to disk and preview the first rows.
cleaned_comments = comments_df["comment"].map(preprocess_comment)
comments_df["cleaned_comment"] = cleaned_comments

comments_df.to_csv("../output/subreddit_comment_data.csv", index=False)
comments_df.head()

Unnamed: 0,post_id,title,brand,subreddit_name,comment,cleaned_comment
0,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,Return it? Give it to someone?,return give someone
1,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,You can try to use it on your chest area to se...,try use chest area see tolerated otherwise wou...
2,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,I've used this serum for years and it's one of...,used serum years one favourites use couple dro...
3,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,"The more you use, the faster you use it up and...",use faster use sooner replace course company w...
4,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,Most serums just need a few drops to cover the...,serums need drops cover face see half dropper ...
