## Reddit Scraping
Documentation reference: https://praw.readthedocs.io/en/stable/
This script searches each target subreddit for each target skincare brand, keeps up to 20 posts per search query, and retrieves up to 10 top comments from each post. The collected data forms the basis for the alignment analyses on Reddit.


The following brands were selected as targets for analysis:

- `Estée Lauder`
- `Fenty Beauty`
- `Fenty`
- `e.l.f. Cosmetics`
- `e.l.f.`
- `elf`
- `Tarte Cosmetics`
- `Tarte`
- `Glossier`
- `Laneige`
- `Sulwhasoo`
- `Etude House`
- `Etude`
- `Innisfree`
- `COSRX`



Data was collected from the following skincare-related subreddits:

- `SkincareAddiction`
- `Sephora`
- `Blackskincare`
- `AsianBeauty`
- `KoreanBeauty`
- `BrownBeauty`
- `IndianSkincareAddicts`

The following fields are collected for each Reddit post related to the target brand:

- `subreddit_name`: Name of the subreddit the post is from  
- `post_id`: Unique identifier for the post  
- `title`: Title of the Reddit post  
- `description`: Body text of the post  
- `score`: Total upvotes minus downvotes  
- `num_comments`: Number of comments on the post  
- `top_comments`: A list of the top comments extracted from the post  
- `upvote_ratio`: Ratio of upvotes to total votes  
- `brand`: The skincare brand associated with the post (based on the search query)

In [9]:
import requests
import pandas as pd
import praw
from praw.models import MoreComments

import os
import zipfile
import ast
import re
# Tools for text analysis
# We can use nltk to extract adjective and verbs related to the product/brand 
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
# Vader sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from rapidfuzz import process, fuzz
import matplotlib.pyplot as plt

# Directory that receives the scraped CSV output. The path is relative to the
# notebook's working directory and is created up front if it does not exist.
output_path = "../../../data/alignment_analysis/"
os.makedirs(output_path, exist_ok=True)
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime, timedelta


# Scrape posts from related subreddits
This step takes a long time to run, so consider using the previously saved data instead of re-running it.

In [11]:

# Reddit API client. Every credential is read from an environment variable
# (the .env file is loaded by load_dotenv() earlier in the notebook).
# NOTE(review): no password is supplied, so PRAW presumably runs in read-only
# mode -- confirm against the PRAW authentication docs.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
    username=os.getenv("REDDIT_USERNAME"),
)

# Skincare-related subreddits that are searched for every target brand.
subreddit_list = [
    "SkincareAddiction",
    "Sephora",
    "Blackskincare",
    "AsianBeauty",
    "KoreanBeauty",
    "BrownBeauty",
    "IndianSkincareAddicts",
]

# Collects one dict per scraped post; turned into a DataFrame after scraping.
all_posts = []

# Each search below is capped at 20 posts (rate limit unknown, so 20 for now).
# Documentation: https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html
# Several name variations per brand are listed to widen the search queries.
target_brands = [
    "Estée Lauder",
    "Fenty Beauty",
    "Fenty",
    "e.l.f. Cosmetics",
    "e.l.f.",
    "elf",
    "Tarte Cosmetics",
    "Tarte",
    "Glossier",
    "Laneige",
    "Sulwhasoo",
    "Etude House",
    "Etude",
    "Innisfree",
    "COSRX",
]

# Age cut-off as a naive UTC datetime: posts created before it are skipped.
five_years_ago = datetime.utcnow() - timedelta(days=365 * 5)
# Used to drop irrelevant comments posted by bots
def is_bot(author):
    """Return True when a comment author should be treated as a bot.

    Args:
        author: Object exposing a ``name`` attribute, or None.

    Returns:
        True if ``author`` is None, if the lower-cased name equals
        "automoderator", or if it contains the substring "bot";
        False otherwise.
    """
    # A missing author is treated the same way as a bot.
    if author is None:
        return True
    username = author.name.lower()
    if username == "automoderator":
        return True
    # Plain substring heuristic: any name containing "bot" is flagged.
    return "bot" in username

# Match a brand against the post title (a deeper match may be worth exploring)
def brand_word_match(text, brand_list):
    """Return the brand from ``brand_list`` that best fuzzy-matches ``text``.

    Args:
        text: String to inspect (the caller passes a post title).
        brand_list: Candidate brand names.

    Returns:
        The best-scoring brand when its ``fuzz.partial_ratio`` score is
        strictly greater than 85, otherwise None.
    """
    # NOTE(review): whether the comparison is case-insensitive depends on the
    # default processor of the installed rapidfuzz version -- confirm.
    best = process.extractOne(text, brand_list, scorer=fuzz.partial_ratio)
    if not best:
        return None
    candidate, score = best[0], best[1]
    return candidate if score > 85 else None


def get_top_comments(post):
    """Collect up to 10 usable comment bodies from ``post``.

    Comments are visited in the order ``post.comments`` yields them. A comment
    is skipped when it is a ``MoreComments`` placeholder, when ``is_bot``
    flags its author, or when its stripped, lower-cased body is "[deleted]"
    or "[removed]".

    Args:
        post: Submission object exposing a ``comments`` forest.

    Returns:
        A list of at most 10 stripped comment body strings.
    """
    # Called with limit=0 before iterating; per the PRAW docs this removes the
    # "load more comments" placeholders without fetching them.
    post.comments.replace_more(limit=0)

    collected = []
    for comment in post.comments:
        if isinstance(comment, MoreComments) or is_bot(comment.author):
            continue
        text = comment.body.strip()
        if text.lower() in ["[deleted]", "[removed]"]:
            continue
        collected.append(text)
        if len(collected) == 10:
            break
    return collected


# '[Product Question]' and '[Review]' are title tags specific to
# SkincareAddiction and pull more relevant posts there; the empty tag gives a
# plain brand query that can be used on the other subreddits.
reddit_tags = ['[Product Question]', '[Review]', '']

# Ids of posts that already went through brand matching, so that a post
# returned by several queries is only processed once.
post_seen = {}
for sub in subreddit_list:
    # Any error raised while scraping a subreddit is reported and the loop
    # moves on to the next subreddit.
    try:
        for brand in target_brands:
            for tag in reddit_tags:
                # Quoted phrase query; it is lower-cased when sent.
                query = f'"{tag} {brand}"'
                post_collection = reddit.subreddit(sub).search(
                    query.lower(),
                    limit=20,
                    sort="relevance",
                    time_filter='all',
                )
                for post in post_collection:
                    # Skip posts already handled by an earlier query.
                    if post.id in post_seen:
                        continue
                    # Skip posts older than 5 years (they are not marked as seen).
                    post_datetime = datetime.utcfromtimestamp(post.created_utc)
                    if post_datetime < five_years_ago:
                        continue

                    # The title is matched against every target brand; the
                    # stored "brand" is the one from the search query.
                    is_match = brand_word_match(post.title, target_brands)
                    if is_match:
                        top_comments = get_top_comments(post)
                        all_posts.append(
                            {
                                "subreddit_name": sub,
                                "post_id": post.id,
                                "title": post.title,
                                "description": post.selftext,
                                "score": post.score,
                                "num_comments": post.num_comments,
                                "top_comments": top_comments,
                                "upvote_ratio": post.upvote_ratio,
                                "brand": brand,
                            }
                        )
                    # Mark the post as seen whether or not it matched.
                    post_seen[post.id] = True
    except Exception as exc:
        print(f"Error scraping {sub}: {exc}")

subreddit_df = pd.DataFrame(all_posts)




In [8]:
# Persist the raw scraped subreddit data as CSV, then display the DataFrame
raw_csv_path = os.path.join(output_path, "subreddit_data.csv")
subreddit_df.to_csv(raw_csv_path, index=False)
subreddit_df

OSError: Cannot save file into a non-existent directory: '../../../data/alingment_analysis/reddit'