# Comment text cleaning 
1. Break each post's comment list down so that every comment is an individual row
2. Preprocess each row
3. Export cleaned comment data 

In [2]:

import pandas as pd
import os

import re
import ast

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
# Directory used for this notebook's data files; it is created if it does not
# exist yet. The input CSV is read from this same directory further down, so
# despite the name it is not output-only.
# NOTE(review): "alingment" looks like a misspelling of "alignment" -- confirm
# whether other code depends on this exact path before renaming it.
output_path = "../../../data/alingment_analysis/reddit/"
os.makedirs(output_path, exist_ok=True)


### Load Subreddit data

In [3]:
# Read the post-level subreddit data from the data directory and preview it.
subreddit_csv = f"{output_path}subreddit_data.csv"
subreddit_df = pd.read_csv(subreddit_csv)
subreddit_df.head()

Unnamed: 0,subreddit_name,post_id,title,description,score,num_comments,top_comments,upvote_ratio,brand
0,SkincareAddiction,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Anyone have idea what could be best use of thi...,3,4,"['Return it? Give it to someone?', 'You can tr...",1.0,Estée Lauder
1,SkincareAddiction,psd79w,[Product Question] Estée Lauder Advanced Night...,So I have started my first bottle of this seru...,7,24,"[""I've used this serum for years and it's one ...",1.0,Estée Lauder
2,SkincareAddiction,wly2yu,[Product Question] Estée Lauder night repair s...,I have a coupon for a free sample and I’m thin...,3,4,"[""I absolutely love it and I've used it for ye...",1.0,Estée Lauder
3,SkincareAddiction,lx8cl1,[product question] Estée Lauder ANR smell,I have a couple of sample bottles of Estée Lau...,4,5,"['A few *years* old?\n\nYeah, dump that.', 'Yo...",1.0,Estée Lauder
4,SkincareAddiction,1i5q8th,[Product question] Dupe for Estée Lauder Advan...,I got this half off at Ulta based on the esthe...,8,7,['Missha time revolution night repair ampoule'...,1.0,Estée Lauder


### Break each comment into a row

In [4]:
def _parse_top_comments(raw):
    """Decode one ``top_comments`` cell into a list of comments.

    Parameters
    ----------
    raw : object
        The cell value. After ``read_csv`` this is the string repr of a Python
        list; on a re-run of this cell it is already a list; for a post with
        no stored comments it is NaN.

    Returns
    -------
    list
        The decoded comments. Missing or unsupported values yield ``[]`` so
        the post simply contributes no rows instead of crashing the loop.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string cell is not a valid
        Python literal, so corrupt data is surfaced rather than hidden.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    if isinstance(raw, (list, tuple)):
        return list(raw)
    if isinstance(raw, str):
        # The cell decoded to a single quoted string: treat it as one comment
        # rather than iterating over its characters.
        return [raw]
    return []


# Decode the stringified lists stored in the CSV. Values that are already
# lists pass through unchanged, so re-running this cell is safe.
subreddit_df["top_comments"] = subreddit_df["top_comments"].apply(_parse_top_comments)

# One record per (post, comment) pair, carrying the post-level context along.
rows = [
    {
        "post_id": post.post_id,
        "title": post.title,
        "brand": post.brand,
        "subreddit_name": post.subreddit_name,
        "comment": comment,
    }
    for post in subreddit_df.itertuples(index=False)
    for comment in post.top_comments
]

# Explicit columns keep the schema stable even when no comments were found,
# so later cells can still select comments_df["comment"].
comments_df = pd.DataFrame(
    rows, columns=["post_id", "title", "brand", "subreddit_name", "comment"]
)
comments_df.head()



Unnamed: 0,post_id,title,brand,subreddit_name,comment
0,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,Return it? Give it to someone?
1,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,You can try to use it on your chest area to se...
2,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,I've used this serum for years and it's one of...
3,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,"The more you use, the faster you use it up and..."
4,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,Most serums just need a few drops to cover the...


### Preprocess comment

In [6]:
# Domain-specific words removed in addition to NLTK's English stopwords.
# NOTE(review): nltk_stopwords.words('english') presumably needs the NLTK
# "stopwords" corpus to be installed locally -- confirm on a fresh environment.
custom_stopwords = [
    # conversational filler
    "lol", "omg", "uh", "umm", "yeah", "okay", "ok", "haha", "honestly",
    # Reddit platform vocabulary
    "thread", "post", "comment", "reddit", "sub", "subreddit", "upvote", "downvote",
    # skin care noise words
    "skin", "skincare", "product", "brand", "brands", "used", "using", "use",
]

# Set membership keeps the per-token stopword check cheap.
stopword_set = {*nltk_stopwords.words('english'), *custom_stopwords}

# Text removed before tokenizing: URLs, @mentions, #hashtags and u/ user
# references. The \b anchors stop the pattern from matching inside ordinary
# words: without them "you/your" lost "u/your" (leaving the junk token "yo")
# and "awww..." lost its tail to the www alternative.
NOISE_PATTERN = re.compile(r"\bhttp\S+|\bwww\.\S+|@\w+|#\w+|\bu/\w+")


def preprocess_comment(comment):
    """Normalize one raw comment into a space-separated string of tokens.

    Steps: lowercase the text, delete everything matched by ``NOISE_PATTERN``,
    tokenize with ``word_tokenize``, then keep only purely alphabetic tokens
    that are not in ``stopword_set``.

    Parameters
    ----------
    comment : object
        The raw comment. Anything that is not a ``str`` (for example NaN or
        None) is treated as empty.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` when the input is
        not a string or when no token survives.
    """
    if not isinstance(comment, str):
        return ""

    text = NOISE_PATTERN.sub("", comment.lower())
    tokens = word_tokenize(text)

    # isalpha() drops punctuation, numbers and mixed tokens such as "n't".
    return " ".join(t for t in tokens if t.isalpha() and t not in stopword_set)

# Store the cleaned text alongside each raw comment.
comments_df["cleaned_comment"] = comments_df["comment"].map(preprocess_comment)

# Write the comment-level table (raw + cleaned) to the data directory, then preview it.
comments_csv = f"{output_path}subreddit_comment_data.csv"
comments_df.to_csv(comments_csv, index=False)
comments_df.head()

Unnamed: 0,post_id,title,brand,subreddit_name,comment,cleaned_comment
0,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,Return it? Give it to someone?,return give someone
1,1bv7f30,[Product Question] Estée Lauder ANR - Allergic,Estée Lauder,SkincareAddiction,You can try to use it on your chest area to se...,try chest area see tolerated otherwise would g...
2,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,I've used this serum for years and it's one of...,serum years one favourites couple drops covers...
3,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,"The more you use, the faster you use it up and...",faster sooner replace course company would lik...
4,psd79w,[Product Question] Estée Lauder Advanced Night...,Estée Lauder,SkincareAddiction,Most serums just need a few drops to cover the...,serums need drops cover face see half dropper ...
