In [8]:
import pandas as pd
import re

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
#import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Load all Instagram data


In [2]:
# Folder containing the per-brand Instagram CSV exports (relative path).
data_path = "../../../data/instagram"

In [3]:
def _load_brand_csv(file_stem, **read_csv_kwargs):
    """Read ``<data_path>/<file_stem>.csv`` into a DataFrame.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(f"{data_path}/{file_stem}.csv", **read_csv_kwargs)


# One DataFrame per brand. Glossier, COSRX and Fenty Beauty are read with
# low_memory=False; the remaining files use pandas' defaults.
estee_df = _load_brand_csv("estee_lauder")
tarte_df = _load_brand_csv("tarte_cosmetics")
innisfree_df = _load_brand_csv("innisfree")
elf_df = _load_brand_csv("elf_cosmetics")
glossier_df = _load_brand_csv("glossier", low_memory=False)
laneige_df = _load_brand_csv("laneige")
sulwhasoo_df = _load_brand_csv("sulwhasoo")
etude_df = _load_brand_csv("etude_house")
cosrx_df = _load_brand_csv("cosrx", low_memory=False)
fenty_df = _load_brand_csv("fenty_beauty", low_memory=False)


In [11]:
# Stamp every row with its brand's display name so the rows remain
# attributable to a brand once the frames are concatenated.
brand_labels = (
    (estee_df, "Estée Lauder"),
    (tarte_df, "Tarte"),
    (innisfree_df, "Innisfree"),
    (elf_df, "e.l.f"),
    (glossier_df, "Glossier"),
    (laneige_df, "Laneige"),
    (sulwhasoo_df, "Sulwhasoo"),
    (etude_df, "Etude"),
    (cosrx_df, "COSRX"),
    (fenty_df, "Fenty Beauty"),
)
for brand_frame, brand_label in brand_labels:
    brand_frame["brand"] = brand_label

In [12]:
# Stack the ten per-brand frames into a single table. ignore_index=True
# drops each frame's own row labels and renumbers the combined rows from 0.
all_df = pd.concat(
    objs=(
        estee_df,
        tarte_df,
        innisfree_df,
        elf_df,
        glossier_df,
        laneige_df,
        sulwhasoo_df,
        etude_df,
        cosrx_df,
        fenty_df,
    ),
    ignore_index=True,
)


In [13]:
# Write the combined raw data to disk first, then end the cell on the preview.
# A notebook cell only renders the value of its final expression, so the
# random 20-row sample must come last to actually be displayed (previously it
# was computed and thrown away because `to_csv` was the last statement).
all_df.to_csv(f"{data_path}/raw_instagram_data.csv", index=False)
all_df.sample(20)

## Preprocess Instagram text

In [14]:
# Extra stopwords for the Instagram text, grouped by theme. The groups are
# concatenated in the order below to build `custom_words_toad`.

# Brand names (removed from analysis)
_brand_terms = [
    'estee', 'lauder', 'tarte', 'fenty', 'glossier', 'cosrx', 'etude',
    'sulwhasoo', 'laneige', 'innisfree', 'elf',
]

# Platform-related vocabulary
_platform_terms = [
    'video', 'youtube', 'tiktok', 'instagram', 'reel', 'feed',
    'post', 'stories', 'caption', 'social', 'media',
]

# Engagement prompts / calls to action
_engagement_terms = [
    'like', 'likes', 'comment', 'comments', 'share', 'save', 'follow', 'subscribe',
    'tag', 'click', 'link', 'bio', 'visit', 'dm', 'available', 'check',
]

# Time references and launch filler
_time_terms = [
    'today', 'now', 'new', 'soon', 'launch', 'launching', 'stay', 'tune', 'coming', 'back',
]

# Generic beauty vocabulary
_beauty_terms = [
    'beauty', 'skin', 'skincare', 'routine', 'makeup', 'product', 'products',
    'face', 'body', 'glow', 'look', 'formula', 'texture', 'result',
]

# Emoji / symbols
# NOTE(review): preprocess() deletes every non-word character before it
# matches stopwords, so these entries look like they can never match --
# confirm whether they are still needed.
_emoji_terms = [
    '✨', '🔥', '💧', '💫', '😍', '💖', '🌟', '💥', '🧴', '📦', '🛍️',
]

# Overused positive / generic usage words
_praise_terms = [
    'feel', 'love', 'use', 'try', 'amazing', 'favorite', 'best', 'perfect', 'must', 'obsessed',
]

# Promotional vocabulary
_promo_terms = [
    'shop', 'buy', 'discount', 'deal', 'sale', 'off', 'gift', 'giveaway', 'free', 'offer',
]

# Conversational filler
_filler_terms = [
    'hey', 'hello', 'welcome', 'thank', 'you', 'everyone', 'guys', 'hi', 'omg', 'pls',
    'yay', 'get', 'got', 'let', 'us',
]

custom_words_toad = [
    *_brand_terms,
    *_platform_terms,
    *_engagement_terms,
    *_time_terms,
    *_beauty_terms,
    *_emoji_terms,
    *_praise_terms,
    *_promo_terms,
    *_filler_terms,
]


def preprocess(df_col, custom_words_toad):
    """Lower-case, clean, tokenize, stopword-filter and stem a text column.

    Parameters
    ----------
    df_col : pandas.Series
        Raw text, one document per row. Missing values are treated as empty
        text; any other non-string value is converted with ``str`` first.
    custom_words_toad : iterable of str
        Extra lower-case words to drop, in addition to NLTK's English
        stopword list.

    Returns
    -------
    list of list of str
        One list of Porter-stemmed tokens per row of ``df_col``, in row order.
        Only purely alphabetic tokens longer than two characters that are not
        stopwords are kept.
    """
    porter = PorterStemmer()
    new_stopwords = set(stopwords.words("english")) | set(custom_words_toad)

    # fillna("") only covers missing values. Without astype(str), a non-string
    # cell (e.g. a number in an object column) becomes NaN under `.str.lower()`
    # and the regex substitution below raises TypeError on it.
    corpus_lower = df_col.fillna("").astype(str).str.lower().to_list()

    # Compile once instead of re-resolving the patterns for every row.
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    # NOTE(review): punctuation is deleted rather than replaced by a space, so
    # "don't" becomes "dont" and hyphenated words are fused together --
    # confirm this is the intended tokenization.
    punct_pattern = re.compile(r"[^\w\s]")

    nostop_listing = []
    for text in corpus_lower:
        # Strip URLs first so their fragments do not survive as tokens.
        text = url_pattern.sub('', text)
        text = punct_pattern.sub('', text)
        # Filter on the raw token, then stem the survivors.
        stemmed_tokens = [
            porter.stem(word)
            for word in wordpunct_tokenize(text)
            if word.isalpha() and len(word) > 2 and word not in new_stopwords
        ]
        nostop_listing.append(stemmed_tokens)

    return nostop_listing
    
# Tokenize and stem every post, keep both the token lists and a space-joined
# string version of them, then write the enriched frame to disk.
# (Author's note: this cell had already been run before.)
def _join_tokens(tokens):
    """Space-join a list/tuple of tokens in lower case; anything else gives ""."""
    if isinstance(tokens, (list, tuple)):
        return " ".join(tokens).lower()
    return ""


all_df["text_clean"] = preprocess(all_df["text"], custom_words_toad)
all_df["text_clean_str"] = all_df["text_clean"].apply(_join_tokens)
all_df.to_csv(f"{data_path}/all_instagram_cleaned.csv", index=False)

In [15]:
# Sanity check: (n_rows, n_columns) of the combined frame after the cleaning step.
all_df.shape

(216850, 22)