# Instagram_setup

This notebook has reusable functions and shared resources for text preprocessing, DTM creation, 
and branded hashtag sentiment analysis for Instagram beauty brand data.

Contents:
- custom_words_toad: Domain-specific stopword list
- preprocess(): Cleans and tokenizes captions
- create_dtm(): Converts cleaned text to document-term matrix with metadata
- compare_branded_hashtags(): Compares average sentiment of branded vs non-branded posts per hashtag


In [None]:
import re
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Domain-specific stopwords for Instagram beauty-brand captions, built by
# concatenating one list per category. Order matches the category order below.
# Note: the emoji entries are non-alphabetic, so preprocess() already discards
# them via its punctuation strip and isalpha() filter; they are kept for reference.
custom_words_toad = (
    # Brand names (removed from analysis)
    [
        "estee", "lauder", "tarte", "fenty", "glossier", "cosrx",
        "etude", "sulwhasoo", "laneige", "innisfree", "elf",
    ]
    # Platform-related vocabulary
    + [
        "video", "youtube", "tiktok", "instagram", "reel", "feed",
        "post", "stories", "caption", "social", "media",
    ]
    # Engagement / call-to-action words
    + [
        "like", "likes", "comment", "comments", "share", "save",
        "follow", "subscribe", "tag", "click", "link", "bio",
        "visit", "dm", "available", "check",
    ]
    # Time references and launch filler
    + [
        "today", "now", "new", "soon", "launch", "launching",
        "stay", "tune", "coming", "back",
    ]
    # Generic beauty terms that appear in nearly every caption
    + [
        "beauty", "skin", "skincare", "routine", "makeup", "product",
        "products", "face", "body", "glow", "look", "formula",
        "texture", "result",
    ]
    # Emoji / symbols
    + ["✨", "🔥", "💧", "💫", "😍", "💖", "🌟", "💥", "🧴", "📦", "🛍️"]
    # Overused positive / usage words
    + [
        "feel", "love", "use", "try", "amazing", "favorite",
        "best", "perfect", "must", "obsessed",
    ]
    # Promotional terms
    + [
        "shop", "buy", "discount", "deal", "sale", "off",
        "gift", "giveaway", "free", "offer",
    ]
    # Conversation filler
    + [
        "hey", "hello", "welcome", "thank", "you", "everyone", "guys",
        "hi", "omg", "pls", "yay", "get", "got", "let", "us",
    ]
)


def preprocess(df_col, custom_words_toad):
    """Clean, tokenize, stopword-filter and stem a column of captions.

    Each caption is lower-cased, stripped of URLs and of every character that
    is neither a word character nor whitespace, tokenized with
    ``wordpunct_tokenize``, filtered, and Porter-stemmed.

    A token is kept only if it is purely alphabetic, longer than two
    characters, and not in the combined stopword set (NLTK English stopwords
    plus ``custom_words_toad``). Stopwords are matched *before* stemming.

    Parameters:
        df_col (Series): raw captions; missing values are treated as empty
            text and non-string values are converted with ``str()``.
        custom_words_toad (iterable of str): extra domain-specific stopwords.

    Returns:
        list[list[str]]: one list of stemmed tokens per input row, in order.
    """
    porter = PorterStemmer()

    # Combine NLTK's English stopwords with the caller's list. A set gives
    # constant-time membership checks inside the token loop.
    new_stopwords = set(stopwords.words("english")).union(custom_words_toad)

    # Compile the patterns once instead of once per caption.
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    non_word_pattern = re.compile(r"[^\w\s]")

    # astype(str) matters: on an object column, .str.lower() returns NaN for
    # any non-string cell (e.g. an int), and re.sub would then raise TypeError.
    corpus_lower = df_col.fillna("").astype(str).str.lower().to_list()

    # NOTE(review): punctuation is deleted before stopword matching, so a
    # contraction such as "don't" becomes "dont" and may no longer match the
    # apostrophe-containing stopword entries — confirm this is intended.
    nostop_listing = []
    for text in corpus_lower:
        # Remove URLs first so their fragments do not survive as tokens,
        # then drop punctuation, emoji and other non-word symbols.
        text = non_word_pattern.sub('', url_pattern.sub('', text))
        stemmed_tokens = [
            porter.stem(word)
            for word in wordpunct_tokenize(text)
            if word.isalpha() and len(word) > 2 and word not in new_stopwords
        ]
        nostop_listing.append(stemmed_tokens)

    return nostop_listing


In [4]:
## function provided
def create_dtm(list_of_strings, metadata):
    """
    Build a dense document-term matrix (DTM) and attach document metadata.

    The texts are vectorized with a scikit-learn ``CountVectorizer``. Its
    output is sparse: only (document, term) pairs with a non-zero count are
    stored. That sparse result is then expanded into a dense table with one
    row per text, one column per vocabulary term, and the term's frequency in
    each cell. Both representations are printed for a quick visual check.

    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): document-level covariates, one row per text in
            the same order as ``list_of_strings``

    Returns:
        DataFrame with the metadata columns on the left, followed by one
        count column per term in the fitted vocabulary.
    """
    # Vocabulary settings: lower-case the input, drop sklearn's built-in
    # English stopwords, ignore terms found in fewer than 5 documents, and
    # cap the vocabulary at 5000 terms (raise the cap if memory allows).
    count_vectorizer = CountVectorizer(
        lowercase=True,
        max_features=5000,
        min_df=5,
        stop_words='english',
    )
    sparse_counts = count_vectorizer.fit_transform(list_of_strings)

    # Preview the first three documents in sparse form.
    print('Sparse matrix form:\n', sparse_counts[:3])
    print()

    # Expand to a regular DataFrame so the counts can be handled like any
    # other table; columns are labelled with the learned vocabulary.
    vocabulary = count_vectorizer.get_feature_names_out()
    dense_counts = pd.DataFrame(sparse_counts.todense(), columns=vocabulary)
    print('Dense matrix form:\n', dense_counts.head())

    # Reset the metadata index so rows line up positionally with the DTM rows.
    covariates = metadata.reset_index(drop=True)
    return pd.concat([covariates, dense_counts], axis=1)

In [5]:
def compare_branded_hashtags(df, brand_name):
    """
    Compare average sentiment of branded vs. non-branded posts for each hashtag.

    Only hashtags that have BOTH branded and non-branded posts for the brand
    are returned, since a difference cannot be computed from one side alone.
    If the brand has no posts of one kind at all, the result is empty (a
    missing side is treated as "no data", not as a sentiment of 0).

    Parameters:
        df (pd.DataFrame): Dataframe with 'brand', 'hashtags', 'compound', and 'is_branded_content'
        brand_name (str): The brand to filter on

    Returns:
        pd.DataFrame: Indexed by hashtag with columns 'non_branded_sentiment',
            'branded_sentiment' and 'diff' (branded minus non-branded),
            sorted by 'diff' in descending order.
    """
    brand_posts = df[df["brand"] == brand_name]

    # pivot_table's default aggregation is the mean, so this yields the mean
    # compound score for every (hashtag, is_branded_content) combination.
    pivot = brand_posts.pivot_table(
        index="hashtags",
        columns="is_branded_content",
        values="compound",
    )

    pivot = pivot.rename(columns={
        True: "branded_sentiment",
        False: "non_branded_sentiment",
    })

    # Guarantee both columns exist. A side with no posts becomes NaN rather
    # than being silently replaced by 0 in the subtraction below.
    pivot = pivot.reindex(columns=["non_branded_sentiment", "branded_sentiment"])

    pivot["diff"] = pivot["branded_sentiment"] - pivot["non_branded_sentiment"]

    # Drop hashtags lacking either side, then rank by branded advantage.
    return pivot.dropna().sort_values("diff", ascending=False)