In [1]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
#import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda
#!pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim

## sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## visualizing LDA--likely need to install
#!pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctlist = [char for char in string.punctuation] # list of english punctuation marks

In [2]:
import spacy
sp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()
import ast
import networkx as nx
import urllib, json
from itertools import combinations
from collections import Counter

## Load Data

In [3]:
estee_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_estee.csv")
estee_df.head()

In [4]:
custom_words_toad = [
    'estee', 'lauder', 'esteelauder', 'esteelaudersg', 'advancednightrepair', 'anr',  
    'video', 'youtube', 'tiktok', 'instagram', 'reel', 'feed',                        
    'like', 'likes', 'comment', 'comments', 'share', 'save', 'follow', 'subscribe',  
    'today', 'now', 'new', 'shop', 'buy', 'link', 'bio', 'visit', 'available',       
    'beauty', 'skin', 'skincare', 'routine', 'makeup', 'product', 'products',       
    '✨', '🔥', '💧', '💫', '😍', '💖',                                                
    'feel', 'love', 'use', 'try', 'amazing', 'favorite', 'best',                     
    'night', 'repair', 'serum', 'hydrating', 'hydration', 'cream',                    
    'hey', 'hello', 'welcome', 'thank', 'you', 'everyone', 'guys'                    
]


def preprocess(df_col, custom_words_toad):
    porter = PorterStemmer()
    list_stopwords = stopwords.words("english")
    new_stopwords = set(list_stopwords + custom_words_toad)

    corpus_lower = df_col.fillna("").str.lower().to_list()

    nostop_listing = []
    for text in corpus_lower:
        # Clean URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Tokenize and remove stopwords
        tokens = [
            word for word in wordpunct_tokenize(text)
            if word.isalpha() and word not in new_stopwords
        ]
        # Apply stemming
        stemmed_tokens = [porter.stem(word) for word in tokens if len(word) > 2]
        nostop_listing.append(stemmed_tokens)

    return nostop_listing
    
estee_df["text_clean"] = preprocess(estee_df["text"], custom_words_toad)

In [5]:
## function provided
def create_dtm(list_of_strings, metadata):
    """ 
    Function to create dense document-term matrix (DTM) from a list of strings and provided metadata. 
    A sparse DTM is a list of term_index/doc_index tuples: if a given term occurs in a given doc at least once, 
        then this count is listed as a tuple; if not, that term/doc pair is omitted. 
    In a dense DTM, each row is one text (e.g., an Airbnb listing), each column is a term, and 
        each cell indicates the frequency of that word in that text. 
    
    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): contains document-level covariates
    
    Returns:
        Dense DTM with metadata on left and then one column per word in lexicon
    """
    
    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    vectorizer = CountVectorizer(lowercase = True) 
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()
    
    # switch the dataframe from the sparse representation to the normal dense representation (so we can treat it as regular dataframe)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out ())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation
    dtm_dense_named_withid = pd.concat([metadata.reset_index(drop=True), dtm_dense_named], axis = 1) # add back document-level covariates

    return(dtm_dense_named_withid)

In [6]:
## process text more, lower and to string
estee_df["text_clean_str"] = estee_df["text_clean"].apply(lambda tokens: " ".join(tokens).lower())

### NLP

In [7]:
# Define function to extract GPEs from one string
def get_org(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "ORG"]


In [8]:
# Define function to extract product from one string
def get_product(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]


### Sentiment Analysis

In [9]:
## initialize a scorer
sent_obj = SentimentIntensityAnalyzer()
print(type(sent_obj))
## score one listing
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"
sentiment_example = sent_obj.polarity_scores(practice_listing)
sentiment_example

In [10]:
estee_df["sentiment"] = estee_df["text_clean_str"].apply(sent_obj.polarity_scores)

In [11]:
estee_df["compound"] = estee_df["sentiment"].apply(lambda x: x["compound"])

In [12]:
import seaborn as sns
sns.regplot(x = estee_df["statistics.views"], y = estee_df.compound)

This scatterplot shows the relationship between:
- **x-axis**: statistics.views, which is the number of views an Instagram post got
- **y-axis**: compound, which is the sentiment score from VADER (+1 = very positive, -1 = very negative)

As we can see, most of the estee lauder posts have a view count of under 2 millions views, with most posts having a **positive sentiment**. The regression line is slightly rising, meaning posts with more views tend to have slightly more positive sentiment, but the effect is very small.



### Sentiment Analysis for Branded vs Nonbraded

In [13]:
## Comparing Branded and NonBranded Sentiment
sns.lmplot(data=estee_df, x="statistics.views", y="compound", hue="is_branded_content", scatter_kws={"alpha": 0.4})

In [14]:
is_brand_num = len(estee_df[estee_df["is_branded_content"] == True])
is_brand_num
nonbrand_num = len(estee_df[estee_df["is_branded_content"] == False])
nonbrand_num

#### Observations
The plot shows a clear imbalance in the dataset: there are far more non-branded posts than branded ones (16,988 vs. 401). Most branded posts cluster around high sentiment scores, especially between 0.75 and 1.0, indicating strong positive sentiment. This aligns with expectations—branded content tends to use more promotional and upbeat language.

In contrast, non-branded content spans a broader sentiment range, including both highly positive and negative values, suggesting more varied and authentic user expression.

Interestingly, while branded content is consistently positive, it doesn’t appear to drive significantly higher view counts—both branded and non-branded posts are heavily concentrated under 1 million views. The weak slope of the regression line also indicates that sentiment is not a strong predictor of view count. This suggests that while branded posts may aim to maintain positivity, it’s not necessarily sentiment that drives engagement or visibility. Overall, branded content appears polished and emotionally safe, whereas non-branded content provides richer insights into public perception, making it valuable for understanding audience sentiment in a more nuanced way.

In [15]:
## Comparing Branded and NonBranded Sentiment and like count
sns.lmplot(data=estee_df, x="statistics.like_count", y="compound", hue="is_branded_content", scatter_kws={"alpha": 0.4})

From the plot, we can see a distinct difference in how sentiment relates to like counts for branded and non-branded content.

- **Branded posts (orange)** tend to cluster in the **positive** sentiment range, especially between compound scores of 0.5 to 1.0. The slight **upward** trend suggests that more liked branded content tends to be more positive — though the sample size may be small.

- **Non-branded posts (blue)** show a wider spread of sentiment, ranging from very negative to very positive. Interestingly, there's a **slight negative** trend: more liked non-branded posts actually correlate with slightly lower sentiment. This might reflect how critical or edgy posts can still attract engagement. That said, this trend might be misleading --- there is only one non-branded post with an unusually high like count and low sentiment, which could skew the regression line. Most content overall stays within a lower like range regardless of tone.

The concentration near the origin (low like counts) for both categories suggests that most posts don't go viral — and sentiment doesn't strongly predict popularity in the general case.



### Hashtags and Sentiment

In [16]:
## convert to list 
estee_df['hashtags'] = estee_df['hashtags'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x
)
hashtag_df = estee_df.explode("hashtags")

In [17]:
hashtag_df.head()

In [18]:
hashtag_sentiment = (
    hashtag_df.groupby("hashtags")["compound"]
    .mean()
    .sort_values(ascending=False)
)

In [19]:
hashtag_stats = (
    hashtag_df.groupby("hashtags")
    .agg(avg_sentiment=("compound", "mean"), count=("compound", "count"))
    .sort_values(by="avg_sentiment", ascending=False)
)

In [20]:
filtered = hashtag_stats[hashtag_stats["count"] >= 10]

In [21]:
top10 = filtered.head(10)
bottom10 = filtered.tail(10)

plt.figure(figsize=(10, 5))
top10["avg_sentiment"].plot(kind="barh", color="green", title="Top Hashtags by Avg Sentiment")
plt.xlabel("Avg Compound Sentiment")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 5))
bottom10["avg_sentiment"].plot(kind="barh", color="red", title="Lowest Sentiment Hashtags")
plt.xlabel("Avg Compound Sentiment")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

### Hashtag Sentiment Analysis Interpretation
#### Top Hashtags by Sentiment Analysis
Based on the two graphs, the hashtags with the highest average sentiment include **#femalepreneur, #makeupgiveaway, and #multieffecteyecream**, indicating strong positive reactions from users. This suggests that Estée Lauder’s campaigns promoting **women’s empowerment, product giveaways, and skincare lines** are resonating well with audiences. Additionally, high sentiment associated with hashtags in*Bahasa Indonesia or Malay (e.g., #jualesteelaudertermurah) indicates that Estée Lauder enjoys positive brand perception in **Southeast Asian** markets, especially in the context of affordability and accessibility. Overall, the consistently positive sentiment across these hashtags suggests that the brand’s marketing efforts are both emotionally resonant and internationally effective.

#### Lowest Hashtags by Sentiment Analysis
The graph displays the hashtags with the lowest average sentiment scores, with terms like **#breastcancerawarenessmonth, #timetoeendbreastcancer, and #pinkribbon** appearing prominently. These hashtags are closely tied to Breast Cancer Awareness campaigns, yet their low sentiment scores likely reflect a **limitation** of the sentiment analysis tool rather than genuine negativity. Models like VADER rely on individual word polarity, so emotionally heavy terms such as "cancer," "diagnosed," or "survivor" can skew the sentiment negatively, even when the overall message is **hopeful, supportive, or awareness-driven**. For example, a post honoring a survivor or discussing the impact of breast cancer may contain compassionate intent but still be flagged as negative due to language associated with illness or loss. You can see the text and their compound rating below. Additionally, hashtags like #blackwomen may be part of broader conversations about health equity or underrepresentation, which can surface complex or critical discourse not easily captured by standard sentiment tools. These results highlight the importance of considering context and social intent when interpreting sentiment scores, especially in campaigns related to health, advocacy, and inclusion.

In reality, these posts are often tied to deeply meaningful, supportive messaging. Their frequency in the dataset actually points to Estée Lauder’s **strong commitment to breast cancer advocacy**, showing that the brand continues to spotlight this cause prominently across their content.

To better reflect the true tone of such socially driven campaigns, a more **nuanced** analysis would be needed—such as applying context-aware models (like transformer-based sentiment classifiers), adding a custom label for awareness content, or even conducting qualitative keyword and theme analysis rather than relying solely on sentiment scores. This would avoid misinterpreting emotionally sensitive but impactful content and give a more accurate picture of Estée Lauder’s brand voice and values.

In [22]:
bc_df = estee_df[estee_df['hashtags'].astype(str).str.contains("breastcancer", case=False, na=False)]
text_compound = bc_df[["text", "compound", "sentiment"]]
pd.set_option('display.max_colwidth', None)
text_compound.sample(10)  # 10 random rows

## Network Analysis

###  Focus on Branded Data

In [23]:
branded_df = estee_df[estee_df['is_branded_content'] == True]
# Flatten all hashtags
flat_tags = [tag for tags in branded_df['hashtags'] if isinstance(tags, list) for tag in tags]
top_tags = set([tag for tag, _ in Counter(flat_tags).most_common(50)])  # or 100

In [24]:
co_occurrence_edges = []
for tags in elf_df["hashtags"]:
    if isinstance(tags, list):
        tags = [tag.strip() for tag in tags if isinstance(tag, str) and tag.strip()]
        if len(tags) > 1:
            co_occurrence_edges.extend(combinations(sorted(set(tags)), 2))

In [None]:
edge_counts = Counter(co_occurrence_edges)
edge_df = pd.DataFrame(edge_counts.items(), columns=["pair", "weight"])
edge_df["source"] = edge_df["pair"].apply(lambda x: x[0])
edge_df["target"] = edge_df["pair"].apply(lambda x: x[1])
edge_df = edge_df[["source", "target", "weight"]]

In [None]:
# Keep only strong co-occurrence edges
filtered_edge_df = edge_df[edge_df["weight"] >= 5]  # Try 3, 4, or even 5
G = nx.from_pandas_edgelist(filtered_edge_df, source='source', target='target', edge_attr='weight')

# Create 'group' attribute using group_map
for node in G.nodes():
    group = group_map.get(node, default_group)
    G.nodes[node]['group'] = group
    G.nodes[node]['color'] = group2color[group]  # Optional: for future coloring



In [None]:
largest_cc = max(nx.connected_components(G), key=len)
GG = G.subgraph(largest_cc).copy()

stylized_network, config = visualize(GG, port=2000)


In [None]:
stylized_network[:5]  # Show first few node entries to confirm group/color are applied

## LDA

In [None]:
## example application on raw lowercase texts; 
dtm_nopre = create_dtm(list_of_strings= estee_df.text_clean_str,
                      metadata = estee_df[['is_branded_content', 'hashtags', 'post_owner.name']])

In [None]:
## show first set of rows/cols
dtm_nopre.head()

## show arbitrary later cols in resulting data
dtm_nopre.shape
dtm_nopre.iloc[0:5, 480:500]

In [None]:
top_terms = dtm_nopre[dtm_nopre.columns[4:]].sum(axis = 0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

In [None]:
print(dtm_nopre.columns)


In [None]:
## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_raw_tokens = [wordpunct_tokenize(one_text) for one_text in 
                  estee_df.text_clean_str]


## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(estee_df.shape[0]*0.05)
upper_bound = round(estee_df.shape[0]*0.95)

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering

## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

In [None]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower), and
## (5) parameter that returns, for each word remaining in dict, the topic probabilities.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_raw_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True)

print(type(ldamod))

In [None]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)

In [None]:
    
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

In [None]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

In [None]:
topics = ldamod.print_topics(num_words=10)
for i, topic in topics:
    print(f"Topic {i}: {topic}")



In [None]:
topic_labels = {
    0: "Makeup",
    1: "Gifts & Retail",
    2: "Skincare",
    3: "Campaign",
    4: "Work"
}


## LDA Conclusion
Our LDA model surfaced five main themes in Estée Lauder’s Instagram posts:

- Makeup – Posts focused on product launches, tutorials, and beauty looks.

- Gifts & Retail – Content around gift sets, holiday promos, and store campaigns.

- Skincare – Posts highlighting skincare benefits, ingredients, and routines.

- Campaign – Branded hashtags, slogans, and collabs with ambassadors.

- Work – Behind-the-scenes moments and glimpses into the team or brand culture.

Overall, Estée Lauder’s content blends product focus with brand storytelling and seasonal marketing, giving followers both inspiration and insight into the company.



## Combine LDA and Sentiment Analysis