In [1]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
#import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda
#!pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim

## sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## visualizing LDA--likely need to install
#!pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctlist = [char for char in string.punctuation] # list of english punctuation marks

In [2]:
import spacy
sp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()
import ast
import networkx as nx
import urllib, json
from itertools import combinations
from collections import Counter
from netwulf import visualize

In [3]:
## Load data.
## The per-brand reads below are disabled: the first four are commented out and the
## remaining six sit inside a bare triple-quoted string, so none of them execute.
#estee_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_estee.csv")
#tarte_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_tarte.csv")
#innisfree_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_innisfree.csv")
#elf_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_elf.csv")
'''glossier_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_glossier.csv",
    low_memory=False)
laneige_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_laneige.csv")
sulwhasoo_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_sulwhasoo.csv")
etude_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_etude.csv")
cosrx_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_cosrx.csv",low_memory=False)
fenty_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/instagram_fenty.csv",low_memory=False)'''
## The only read that runs: the combined table for all brands. Later cells read its
## "text_clean_str", "brand", "hashtags", "is_branded_content", "post_owner.name",
## "statistics.views" and "statistics.like_count" columns.
## NOTE(review): presumably this CSV is the one written by the commented-out to_csv cell
## further down — confirm. The path is absolute and machine-specific; consider a
## configurable data directory.
all_df = pd.read_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/all_brands_cleaned.csv", low_memory=False)

In [4]:
## Disabled: per-brand "brand" labels. The whole cell is a single string literal, so no
## assignment runs; the frames it names are not loaded in this version of the notebook.
'''estee_df["brand"] = "Estée Lauder"
tarte_df["brand"] = "Tarte"
innisfree_df["brand"] = "Innisfree"
elf_df["brand"] = "e.l.f"
glossier_df["brand"] = "Glossier"
laneige_df["brand"] = "Laneige"
sulwhasoo_df["brand"] = "Sulwhasoo"
etude_df["brand"] = "Etude"
cosrx_df["brand"] = "COSRX"
fenty_df["brand"] = "Fenty Beauty"'''

In [5]:
# concat everything
# Disabled: the concat below is wrapped in a string literal and does not execute;
# all_df comes from the read_csv of all_brands_cleaned.csv instead.
'''
all_df = pd.concat([
    estee_df, tarte_df, innisfree_df, elf_df, glossier_df,
    laneige_df, sulwhasoo_df, etude_df, cosrx_df, fenty_df
], ignore_index=True)
'''

In [6]:
## eyeball 20 randomly drawn rows (no seed is set, so the rows differ between runs)
all_df.sample(n=20)

In [7]:
# Extra stopwords for this corpus, on top of NLTK's English list.
# Built as themed groups concatenated in order, so the result is one flat list.
# Note: `preprocess` keeps only alphabetic tokens, so the emoji group is redundant
# with that filter; it is kept here unchanged.
custom_words_toad = (
    # Brand names (removed from analysis)
    ['estee', 'lauder', 'tarte', 'fenty', 'glossier', 'cosrx', 'etude',
     'sulwhasoo', 'laneige', 'innisfree', 'elf']
    # Platform-related
    + ['video', 'youtube', 'tiktok', 'instagram', 'reel', 'feed',
       'post', 'stories', 'caption', 'social', 'media']
    # Engagement / action words
    + ['like', 'likes', 'comment', 'comments', 'share', 'save', 'follow', 'subscribe',
       'tag', 'click', 'link', 'bio', 'visit', 'dm', 'available', 'check']
    # Time / filler
    + ['today', 'now', 'new', 'soon', 'launch', 'launching', 'stay', 'tune', 'coming', 'back']
    # General beauty-related terms
    + ['beauty', 'skin', 'skincare', 'routine', 'makeup', 'product', 'products',
       'face', 'body', 'glow', 'look', 'formula', 'texture', 'result']
    # Emoji / symbols
    + ['✨', '🔥', '💧', '💫', '😍', '💖', '🌟', '💥', '🧴', '📦', '🛍️']
    # Overused positive adjectives
    + ['feel', 'love', 'use', 'try', 'amazing', 'favorite', 'best', 'perfect', 'must', 'obsessed']
    # Promotional terms
    + ['shop', 'buy', 'discount', 'deal', 'sale', 'off', 'gift', 'giveaway', 'free', 'offer']
    # Conversation filler
    + ['hey', 'hello', 'welcome', 'thank', 'you', 'everyone', 'guys', 'hi', 'omg', 'pls',
       'yay', 'get', 'got', 'let', 'us']
)


def preprocess(df_col, custom_words_toad):
    """
    Lowercase, strip URLs, tokenize, drop stopwords and stem every text in a column.

    Parameters:
        df_col (Series): raw texts; missing values are treated as empty strings
        custom_words_toad (list): extra stopwords added to NLTK's English stopword list

    Returns:
        list of lists: one list of stemmed tokens per input row, in input order
    """
    stemmer = PorterStemmer()
    blocked_words = set(stopwords.words("english") + custom_words_toad)
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)

    def _clean_one(document):
        # Remove URLs first so their fragments never reach the tokenizer
        without_urls = url_pattern.sub('', document)
        # Keep purely alphabetic tokens that are not stopwords
        kept = (
            token for token in wordpunct_tokenize(without_urls)
            if token.isalpha() and token not in blocked_words
        )
        # The length check applies to the unstemmed token; only survivors are stemmed
        return [stemmer.stem(token) for token in kept if len(token) > 2]

    lowered_texts = df_col.fillna("").str.lower().to_list()
    return [_clean_one(document) for document in lowered_texts]
    
# already ran this before
#all_df["text_clean"] = preprocess(all_df["text"], custom_words_toad)

In [8]:
## function provided
def create_dtm(list_of_strings, metadata, max_features=5000):
    """
    Build a dense document-term matrix (DTM) from preprocessed strings and attach metadata.

    A sparse DTM stores only the (document, term) pairs whose count is non-zero.
    In the dense DTM returned here, each row is one text, each column is a term, and
    each cell is the number of times that term occurs in that text.

    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): document-level covariates, one row per string and in the same order
        max_features (int): vocabulary size cap passed to CountVectorizer; raise it
            (e.g. 10000) if memory allows

    Returns:
        DataFrame with the metadata columns on the left and then one column per term.
        Metadata is re-indexed 0..n-1 so it lines up positionally with the term counts.
    """

    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    # (previously the closing parenthesis of this call sat inside a trailing comment,
    #  which made the whole cell a SyntaxError)
    vectorizer = CountVectorizer(lowercase=True, max_features=max_features)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()

    # switch from the sparse representation to a regular dense dataframe
    dtm_dense_named = pd.DataFrame(dtm_sparse.toarray(), columns=vectorizer.get_feature_names_out())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation

    # add back document-level covariates; reset_index so the concat aligns by position
    dtm_dense_named_withid = pd.concat([metadata.reset_index(drop=True), dtm_dense_named], axis=1)

    return dtm_dense_named_withid

In [9]:
## process text more, lower and to string
#all_df["text_clean_str"] = all_df["text_clean"].apply(lambda tokens: " ".join(tokens).lower())

In [10]:
#all_df.to_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/all_brands_cleaned.csv", index=False)


## Sentiment Analysis

In [11]:
## initialize a VADER scorer; `sent_obj` is reused by the cell that scores every caption
sent_obj = SentimentIntensityAnalyzer()
print(type(sent_obj))
## score one sample phrase as a quick check of the scorer's output
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"
sentiment_example = sent_obj.polarity_scores(practice_listing)
## bare expression so the notebook displays the returned scores
sentiment_example

In [None]:
## Score every cleaned caption; each cell of "sentiment" holds what polarity_scores returns.
## NOTE(review): text_clean_str appears to be the lowercased, stopword-stripped, stemmed text;
## VADER is lexicon-based, so scoring the raw captions may be more faithful — confirm.
all_df["sentiment"] = (
    all_df["text_clean_str"]
    .astype(str)
    .map(sent_obj.polarity_scores)
)

In [None]:
## pull the single "compound" value out of each per-row score dict
all_df["compound"] = [scores["compound"] for scores in all_df["sentiment"]]

In [None]:
import seaborn as sns

## scatter with fitted regression line: view count (x) against compound sentiment (y)
sns.regplot(data=all_df, x="statistics.views", y="compound")

In [None]:
## one box per brand showing the spread of compound sentiment
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=all_df, x="brand", y="compound", palette="Set2", ax=ax)
plt.xticks(rotation=45)  # brand names are slanted so they stay readable
ax.set_title("Sentiment Distribution per Brand")
fig.tight_layout()
plt.show()

In [None]:
## mean compound score per brand, ordered from lowest to highest
avg_sentiment = (
    all_df
    .groupby("brand")["compound"]
    .mean()
    .sort_values()
)
## horizontal bars: brand on the y axis, mean score on the x axis
sns.barplot(x=avg_sentiment.to_numpy(), y=avg_sentiment.index)
plt.title("Average Sentiment Score by Brand")

In [None]:
## like count vs. compound sentiment, with a separate fit per is_branded_content value
sns.lmplot(
    data=all_df,
    x="statistics.like_count",
    y="compound",
    hue="is_branded_content",
    scatter_kws=dict(alpha=0.4),
)

In [None]:
## convert to list 
def _parse_hashtag_cell(value):
    """Evaluate text that starts with "[" as a Python literal; return any other value unchanged."""
    if isinstance(value, str) and value.startswith("["):
        return ast.literal_eval(value)
    return value


## convert stringified lists back into real lists
all_df['hashtags'] = all_df['hashtags'].map(_parse_hashtag_cell)
## one row per (post, hashtag) pair
hashtag_df = all_df.explode("hashtags")

In [None]:
#normalize hashtag text
## Drop rows without a hashtag BEFORE casting to str. Casting first turns missing values
## into the literal text "nan", which makes the notna() filter a no-op and lets "nan"
## show up as if it were a real hashtag in the aggregations.
## .copy() gives an independent frame, so the column assignment below does not write
## into a slice of the exploded frame.
hashtag_df = hashtag_df[hashtag_df["hashtags"].notna()].copy()

# normalize hashtag text
hashtag_df["hashtags"] = hashtag_df["hashtags"].astype(str).str.lower().str.strip()

## discard hashtags that are empty once whitespace is stripped
hashtag_df = hashtag_df[hashtag_df["hashtags"] != ""]

In [None]:
# Aggregate sentiment per brand and hashtag
# Aggregate sentiment per brand and hashtag: mean compound score and number of rows
by_brand_and_tag = hashtag_df.groupby(["brand", "hashtags"])
hashtag_sentiment = by_brand_and_tag.agg(
    avg_sentiment=("compound", "mean"),
    count=("hashtags", "count"),
).reset_index()

In [None]:
## Same aggregation across all brands, ranked from most to least positive hashtag
by_tag = hashtag_df.groupby("hashtags")
hashtag_stats = by_tag.agg(
    avg_sentiment=("compound", "mean"),
    count=("compound", "count"),
).sort_values("avg_sentiment", ascending=False)

In [None]:
## keep hashtags with at least 50 scored rows, then preview the top of the ranking
used_often_enough = hashtag_stats["count"] >= 50
filtered = hashtag_stats.loc[used_often_enough]
filtered.head()

In [None]:
# Loop through each brand
brands = hashtag_sentiment["brand"].unique()
hashtag_sentiment["hashtags"] = hashtag_sentiment["hashtags"].astype(str)
hashtag_sentiment = hashtag_sentiment[
    hashtag_sentiment["hashtags"].apply(lambda x: all(char in string.printable for char in x))
]

for brand in brands:
    # Filter sentiment data to current brand
    filtered = hashtag_sentiment[hashtag_sentiment["brand"] == brand]

    # Apply frequency filter: only hashtags used at least 10 times
    filtered = filtered[filtered["count"] >= 10]

    # Sort by average sentiment
    filtered = filtered.sort_values("avg_sentiment", ascending=False)

    # Skip brands with fewer than 10 qualifying hashtags
    if len(filtered) < 10:
        continue

    top10 = filtered.head(10)
    bottom10 = filtered.tail(10)

    # Top 10 Hashtags
    plt.figure(figsize=(10, 5))
    plt.barh(top10["hashtags"], top10["avg_sentiment"], color="green")
    plt.title(f"{brand}: Top 10 Hashtags by Average Sentiment (min 10 uses)")
    plt.xlabel("Avg Compound Sentiment")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    # Bottom 10 Hashtags
    plt.figure(figsize=(10, 5))
    plt.barh(bottom10["hashtags"], bottom10["avg_sentiment"], color="red")
    plt.title(f"{brand}: Bottom 10 Hashtags by Average Sentiment (min 50 uses)")
    plt.xlabel("Avg Compound Sentiment")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()


## LDA

In [None]:
## inputs for the document-term matrix
dtm_texts = all_df["text_clean_str"].fillna("")  # Replace NaN with empty string
dtm_metadata = all_df[["is_branded_content", "hashtags", "post_owner.name"]]

## metadata columns come first in the result, followed by one column per term
dtm_nopre = create_dtm(list_of_strings=dtm_texts, metadata=dtm_metadata)

In [None]:
## show first set of rows/cols
dtm_nopre.head()

## show arbitrary later cols in resulting data
dtm_nopre.shape
dtm_nopre.iloc[0:5, 480:500]

In [None]:
## create_dtm puts the metadata columns first, and the cell that builds dtm_nopre passes
## three of them (is_branded_content, hashtags, post_owner.name), so the term columns
## start at position 3. Slicing from 4 silently left the first vocabulary term out.
## Positional slicing is used so a term that shares a name with a metadata column
## cannot pull that metadata column into the sum.
n_metadata_cols = 3
top_terms = dtm_nopre.iloc[:, n_metadata_cols:].sum(axis = 0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

In [None]:
## Step 1: re-tokenize the preprocessed texts and store in a list
## Missing cleaned texts are replaced with "" first (same handling as the DTM cell):
## wordpunct_tokenize only accepts strings, so a NaN row would raise here.
text_raw_tokens = [wordpunct_tokenize(one_text) for one_text in
                   all_df.text_clean_str.fillna("").astype(str)]


## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


## Step 3: filter out very rare and very common words
## a word needs to appear in at least 5% of docs but in no more than 95% of them.
## filter_extremes takes the two limits in DIFFERENT units:
##   no_below is an absolute number of documents (hence the rounding),
##   no_above is a fraction of the corpus.
## Passing a document count as no_above (as before) disables the upper filter entirely.
lower_bound = round(all_df.shape[0]*0.05)
upper_bound = round(all_df.shape[0]*0.95)  # absolute equivalent, kept for reference only
upper_fraction = 0.95

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                              no_above = upper_fraction)
print('Filtering out very rare and very common words reduced the '
      f'length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering

## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word
## in a specific document and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text)
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the document bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

In [None]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower),
## (5) alpha = 'auto', which asks gensim to learn the document-topic prior from the corpus,
## (6) per_word_topics, which makes per-word topic assignments available from the model, and
## (7) a fixed random_state: fitting is stochastic, so without a seed the topics
##     (and every table/plot derived from them) change on each run.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict,
                                         num_topics = 8,
                                         id2word=text_raw_dict,
                                         passes=6,
                                         alpha = 'auto',
                                         per_word_topics = True,
                                         random_state = 42)

print(type(ldamod))

In [None]:
## Post-model 1: explore corpus-wide summary of topics
### fetch each topic with its 10 top words (num_words controls how many are shown)
topics = ldamod.print_topics(num_words = 10)
### one topic per output line
print(*topics, sep="\n")

In [None]:
    
## Post-model 2: explore topics associated with each document
### for each document's bag-of-words in the corpus, ask the model for its topic distribution
l = list(map(ldamod.get_document_topics, corpus_fromdict))
### show the first five tokenized documents, then their topic distributions
text_raw_tokens[:5]
l[:5]

In [None]:
## build the pyLDAvis data from the fitted model, its corpus and its dictionary, then render it
lda_display = gensimvis.prepare(
    topic_model=ldamod,
    corpus=corpus_fromdict,
    dictionary=text_raw_dict,
)
pyLDAvis.display(lda_display)