# **Topic Modelling**

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from collections import Counter
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import spacy
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import gensim
from gensim.models import KeyedVectors
from pprint import pprint
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pilyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.DataFrame(columns= ['source', 'source_type', 'date', 'author', 'headline', 'lead', 'body', 'keywords', 'country'])

input_data = {
    'jasper.csv': 'chatbot',
    'chatgpt.csv': 'chatbot',
    'perplexity.csv': 'chatbot',
    'cosas_pe.csv': 'human',
    'harpersbazaar_mx.csv': 'human',
    'instyle_es.csv': 'human',
    'lofficiel_ar.csv': 'human',
    'oceandrive_ve.csv': 'human',
    'paula_cl.csv': 'human',
    'somosohlala_ar.csv': 'human',
    'telva_es.csv': 'human',
    'vistelacalle_cl.csv': 'human',
    'vogue_es.csv': 'human',
    'vogue_mx.csv': 'human',
}

for filename, source_type in input_data.items():
    data = pd.read_csv(f'../data/{filename}', header=0, delimiter=",", encoding='utf-8', dtype=str)
    data['source_type'] = source_type
    df = pd.concat([df, data], ignore_index=True)

df['date'] = pd.to_datetime(df['date'], format='ISO8601', utc=True)
df['author'] = df['author'].str.title()
df.sample(5)

Unnamed: 0,source,source_type,date,author,headline,lead,body,keywords,country
1048,Vogue Spain,human,2020-03-11 09:31:51.813000+00:00,Julia Hobbs,Los 8 principales temas de conversación del me...,Aquí tienes los 8 temas que más han dado que h...,El mes de la moda temporada otoño-invierno 202...,"tendencias otoño-invierno 2020-2021,semanas de...",Spain
674,Telva,human,2022-03-24 13:03:16+00:00,The Newsroom,Los marketplaces que reúnen marcas conscientes...,Reunimos los marketplaces que apuestan por la ...,"Propuestas con una producción artesanal , vega...","moda,moda sostenible,Noticias de moda",Spain
91,Revista COSAS,human,2023-02-06 20:49:58+00:00,Gabriela Peña,Entrevista al arquitecto Tom Gimbert: Hacia un...,"Desde hace una década, Tom Gimbert, arquitecto...","Desde hace una década, Tom Gimbert, arquitecto...",Tom Gimbert,Peru
765,Viste La Calle,human,2018-07-12 14:00:39+00:00,Andrea Martínez Maugard,Entrevista a los creadores de I-Tal Collective...,"Vicente, uno de los creadores de I-Tal Collect...","Vicente, uno de los creadores de I-Tal Collect...",moda,Chile
429,Ocean Drive,human,2020-10-29 16:38:11+00:00,Luis Fernando Jimenez,MODA SOSTENIBLE A UN NUEVO NIVEL: ZARA Y PANTO...,Aunque el 2020 no ha sido como muchos lo esper...,Aunque el 2020 no ha sido como muchos lo esper...,"eco-friendly,moda ecológica,sostenibilidad,Zara",Venezuela


### Prepare data 

In [3]:
df['combined_text'] = df[['headline', 'lead', 'body']].fillna('').apply(lambda x: ' '.join(x), axis=1)
df['combined_text'] 

0       La industria de la moda es uno de los sectores...
1       Industria de la moda y moda sostenible La pala...
2       ¿Cómo la industria de la moda latinoamericana ...
3       Industria de la moda, moda sostenible y moda s...
4       La industria de la moda se une para el Día Mun...
                              ...                        
1145    Blake Lively confirma la tendencia de los vest...
1146    Dakota Johnson eleva un crop top básico con pa...
1147    Cómo llevar vestido boho transparente y lucir ...
1148    Anitta lleva pantalones baggy y sandalias de t...
1149    Culpar a la sobrepoblación del cambio climátic...
Name: combined_text, Length: 1150, dtype: object

In [4]:
nlp = spacy.load("es_core_news_md", disable = ["ner","parser"])

In [5]:
from unidecode import unidecode # Remove accents

def cleaning(doc):
    txt = [unidecode(token.lemma_.lower()) for token in doc if not token.is_stop and len(token) > 1 and token.lemma_ != 'el' and token.text != 'el'] # this line tokenizes, takes out stopwords, and returns lemmas
    return " ".join(txt)

def process(series):
    docs = [row for row in series]
    txts = [cleaning(doc) for doc in nlp.pipe(docs, n_process=-1, batch_size=500)] # nlp.pipe is a bit of spacy magic to run the process efficiently 
    return txts

In [6]:
df['combined_text'] = process(df['combined_text'])
df.sample(10)

Unnamed: 0,source,source_type,date,author,headline,lead,body,keywords,country,combined_text
607,Telva,human,2022-07-21 06:28:55+00:00,Elisa Álvarez Espejo,La firma gallega que está detrás de las blusas...,Las blusas (y vestidos) de la firma Andion no ...,El talento de los gallegos junto con su trabaj...,"moda,Tendencias de moda,moda sostenible,Looks",Spain,firma gallego blusa likes ig marca merecer pen...
810,Viste La Calle,human,2016-11-17 16:00:22+00:00,Vistelacalle,"In.Seek.Too, la marca de ropa sustentable y mu...",Colaboración por Paulina Mardones Una misma ...,Colaboración por Paulina Mardones Una misma pr...,moda,Chile,in seek too marca ropa sustentable multifuncio...
944,Vogue Spain,human,2021-01-18 14:57:49.330000+00:00,Emily Chan,Stella McCartney nos da las claves para lograr...,Stella McCartney se ha dirigido a los alumnos ...,Stella McCartney está ahora mismo pasando tran...,"stella mccartney,sostenibilidad,vogue 365",Spain,stella mccartney clave lograr industria sosten...
1086,Vogue Spain,human,2020-05-02 07:03:10.138000+00:00,Patricia Moreno,"Cuando acabe el confinamiento, ¿compraremos de...",Expertos en sostenibilidad y consumo creen que...,Descarga el número íntegro y gratuito de Vogue...,"sostenibilidad,ecológico,stella mccartney,h&m,...",Spain,acabar confinamiento compraremos sostenible la...
179,Revista COSAS,human,2021-10-02 02:19:54+00:00,Arianna Gonzales,Paris Fashion Week: El resumen de las mejores ...,"En esta nota, descubre el resumen definitivo d...",El calendario de moda 2021 llega a su parte fi...,"Balmain,Loewe,YSL",Peru,paris fashion week resumen mejor pasarela nota...
234,Harper's Bazaar,human,2021-11-12 00:00:00+00:00,Lorenza García,FOTOS: Las mejores vestidas los de los CDFA Fa...,Los CFDA Fashion Awards 2021 fueron todo un éx...,Los CFDA Fashion Awards estuvieron a cargo de ...,"looks,Cara Delevingne,CFDA,Zendaya,Emily Blunt...",Mexico,foto mejor vestido cdfa fashion awards cfda fa...
130,Revista COSAS,human,2023-01-15 21:44:08+00:00,Walter Chunga,"¿Quién es R'Bonney Gabriel, la ganadora del Mi...",La modelo y diseñadora de modas estadounidense...,La modelo y diseñadora de modas estadounidense...,"EE.UU.,Estados Unidos,Miss Universo 2022,R'Bon...",Peru,r'bonney gabriel ganadora miss universo 2022 m...
874,Vogue Spain,human,2022-06-25 12:55:46.770000+00:00,Tatiana Ojea,"SUAGONGO, la firma de joyas atemporales y sost...","Aunando tecnología, sostenibilidad y artesanía...",SUAGONGO es una firma de joyas minimalistas y ...,"joyas,sostenibilidad,pendientes,vogue 365,moda...",Spain,suagongo firma joya atemporal sostenible auten...
1146,Vogue Mexico,human,2023-04-28 16:59:40+00:00,Katharina Fuchs,Dakota Johnson eleva un crop top básico con pa...,Los crop tops lisos no son precisamente famoso...,Dakota Johnson lo demuestra con su look más re...,Dakota Johnson,Mexico,dakota johnson elevar crop top basico pantalon...
14,Jasper,chatbot,2023-04-06 10:07:27+00:00,Jasper,El futuro de la moda sostenible: ¿Qué cambios ...,El futuro de la moda sostenible está aquí para...,Los últimos años han visto un gran impulso en ...,"moda,moda futura,moda sostenible,moda sustentable",,futuro moda sostenible cambio esperar ano \r ...


In [7]:
texts = [text.split() for text in df['combined_text'].tolist()]

In [8]:
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

In [9]:
print(corpus[:2])
#the first number is the index of the word in the vocabulary
#the second number is the frequency of this word in the specific document

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 3), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 4), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 3), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 3), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 13), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 3), (108, 1), (109, 1), (110, 1

In [10]:
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]])

[[('3d', 1), ('abrumador', 1), ('acceso', 1), ('accesorio', 2), ('agregar', 1), ('alimentar', 1), ('ambiente', 1), ('america', 1), ('ano', 2), ('aprender', 1), ('area', 1), ('asequible', 1), ('asia', 1), ('aspecto', 1), ('auge', 1), ('avance', 1), ('avanzar', 1), ('ayudar', 1), ('buscar', 1), ('cambiar', 1), ('cambio', 1), ('comercializacion', 1), ('comienzo', 1), ('competitivo', 1), ('complejo', 1), ('comprador', 1), ('comprension', 1), ('comprometido', 1), ('comunicar', 1), ('conexion', 1), ('consciente', 1), ('considerar', 1), ('constante', 1), ('consumidor', 3), ('continuar', 1), ('contribuir', 1), ('crear', 2), ('crecer', 1), ('crecimiento', 1), ('demanda', 1), ('desafio', 2), ('desarrollo', 1), ('describir', 1), ('descubrir', 1), ('digital', 1), ('dinamica', 1), ('dinamico', 1), ('directo', 1), ('disenador', 1), ('diseno', 4), ('economia', 1), ('ejemplo', 1), ('elemento', 1), ('emergente', 1), ('emocion', 1), ('emocionante', 2), ('emprender', 1), ('empresa', 1), ('energizante', 1

### LDA Topic Modeling

LDA is a mathematical model used to locate and measure the mix of words associated with each topic and, in turn, determine the mix of topics used to describe each document. Resources: http://www.morethanbooks.eu/topic-modeling-introduccion/ and http://www.cs.columbia.edu/~blei/papers/Blei2012.pdf

Some explanation of the parameters:
    
- `corpus` : the bag of word corpus we have created in the previous cell
- `id2word`: the vocabulary index
- `num_topic`: the number of topics we want to create
- `random_state`: a number (seed) to replicate the results if the topic modeling is run by someone else or at a different moment
- `update_every`: Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning.
- `chunksize`: Number of documents to be used in each training chunk (i.e., how many documents are processed at a time in the training algorithm). The higher the better, but it depends also on the number of documents we have in the corpus. With big collections of documents we can use values like 100 or higher.
- `passes`: Number of passes through the corpus during training (i.e., how often we train the model on the entire corpus; how often we repeat a particular loop over each document)
- `per_word_topics`: If True, the model also computes a list of topics, sorted in descending order of most likely topics for each word, along with their phi values multiplied by the feature length (i.e. word count).

In [11]:
# First training 

In [12]:
# The number of documents to be used in this training is 50 (chunksize) 
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=3, random_state=42, 
   update_every=1, chunksize=50, passes=10, alpha='auto', per_word_topics=True
)

In [13]:
# Print the topic (number before comma), and the list of associated words per each topic (The numbers next to the words represent the importance or weight of each word in the given topic)
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.016*"moda" + 0.012*"marca" + 0.010*"sostenible" + 0.010*"el" + '
  '0.009*"prenda" + 0.009*"ropa" + 0.008*"industria" + 0.007*"producto" + '
  '0.006*"material" + 0.006*"comprar"'),
 (1,
  '0.011*"el" + 0.008*"ano" + 0.005*"vida" + 0.004*"mundo" + 0.004*"persona" + '
  '0.004*"climatico" + 0.004*"mujer" + 0.003*"querer" + 0.003*"ver" + '
  '0.003*"cosa"'),
 (2,
  '0.009*"moda" + 0.009*"coleccion" + 0.008*"vestido" + 0.007*"diseno" + '
  '0.006*"pieza" + 0.006*"el" + 0.006*"marca" + 0.006*"vintage" + '
  '0.005*"prenda" + 0.005*"firma"')]


In [14]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [15]:
# Second training 

In [16]:
# The number of documents to be used in this training is 100 (chunksize) 
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=3, random_state=42, 
   update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
)

In [17]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.018*"moda" + 0.015*"marca" + 0.011*"prenda" + 0.010*"ropa" + '
  '0.009*"sostenible" + 0.008*"el" + 0.008*"industria" + 0.007*"material" + '
  '0.006*"comprar" + 0.006*"producto"'),
 (1,
  '0.012*"el" + 0.007*"ano" + 0.005*"vida" + 0.004*"mundo" + 0.004*"climatico" '
  '+ 0.003*"cambio" + 0.003*"mujer" + 0.003*"persona" + 0.003*"querer" + '
  '0.003*"poder"'),
 (2,
  '0.008*"coleccion" + 0.008*"vestido" + 0.008*"moda" + 0.006*"el" + '
  '0.006*"pieza" + 0.006*"diseno" + 0.005*"vintage" + 0.005*"look" + '
  '0.005*"ano" + 0.004*"marca"')]


In [18]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [19]:
# Save the figure in PNG
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
import tempfile
import webbrowser

# Generate PyLDAvis visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

# Save the visualization as HTML file
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
pyLDAvis.save_html(vis, temp_file.name)

# Open the HTML file in a web browser
webbrowser.open(temp_file.name)


True

### TOPIC MODELLING HUMAN TEXTS

In [20]:
human_data = df[df['source_type'] == 'human']
human_data['combined_text'] 

86      sw apuesta moda sostenible susan wagner rastri...
87      guia cosa marca moda sostenible peruano poder ...
88      marca peruano invitar festival moda sostenible...
89      disenadores peruano presentar coleccion feria ...
90      radicales sostenible vivienda hermoso hecho ba...
                              ...                        
1145    blake lively confirmar tendencia vestido gabar...
1146    dakota johnson elevar crop top basico pantalon...
1147    vestido boho transparente lucir sexy 40 vestid...
1148    anitta pantalon baggy sandalia tacon 2000 cant...
1149    culpar sobrepoblacion cambio climatico problem...
Name: combined_text, Length: 1064, dtype: object

In [21]:
texts = [text.split() for text in human_data['combined_text'].tolist()]
id2word = corpora.Dictionary(texts)  #
corpus = [id2word.doc2bow(text) for text in texts]

In [22]:
# The number of documents to be used in this training is 100 (chunksize) 
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=3, random_state=42, 
   update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
)

In [23]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"vestido" + 0.008*"vintage" + 0.007*"moda" + 0.006*"look" + '
  '0.005*"the" + 0.005*"coleccion" + 0.004*"ano" + 0.004*"lucir" + '
  '0.004*"pieza" + 0.004*"el"'),
 (1,
  '0.010*"el" + 0.005*"producto" + 0.004*"natural" + 0.004*"piel" + '
  '0.003*"belleza" + 0.003*"vida" + 0.003*"color" + 0.003*"bolso" + '
  '0.003*"zapatilla" + 0.002*"envase"'),
 (2,
  '0.013*"moda" + 0.010*"marca" + 0.010*"el" + 0.008*"prenda" + 0.006*"ano" + '
  '0.006*"ropa" + 0.006*"sostenible" + 0.005*"industria" + 0.004*"mundo" + '
  '0.004*"material"')]


In [24]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [None]:
# Generate PyLDAvis visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

# Save the visualization as HTML file
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
pyLDAvis.save_html(vis, temp_file.name)

# Open the HTML file in a web browser
webbrowser.open(temp_file.name)

### TOPIC MODELLING CHATBOT TEXTS

In [25]:
chatbot_data = df[df['source_type'] == 'chatbot']
chatbot_data['combined_text'] 

0     industria moda sector dinamico competitivo mun...
1     industria moda moda sostenible palabra modo an...
2     industria moda latinoamericano transformar reg...
3     industria moda moda sostenible moda sustentabl...
4     industria moda unir mundial agua ano 22 marzo ...
                            ...                        
81    aja barber activista moda sostenible luchar fa...
82    aja barber activista luchar sostenibilidad mod...
83    moda sostenible espana chile marca apostar sos...
84    moda sostenible espana chile marca apostar sos...
85    moda sostenible espana chile marca apostar sos...
Name: combined_text, Length: 86, dtype: object

In [26]:
texts = [text.split() for text in chatbot_data['combined_text'].tolist()]
id2word = corpora.Dictionary(texts) 
corpus = [id2word.doc2bow(text) for text in texts]

In [27]:
# The number of documents to be used in this training is 10 (chunksize). Being a smaller dataset, the amount of chucksize was lowered
lda_model = gensim.models.ldamodel.LdaModel(
   corpus=corpus, id2word=id2word, num_topics=3, random_state=42, 
   update_every=1, chunksize=10, passes=10, alpha='auto', per_word_topics=True
)

In [28]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.070*"moda" + 0.025*"industria" + 0.019*"barber" + 0.013*"forma" + '
  '0.011*"aja" + 0.010*"activista" + 0.008*"descolonizacion" + '
  '0.008*"explotacion" + 0.007*"resistencia" + 0.007*"textil"'),
 (1,
  '0.037*"marca" + 0.021*"cena" + 0.015*"ropa" + 0.015*"chile" + '
  '0.014*"elegir" + 0.011*"sostenibilidad" + 0.011*"outfit" + 0.011*"optar" + '
  '0.010*"tienda" + 0.010*"comprar"'),
 (2,
  '0.065*"moda" + 0.063*"sostenible" + 0.024*"marca" + 0.019*"industria" + '
  '0.018*"produccion" + 0.017*"impacto" + 0.016*"ropa" + 0.016*"prenda" + '
  '0.015*"ambiental" + 0.013*"material"')]


In [29]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [None]:
# Generate PyLDAvis visualization
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

# Save the visualization as HTML file
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
pyLDAvis.save_html(vis, temp_file.name)

# Open the HTML file in a web browser
webbrowser.open(temp_file.name)