<a href="https://colab.research.google.com/github/pedro9olivares/ML_and_AI_for_the_Working_Analyst/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introducción: carga de datos, exploración y filtrado

In [None]:
!pip install pyLDAvis  # Para visualizar los resultados de LDA

In [None]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt 
import seaborn as sns  

nltk.download('stopwords')
nltk.download('wordnet')  #WordNetLemmatizer
nltk.download('punkt')

In [7]:
def preprocesar(texto):
  # Convierte a minúsculas
  texto = (texto).lower()

  # Elimina stopwords
  stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  texto = stop.sub('', texto) 

  # Quitar puntuación y números
  texto = re.sub('[^ña-z]+', ' ', texto)

  # Lematizar y quedarnos con palabras que tengan más de tres caracteres
  lemmatizer = WordNetLemmatizer()
  texto = texto.split()
  texto = ' '.join([lemmatizer.lemmatize(i) for i in texto if len(i)>2])
  
  return(texto)

In [3]:
df = pd.read_csv('/content/drive/MyDrive/LDA/abcnews-date-text.csv', parse_dates=['publish_date'] )
print(df.shape)
df.head()

(1226258, 2)


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [4]:
df.publish_date.min(), df.publish_date.max()

(Timestamp('2003-02-19 00:00:00'), Timestamp('2020-12-31 00:00:00'))

In [5]:
# Promedio de noticias por día
len(df)/(df.publish_date.max()-df.publish_date.min()).days

187.93226053639847

In [6]:
# Filtrado a todas las noticias del 2020
filtro = (df['publish_date']>= '2020-01-01')&(df['publish_date']<= '2020-12-31')
df = df[filtro]
df = df.reset_index(drop=True )
len(df)

40240

In [8]:
df['headline_pp']=df['headline_text'].apply(preprocesar)
df.head()

Unnamed: 0,publish_date,headline_text,headline_pp
0,2020-01-01,a new type of resolution for the new year,new type resolution new year
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight new year eve f...
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...


In [9]:
# Bag of words (BOW)
vectorizer = CountVectorizer(min_df=3, ngram_range=(1,2))
BOW = vectorizer.fit_transform(df['headline_pp'])
BOW.shape

(40240, 17889)

In [11]:
# Número de palabras distintas
vocabulario = vectorizer.get_feature_names_out()
len(vocabulario)

17889

# LDA


In [13]:
# n_components especifica en cuántos temas particionará el dataset
lda_model=LatentDirichletAllocation(n_components=5,learning_method='online',random_state=42,max_iter=50)

In [14]:
%%time
lda_model.fit(BOW) # Entrena el modelo y obtiene la matriz documento-topico

CPU times: user 4min 41s, sys: 1.73 s, total: 4min 43s
Wall time: 4min 42s


LatentDirichletAllocation(learning_method='online', max_iter=50, n_components=5,
                          random_state=42)

## Distribución de temas en cada noticia (O ó ϴ) 
(Relación de cada encabezado con cierto tema)

In [18]:
document_topic = pd.DataFrame(lda_model.transform(BOW))
print(document_topic.shape)
document_topic.head()

(40240, 5)


Unnamed: 0,0,1,2,3,4
0,0.199279,0.028573,0.184108,0.028812,0.559229
1,0.025041,0.025081,0.025191,0.025001,0.899686
2,0.693037,0.01693,0.017398,0.112981,0.159655
3,0.365887,0.532855,0.034198,0.033334,0.033726
4,0.239982,0.439999,0.04,0.04,0.240019


In [21]:
pd.merge(df, document_topic, left_index=True, right_index=True)

Unnamed: 0,publish_date,headline_text,headline_pp,0,1,2,3,4
0,2020-01-01,a new type of resolution for the new year,new type resolution new year,0.199279,0.028573,0.184108,0.028812,0.559229
1,2020-01-01,adelaide records driest year in more than a de...,adelaide record driest year decade,0.025041,0.025081,0.025191,0.025001,0.899686
2,2020-01-01,adelaide riverbank catches alight after new ye...,adelaide riverbank catch alight new year eve f...,0.693037,0.016930,0.017398,0.112981,0.159655
3,2020-01-01,adelaides 9pm fireworks spark blaze on riverbank,adelaide firework spark blaze riverbank,0.365887,0.532855,0.034198,0.033334,0.033726
4,2020-01-01,archaic legislation governing nt women propert...,archaic legislation governing woman property r...,0.239982,0.439999,0.040000,0.040000,0.240019
...,...,...,...,...,...,...,...,...
40235,2020-12-31,what abc readers learned from 2020 looking bac...,abc reader learned looking back year,0.020312,0.143208,0.020001,0.696724,0.119755
40236,2020-12-31,what are the south african and uk variants of ...,south african variant covid,0.033334,0.366653,0.034192,0.366597,0.199225
40237,2020-12-31,what victorias coronavirus restrictions mean f...,victoria coronavirus restriction mean new year...,0.374914,0.015385,0.015421,0.578345,0.015936
40238,2020-12-31,whats life like as an american doctor during c...,whats life like american doctor covid,0.149999,0.399955,0.025230,0.274785,0.150030


## Distribución de palabras en cada tema (μ)

In [22]:
μs = pd.DataFrame(lda_model.exp_dirichlet_component_,
                         columns=vocabulario)
print(μs.shape)
μs.head()

(5, 17889)


Unnamed: 0,aacta,aaron,ab,abandon,abandoned,abandoned baby,abattoir,abbott,abc,abc analyst,...,zlate,zlate cvetanovski,zodiac,zoe,zombie,zone,zoo,zoom,zuckerberg,zverev
0,8.055449e-08,8.394868e-08,8.290329e-08,8.59712e-08,0.0002583584,4.856293e-05,8.847004e-08,0.0001606078,9.363507e-08,7.99874e-08,...,8.007074e-08,8.007074e-08,8.259616e-08,8.386654e-08,5.527396e-05,1.010441e-07,8.700062e-08,0.0002163308,7.941396e-08,8.010021e-08
1,5.479865e-05,9.063174e-08,8.905556e-08,9.687651e-08,9.750708e-08,9.057895e-08,0.0001858361,9.396337e-08,0.002232402,8.905083e-08,...,8.799667e-08,8.799667e-08,5.563037e-05,9.990114e-08,9.725587e-08,9.643613e-08,9.649468e-08,1.009692e-07,4.355457e-05,0.0001318067
2,8.937147e-08,4.060349e-05,8.383059e-08,0.000145318,9.298521e-08,9.201562e-08,9.192969e-08,8.576488e-08,8.999806e-08,8.442723e-08,...,9.009405e-08,9.009405e-08,8.384335e-08,4.324443e-05,8.518493e-08,0.0006206949,0.0005930656,9.235608e-08,8.460164e-08,8.441493e-08
3,6.273702e-08,6.972041e-08,0.0002316852,7.293856e-08,7.287104e-08,6.769409e-08,0.0001518431,6.654507e-08,0.001102018,4.525883e-05,...,6.478957e-08,6.478957e-08,6.27436e-08,6.877298e-08,6.273777e-08,7.352419e-08,6.603361e-08,6.68622e-08,7.040503e-08,6.2908e-08
4,7.476241e-08,7.598304e-08,7.571805e-08,7.872401e-08,8.212703e-08,7.659022e-08,8.005516e-08,7.808353e-08,8.047507e-08,7.373083e-08,...,3.923673e-05,3.923673e-05,7.70101e-08,7.356924e-08,7.502965e-08,8.437525e-08,8.074065e-08,7.812199e-08,7.355545e-08,7.354552e-08


# Visualización

In [26]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, BOW, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# (Opcional) Guardado y lectura del modelo

In [27]:
import pickle

In [None]:
# Guardado
"""
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
tuple_models = (lda_model, BOW, vectorizer)
pickle.dump(tuple_models, open (path + "tuple_model_news2020.pkl", 'wb'))
"""

In [None]:
# Lectura
"""
path = '/content/drive/MyDrive/Modelos/modelosLDA/LDA_sklearn/'
lda_model, BOW, vectorizer = pickle.load(open(path + "tuple_model_news2020.pkl", 'rb'))
"""