In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import pyLDAvis.lda_model

# Topic Modeling with El Litoral and Rosario12 news

First of all, we load the datasets and clean them.

In [3]:
# Load both raw datasets from their Excel workbooks:
#   - df_p12: news between 2008 and 2016 from Rosario12
#   - df_el:  news between 2008 and 2020 from El Litoral
df_p12, df_el = (
    pd.read_excel(workbook)
    for workbook in ("p12_2008_2016.xlsx", "el_2008_2020.xlsx")
)

In [5]:
# Fetch the NLTK resources used below (stop word lists and the punkt tokenizer)
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Spanish stop words, and a pattern matching every character that is neither
# a letter (including Spanish accented letters) nor whitespace
stop_words = set(stopwords.words('spanish'))
non_letter = re.compile(r'[^a-zA-Z\sáéíóúüñÁÉÍÓÚÜÑ]')


def _clean(text):
    """Clean one already-lowercased article.

    Removes special characters, numbers and punctuation, tokenizes the
    result, drops Spanish stop words and collapses extra whitespace.
    """
    letters_only = non_letter.sub('', text)
    kept = [word for word in word_tokenize(letters_only) if word not in stop_words]
    return ' '.join(' '.join(kept).split())


# rows without content cannot be cleaned, so they are dropped first
df_p12 = df_p12.dropna(subset=['content'])

# lowercase the content, then run the cleaning steps on every article
df_p12['cleaned_content'] = df_p12['content'].str.lower().apply(_clean)

# show the original text next to the cleaned text
df_p12[['content', 'cleaned_content']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patricio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\patricio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,content,cleaned_content
0,"Como consecuencia del impacto, el hombre que i...",consecuencia impacto hombre iba acompañante co...
1,"Por Paula Kearney, El Programa de Promoción F...",paula kearney programa promoción familiar area...
2,Otra de las cuestiones sobre las que trabajan ...,cuestiones trabajan operadores calle consumo d...
3,"Por María del Carmen Arias * , La noción de t...",maría carmen arias noción toxicomanía sufrido ...
4,"Edición: anterior \n siguiente , © 2000-...",edición anterior siguiente wwwpaginacomar repú...


In [6]:
# Tag every row of each dataframe with the media outlet it belongs to
for frame, outlet in ((df_el, "litoral"), (df_p12, "p12")):
    frame["media"] = outlet

In [7]:
# Limit the Rosario12 entries to the dates that are part of our analysis and
# check how many news items remain.
# NOTE(review): the lower bound is exclusive (>) and only df_p12 is filtered
# here; df_el keeps its full date range — confirm both are intended.
start_date, end_date = "2008-1-1", "2015-12-31"

after_start = df_p12['date'] > start_date
until_end = df_p12['date'] <= end_date
mask = after_start & until_end

df_p12 = df_p12[mask]
len(df_p12)

360

In [8]:
# Remove the columns that are not needed for the analysis. errors="ignore"
# skips any listed column that a dataframe does not have.
df_p12 = df_p12.drop(columns=["position", "day", "month", "year", "description"], errors="ignore")
df_el = df_el.drop(columns=["query", "subtitle"], errors="ignore")

# Both dataframes should now expose the same columns
for frame in (df_p12, df_el):
    print(frame.columns)

Index(['date', 'title', 'url', 'content', 'cleaned_content', 'media'], dtype='object')
Index(['date', 'title', 'url', 'content', 'cleaned_content', 'media'], dtype='object')


In [9]:
# Stack both outlets into a single dataframe with a fresh index, discard the
# rows that have no cleaned content, and count the articles per media outlet
df = (
    pd.concat([df_el, df_p12], axis=0, ignore_index=True)
    .dropna(subset=['cleaned_content'])
)

df.value_counts("media")

media
litoral    510
p12        360
dtype: int64

## Topic modeling with LDA (2008-2011)

Here we lemmatize the content and vectorize it. The result is a matrix (X) in which each row corresponds to a document (text sample) and each column corresponds to a word in the vocabulary. Each value in the matrix is the count of that word in the corresponding document.

In [None]:
# Keep only the articles dated up to the end of 2011. The explicit copy makes
# df1 an independent dataframe, so the column assignments below do not raise
# pandas' SettingWithCopyWarning.
end_date = "2011-12-31"
mask = (df['date'] <= end_date)

df1 = df.loc[mask].copy()

# load the spaCy model for the Spanish language
nlp = spacy.load("es_core_news_sm")

# set the stop words: spaCy's Spanish list plus corpus-specific noise tokens.
# NOTE: update() modifies spaCy's STOP_WORDS set in place, so the added words
# remain in that set for every later use of it in this kernel.
stop_words_spanish = spacy.lang.es.stop_words.STOP_WORDS
stop_words_spanish.update(["año", "wwwpaginacomar", "software", "desarrollado", "gnulinux"])


def lemmatize(text):
    """Return `text` as a space-separated string of lemmas.

    The text is processed with the `nlp` pipeline; tokens whose lowercased
    form is in `stop_words_spanish` are left out.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if token.text.lower() not in stop_words_spanish)


df1['cleaned_content'] = df1['cleaned_content'].apply(lemmatize)

# vectorize the content. CountVectorizer documents stop_words as 'english',
# a list or None, so the set is passed as a (sorted) list.
vectorizer = CountVectorizer(max_features=1000, lowercase=True, stop_words=sorted(stop_words_spanish))
X = vectorizer.fit_transform(df1['cleaned_content'])

# apply Latent Dirichlet Allocation
num_topics = 8  # select number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# assign topic probabilities to each article
df1['topic_probabilities'] = lda.transform(X).tolist()

# use the pyLDAvis library to save, as an HTML file, a visualization of the
# relevance of the topics and of the words associated with them
pyLDAvis.enable_notebook()
prueba = pyLDAvis.lda_model.prepare(lda, X, vectorizer, mds='tsne', n_jobs=1)
pyLDAvis.save_html(prueba, "2008-2011_tsne.html")


As a result, we have for each article the probability of it belonging to each of the 8 topics we modeled.

In [21]:
# Show each article's title next to its list of topic probabilities
df1.loc[:, ['title', 'topic_probabilities']]

Unnamed: 0,title,topic_probabilities
4,"""Ahora se incauta más droga porque hay más tar...","[0.6126970146102731, 0.0004909926346014362, 0...."
6,"""El enemigo es el narcotráfico, no el Frente P...","[0.0018708806101210432, 0.33596309539764335, 0..."
14,"""La cosa no es con ustedes"": el mensaje que pu...","[0.0027835253492482083, 0.002783806147025475, ..."
21,"""Por la Asignación Universal por Hijo aumentar...","[0.0018409389082352148, 0.9871139737318255, 0...."
31,Acribillan a familia en venganza por la muerte...,"[0.411420612680842, 0.3371789524310031, 0.0007..."
...,...,...
582,La persecución del consumo de drogas según pas...,"[0.00016981241123813426, 0.0001698211561314628..."
583,La provincia apunta al narcotráfico,"[0.6377249216477957, 0.0012648295414492655, 0...."
584,Cocaína en bolsitas y fierros,"[0.00076834845069267, 0.0007683318082665085, 0..."
585,Asalto de cumbia en el penal de Rosario,"[0.2566642343511511, 0.14100446204355174, 0.15..."


## Topic modeling with LDA (2012-2016)

In [None]:
# Keep only the articles dated after the end of 2011. The explicit copy makes
# df2 an independent dataframe, so the column assignments below do not raise
# pandas' SettingWithCopyWarning.
start_date = "2011-12-31"
mask = (df['date'] > start_date)

df2 = df.loc[mask].copy()

# load the spaCy model for the Spanish language
nlp = spacy.load("es_core_news_sm")

# Stop words: a private copy of spaCy's Spanish list plus the corpus-specific
# noise tokens. They are listed explicitly so this cell does not depend on an
# earlier cell having added them to spaCy's shared STOP_WORDS set.
stop_words_spanish = set(spacy.lang.es.stop_words.STOP_WORDS) | {
    "año", "wwwpaginacomar", "software", "desarrollado", "gnulinux"
}


# Lemmatize Spanish text
def lemmatize(text):
    """Return `text` as a space-separated string of lemmas.

    The text is processed with the `nlp` pipeline; tokens whose lowercased
    form is in `stop_words_spanish` are left out.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if token.text.lower() not in stop_words_spanish)


df2['cleaned_content'] = df2['cleaned_content'].apply(lemmatize)

# vectorize the content. CountVectorizer documents stop_words as 'english',
# a list or None, so the set is passed as a (sorted) list.
vectorizer = CountVectorizer(max_features=1000, lowercase=True, stop_words=sorted(stop_words_spanish))
X = vectorizer.fit_transform(df2['cleaned_content'])

# apply Latent Dirichlet Allocation
num_topics = 8  # select number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# assign topic probabilities to each article
df2['topic_probabilities'] = lda.transform(X).tolist()

# save the pyLDAvis visualization of topics and their words as an HTML file
pyLDAvis.enable_notebook()
prueba = pyLDAvis.lda_model.prepare(lda, X, vectorizer, mds='tsne', n_jobs=1)
pyLDAvis.save_html(prueba, "2012-2016_tsne.html")


## Modeled according to media

El Litoral

In [None]:
# Keep only the El Litoral articles. The explicit copy makes df_el an
# independent dataframe, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
df_el = df[df["media"] == "litoral"].copy()

# Stop words: a private copy of spaCy's Spanish list plus the corpus-specific
# noise tokens. They are listed explicitly so this cell does not depend on an
# earlier cell having added them to spaCy's shared STOP_WORDS set.
stop_words_spanish = set(spacy.lang.es.stop_words.STOP_WORDS) | {
    "año", "wwwpaginacomar", "software", "desarrollado", "gnulinux"
}


# lemmatization (uses the `nlp` pipeline loaded in an earlier cell)
def lemmatize(text):
    """Return `text` as a space-separated string of lemmas.

    The text is processed with the `nlp` pipeline; tokens whose lowercased
    form is in `stop_words_spanish` are left out.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if token.text.lower() not in stop_words_spanish)


df_el['cleaned_content'] = df_el['cleaned_content'].apply(lemmatize)

# vectorize the content. CountVectorizer documents stop_words as 'english',
# a list or None, so the set is passed as a (sorted) list.
vectorizer = CountVectorizer(max_features=1000, lowercase=True, stop_words=sorted(stop_words_spanish))
X = vectorizer.fit_transform(df_el['cleaned_content'])

# apply Latent Dirichlet Allocation
num_topics = 8  # select number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# assign topic probabilities to each article
df_el['topic_probabilities'] = lda.transform(X).tolist()

# save the pyLDAvis visualization of topics and their words as an HTML file
pyLDAvis.enable_notebook()
prueba = pyLDAvis.lda_model.prepare(lda, X, vectorizer, mds='tsne', n_jobs=1)
pyLDAvis.save_html(prueba, "ellitoral.html")

Página 12

In [None]:
# Keep only the Página 12 articles. The explicit copy makes df_p12 an
# independent dataframe, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
df_p12 = df[df["media"] == "p12"].copy()

# Stop words: a private copy of spaCy's Spanish list plus the corpus-specific
# noise tokens. They are listed explicitly so this cell does not depend on an
# earlier cell having added them to spaCy's shared STOP_WORDS set.
stop_words_spanish = set(spacy.lang.es.stop_words.STOP_WORDS) | {
    "año", "wwwpaginacomar", "software", "desarrollado", "gnulinux"
}


# lemmatization (uses the `nlp` pipeline loaded in an earlier cell)
def lemmatize(text):
    """Return `text` as a space-separated string of lemmas.

    The text is processed with the `nlp` pipeline; tokens whose lowercased
    form is in `stop_words_spanish` are left out.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if token.text.lower() not in stop_words_spanish)


df_p12['cleaned_content'] = df_p12['cleaned_content'].apply(lemmatize)

# vectorize the content. CountVectorizer documents stop_words as 'english',
# a list or None, so the set is passed as a (sorted) list.
vectorizer = CountVectorizer(max_features=1000, lowercase=True, stop_words=sorted(stop_words_spanish))
X = vectorizer.fit_transform(df_p12['cleaned_content'])

# apply Latent Dirichlet Allocation
num_topics = 8  # select number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# assign topic probabilities to each article
df_p12['topic_probabilities'] = lda.transform(X).tolist()

# save the pyLDAvis visualization of topics and their words as an HTML file
pyLDAvis.enable_notebook()
prueba = pyLDAvis.lda_model.prepare(lda, X, vectorizer, mds='tsne', n_jobs=1)
pyLDAvis.save_html(prueba, "p12.html")