# Tópicos

In [1]:
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

## Loading data

In [2]:
def _read_text(path):
    """Return the full contents of the text file at `path`.

    The file is decoded as UTF-8 explicitly. Without an `encoding` argument
    `open()` falls back to the platform's locale encoding, which would garble
    accented characters on machines whose locale is not UTF-8.
    NOTE(review): assumes the cleaned files were written as UTF-8 - confirm
    against the step that produces data/out/.
    """
    with open(path, encoding='utf-8') as f:
        return f.read()


# One string per cleaned input file.
text1 = _read_text('data/out/cleaned_tomo_1.txt')
text5a = _read_text('data/out/cleaned_tomo_5a.txt')
text5b = _read_text('data/out/cleaned_tomo_5b.txt')
text7 = _read_text('data/out/cleaned_tomo_7.txt')

## Processing data

In [3]:
# Whole corpus as a single string: the four texts separated by one space each.
all_texts = ' '.join([text1, text5a, text5b, text7])

In [4]:
# Corpus-wide frequency of every whitespace-delimited token in all_texts.
general_counter = Counter()
general_counter.update(all_texts.split())

In [5]:
# Distinct tokens of the corpus. general_counter was built from the same
# all_texts.split() call, so its keys are exactly this set; reusing them
# avoids tokenizing the whole corpus a second time.
all_words = set(general_counter)

In [6]:
# Vocabulary size: number of distinct whitespace-delimited tokens in the corpus.
len(all_words)

18754

In [7]:
# The four texts kept as separate documents (one list entry per input file).
data = [text1, text5a, text5b, text7]

In [8]:
def get_topic(topic, feature_names, n_top_words):
    """Return the names of the highest-scoring features of one topic.

    Args:
        topic: array of per-feature scores exposing ``argsort()`` (e.g. one
            row of a fitted model's ``components_``).
        feature_names: sequence mapping a feature index to its name.
        n_top_words: how many names to return.

    Returns:
        A list of at most ``n_top_words`` feature names, ordered from the
        highest score to the lowest.
    """
    # argsort() is ascending, so reverse it to get the best features first.
    ranked = topic.argsort()[::-1]
    return [feature_names[idx] for idx in ranked[:n_top_words]]

def get_topics(model, feature_names, n_top_words):
    """Return the top words of every topic of a fitted model.

    Args:
        model: object whose ``components_`` attribute iterates over one score
            array per topic.
        feature_names: sequence mapping a feature index to its name.
        n_top_words: how many words to keep for each topic.

    Returns:
        A list with one entry per topic, in ``model.components_`` order; each
        entry is the list produced by ``get_topic`` for that topic.
    """
    return [
        get_topic(component, feature_names, n_top_words)
        for component in model.components_
    ]

In [9]:
def weight(topic):
    """Return the summed corpus frequency of the words in `topic`.

    Each word is looked up in the module-level ``general_counter``; a word
    that is absent from the counter contributes 0.
    """
    total = 0
    for word in topic:
        total += general_counter[word]
    return total

In [10]:
# Passed as max_features to the CountVectorizer (cap on the vocabulary size).
N_FEATURES = 1000
# Number of top-scoring words extracted for each topic.
N_TOP_WORDS = 10

In [11]:
# Documents fed to the vectorizer; this is the same list object as `data`, not a copy.
data_samples = data

In [17]:
%%time

# Term-count matrix for the documents in data_samples.
tf_vectorizer = CountVectorizer(
    max_df=0.95,  # float: ignore terms found in more than 95% of the documents
    min_df=2,     # int: ignore terms found in fewer than 2 documents
    max_features=N_FEATURES,
    # stop_words='english' is left disabled, so the default (None) applies.
)
tf = tf_vectorizer.fit_transform(data_samples)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases, so the
# cell runs on both. list() keeps the result a plain list, as before.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Fit one LDA model per topic count and collect every topic as a
# (weight, top_words, n_topics_of_its_model) tuple.
topics_list = []
for N_TOPICS in range(2, 5):
    print(N_TOPICS)  # progress indicator
    lda = LDA(
        n_components=N_TOPICS,
        learning_method='online',
        learning_offset=50.,
        random_state=0,  # fixed seed so reruns are reproducible
        n_jobs=2,
    )
    lda.fit(tf)
    topics = get_topics(lda, tf_feature_names, N_TOP_WORDS)
    topics_aux = [(weight(topic), topic, len(topics)) for topic in topics]
    topics_list.extend(topics_aux)

# Descending tuple order: highest weight first, ties broken by the word list
# and then by the topic count.
topics_list.sort(reverse=True)

2
3
4
CPU times: user 480 ms, sys: 72 ms, total: 552 ms
Wall time: 1.11 s


In [18]:
# Total number of (weight, words, n_topics) entries collected over all fitted models.
len(topics_list)

9

In [19]:
# Display every collected (weight, words, n_topics) entry in its current order.
topics_list

[(1010,
  ['mesas',
   'sexual',
   'desescalamiento',
   'cons',
   'étnicos',
   'visita',
   'reúne',
   'episcopal',
   'recomendaciones',
   'pensamiento'],
  4),
 (988,
  ['mesas',
   'sexual',
   'desescalamiento',
   'cons',
   'visita',
   'reúne',
   'étnicos',
   'episcopal',
   'recomendaciones',
   'contribución'],
  3),
 (986,
  ['mesas',
   'sexual',
   'pensamiento',
   'exploratorio',
   'afrodescendientes',
   'directas',
   'étnicos',
   'visita',
   'capacidades',
   'henry'],
  4),
 (986,
  ['mesas',
   'sexual',
   'pensamiento',
   'exploratorio',
   'afrodescendientes',
   'directas',
   'étnicos',
   'visita',
   'capacidades',
   'henry'],
  3),
 (986,
  ['mesas',
   'sexual',
   'pensamiento',
   'exploratorio',
   'afrodescendientes',
   'directas',
   'étnicos',
   'capacidades',
   'visita',
   'henry'],
  2),
 (925,
  ['exploratorio',
   'mesas',
   'confianz',
   'menores',
   'sexual',
   'henry',
   'posconflicto',
   'recuento',
   'trucción',
   'ust

In [15]:
# For the first 100 entries of topics_list, tally the third tuple field
# (the number of topics of the model each entry came from).
ccc = Counter(n_topics for _weight, _words, n_topics in topics_list[:100])

In [16]:
# (n_topics, number_of_entries) pairs, most frequent first.
ccc.most_common()

[(10, 10), (9, 9), (8, 8), (7, 7), (6, 6), (5, 5), (4, 4), (3, 3), (2, 2)]