# Tópicos

In [1]:
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

## Loading data

In [2]:
# Read the four cleaned volume ("tomo") files into memory, one string each.
# The encoding is passed explicitly so that decoding of accented characters
# does not depend on the platform's locale default.
# NOTE(review): assumes the cleaned files were written as UTF-8 -- confirm
# against the step that produces data/out/.
with open('data/out/cleaned_tomo_1.txt', encoding='utf-8') as f:
    text1 = f.read()

with open('data/out/cleaned_tomo_5a.txt', encoding='utf-8') as f:
    text5a = f.read()

with open('data/out/cleaned_tomo_5b.txt', encoding='utf-8') as f:
    text5b = f.read()

with open('data/out/cleaned_tomo_7.txt', encoding='utf-8') as f:
    text7 = f.read()

## Processing data

In [3]:
# Concatenate every document into one corpus string, separated by single spaces.
all_texts = ' '.join((text1, text5a, text5b, text7))

In [4]:
# Corpus-wide frequency of every whitespace-separated token in `all_texts`.
general_counter = Counter()
general_counter.update(all_texts.split())

In [13]:
# Distinct whitespace-separated tokens found anywhere in the corpus.
all_words = {token for token in all_texts.split()}

In [14]:
len(all_words)

19617

In [5]:
# One document per source file, in the order the files were loaded.
data = [text1, text5a, text5b, text7]

In [6]:
def get_topic(topic, feature_names, n_top_words):
    """Return the feature names with the largest values in ``topic``.

    Args:
        topic: array-like exposing ``argsort()`` (e.g. a NumPy vector) with
            one score per feature.
        feature_names: sequence indexable by the positions of ``topic``.
        n_top_words: how many of the highest-scoring features to keep.

    Returns:
        A list of feature names ordered from highest to lowest score.
    """
    # argsort() is ascending; reverse it so the best-scoring index is first.
    descending = topic.argsort()[::-1]
    top_indices = descending[:n_top_words]
    return [feature_names[idx] for idx in top_indices]

def get_topics(model, feature_names, n_top_words):
    """Collect the top words of every topic stored in ``model.components_``.

    Args:
        model: object whose ``components_`` attribute iterates over one
            score vector per topic.
        feature_names: sequence mapping feature positions to words.
        n_top_words: number of words to keep for each topic.

    Returns:
        A list with one list of words per topic, in ``components_`` order.
    """
    return [
        get_topic(component, feature_names, n_top_words)
        for component in model.components_
    ]

In [7]:
def weight(topic):
    """Score a topic by the total corpus frequency of its words.

    Args:
        topic: iterable of words.

    Returns:
        The sum of ``general_counter[word]`` over the words in ``topic``.
    """
    total = 0
    for word in topic:
        total += general_counter[word]
    return total

In [29]:
# Vocabulary cap passed to CountVectorizer as max_features.
N_FEATURES = 1000
# Number of top-scoring words extracted for each LDA topic.
N_TOP_WORDS = 10

In [30]:
# Documents fed to the vectorizer; currently an alias for the full `data` list.
data_samples = data

In [34]:
%%time

tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
#     stop_words='english',
)
tf = tf_vectorizer.fit_transform(data_samples)
tf_feature_names = tf_vectorizer.get_feature_names()

topics_list = []
for N_TOPICS in list(range(4, 5)):
    print(N_TOPICS)
    lda = LDA(
        n_components=N_TOPICS,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
        n_jobs=2,
    )
    lda.fit(tf)
    topics = get_topics(lda, tf_feature_names, N_TOP_WORDS)
    topics_aux = [(weight(topic), topic, len(topics)) for topic in topics]
    topics_list.extend(topics_aux)
topics_list.sort(reverse=True)

4
CPU times: user 408 ms, sys: 48 ms, total: 456 ms
Wall time: 718 ms


In [35]:
len(topics_list)

4

In [36]:
topics_list

[(1076,
  ['sexual',
   'mesas',
   'visita',
   'informes',
   'pensamiento',
   'desescalamiento',
   'cons',
   'confianz',
   'reúne',
   'trucción'],
  4),
 (1040,
  ['sexual',
   'mesas',
   'pensamiento',
   'jep',
   'trucción',
   'confianz',
   'desescalamiento',
   'cons',
   'informes',
   'diferencial'],
  4),
 (956,
  ['informes',
   'reúne',
   'afrodescendientes',
   'menores',
   'cicr',
   'trucción',
   'visita',
   'desescalamiento',
   'sexual',
   'confianz'],
  4),
 (574,
  ['exploratorio',
   'henry',
   'usted',
   'acercamientos',
   'timoleón',
   'creo',
   'actas',
   'dije',
   'acta',
   'cicr'],
  4)]

In [21]:
# Among the 100 heaviest topics, count how many came from each model size.
ccc = Counter(n_topics for _weight, _words, n_topics in topics_list[:100])

In [22]:
ccc.most_common()

[(10, 6),
 (11, 6),
 (9, 6),
 (12, 5),
 (13, 5),
 (14, 5),
 (18, 5),
 (19, 5),
 (8, 5),
 (7, 4),
 (15, 4),
 (17, 4),
 (20, 4),
 (4, 3),
 (5, 3),
 (6, 3),
 (80, 3),
 (16, 3),
 (100, 3),
 (40, 3),
 (120, 3),
 (2, 2),
 (3, 2),
 (200, 2),
 (160, 2),
 (1, 1),
 (140, 1),
 (180, 1),
 (60, 1)]