# Tópicos

In [2]:
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

## Loading data

In [3]:
def read_transcript(episode):
    """Return the full text of ``data/out/cleaned_podcast_<episode>.txt``.

    The encoding is pinned to UTF-8 so that accented characters decode the
    same way on every platform instead of following the locale default.
    NOTE(review): assumes the cleaning step wrote the files as UTF-8 - confirm.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    path = 'data/out/cleaned_podcast_{}.txt'.format(episode)
    with open(path, encoding='utf-8') as f:
        return f.read()


# One string per cleaned transcript, files 2 through 7 (range end is exclusive).
text2, text3, text4, text5, text6, text7 = (
    read_transcript(episode) for episode in range(2, 8)
)

## Processing data

In [6]:
# The six transcripts glued into one string, separated by single spaces.
all_texts = ' '.join((text2, text3, text4, text5, text6, text7))

In [7]:
# Occurrence count of every whitespace-delimited token in the combined text;
# weight() looks topic words up in this counter.
general_counter = Counter(all_texts.split())

In [8]:
# Distinct whitespace-delimited tokens in the combined text.
all_words = set(all_texts.split())

In [9]:
# Number of distinct tokens in the combined text.
len(all_words)

4188

In [11]:
# One document per transcript file, in file order (2-7).
data = [text2, text3, text4, text5, text6, text7]

In [12]:
def get_topic(topic, feature_names, n_top_words):
    """Return the names of the ``n_top_words`` highest-valued entries of ``topic``.

    Args:
        topic: array-like exposing ``argsort()``; position ``i`` holds the
            value associated with ``feature_names[i]``.
        feature_names: sequence indexed by the positions found in ``topic``.
        n_top_words: how many names to return.

    Returns:
        A list of names ordered from the highest value to the lowest.
    """
    # argsort() is ascending, so flip it and keep the leading entries.
    descending = topic.argsort()[::-1]
    top_indices = descending[:n_top_words]
    return [feature_names[idx] for idx in top_indices]

def get_topics(model, feature_names, n_top_words):
    """Return the top-word list for every entry of ``model.components_``.

    Each entry of ``model.components_`` is treated as one topic and handed to
    ``get_topic`` together with ``feature_names`` and ``n_top_words``.

    Returns:
        A list with one word list per topic, in ``model.components_`` order.
    """
    return [
        get_topic(component, feature_names, n_top_words)
        for component in model.components_
    ]

In [13]:
def weight(topic):
    """Sum the ``general_counter`` counts of the words in ``topic``.

    Words missing from ``general_counter`` contribute 0, because it is a
    ``Counter``.
    """
    total = 0
    for word in topic:
        total += general_counter[word]
    return total

In [14]:
# Vocabulary cap passed to CountVectorizer as max_features.
N_FEATURES = 1000
# Number of top words kept to describe each LDA topic.
N_TOP_WORDS = 10

In [15]:
# Alias of `data` (the same list object, not a copy); this is what the vectorizer is fitted on.
data_samples = data

In [19]:
%%time

tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
#     stop_words='english',
)
tf = tf_vectorizer.fit_transform(data_samples)
tf_feature_names = tf_vectorizer.get_feature_names()

topics_list = []
for N_TOPICS in list(range(1, 11)):
    print(N_TOPICS)
    lda = LDA(
        n_components=N_TOPICS,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
        n_jobs=2,
    )
    lda.fit(tf)
    topics = get_topics(lda, tf_feature_names, N_TOP_WORDS)
    topics_aux = [(weight(topic), topic, len(topics)) for topic in topics]
    topics_list.extend(topics_aux)
topics_list.sort(reverse=True)

1
2
3
4
5
6
7
8
9
10
CPU times: user 1.19 s, sys: 352 ms, total: 1.54 s
Wall time: 4.05 s


In [20]:
# Total number of (weight, words, n_topics) entries pooled across all fitted models.
len(topics_list)

55

In [21]:
# Show every pooled entry; the list was sorted in descending order when it was built.
topics_list

[(397,
  ['niños',
   'gente',
   'niño',
   'momento',
   'hablar',
   'literatura',
   'veces',
   'bien',
   'lugar',
   'cuenta'],
  2),
 (397,
  ['niños',
   'gente',
   'niño',
   'momento',
   'hablar',
   'literatura',
   'veces',
   'bien',
   'cuenta',
   'país'],
  1),
 (383,
  ['niños',
   'niño',
   'hablar',
   'adultos',
   'momento',
   'lugar',
   'miedo',
   'bien',
   'silencio',
   'gente'],
  6),
 (383,
  ['niños',
   'niño',
   'hablar',
   'adultos',
   'lugar',
   'momento',
   'miedo',
   'bien',
   'silencio',
   'gente'],
  8),
 (371,
  ['niños',
   'niño',
   'hablar',
   'momento',
   'literatura',
   'adultos',
   'veces',
   'bien',
   'lugar',
   'libro'],
  9),
 (371,
  ['niños',
   'niño',
   'hablar',
   'momento',
   'literatura',
   'adultos',
   'veces',
   'bien',
   'lugar',
   'libro'],
  7),
 (369,
  ['niños',
   'niño',
   'hablar',
   'momento',
   'literatura',
   'adultos',
   'veces',
   'bien',
   'lugar',
   'ciencia'],
  3),
 (369,
  ['

In [21]:
# For the first 100 ranked entries, tally the third tuple field
# (the topic count of the model each entry came from).
ccc = Counter(n_topics for _weight, _words, n_topics in topics_list[:100])

In [22]:
# (n_topics, occurrences) pairs, most frequent first.
ccc.most_common()

[(10, 6),
 (11, 6),
 (9, 6),
 (12, 5),
 (13, 5),
 (14, 5),
 (18, 5),
 (19, 5),
 (8, 5),
 (7, 4),
 (15, 4),
 (17, 4),
 (20, 4),
 (4, 3),
 (5, 3),
 (6, 3),
 (80, 3),
 (16, 3),
 (100, 3),
 (40, 3),
 (120, 3),
 (2, 2),
 (3, 2),
 (200, 2),
 (160, 2),
 (1, 1),
 (140, 1),
 (180, 1),
 (60, 1)]