# LDA


In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Topics

In [2]:
# Load the LDA "winner" topics. Column names are supplied explicitly via
# `names`, so pandas treats the first line of the file as data, not as a header.
# NOTE(review): the preview shows an extra unnamed leading column, presumably
# the file's first column being used as the index - confirm against the file.
topics_df = pd.read_csv(
    'data/out/lda_winners.csv',
    names=['SIMILARITY', 'TOPIC', 'N_TOPICS'],
)

In [3]:
# Preview the first rows of the loaded topics table.
topics_df.head()

Unnamed: 0,SIMILARITY,TOPIC,N_TOPICS
0,0.672244,car drive seat road park driving bag drove ho...,191
1,0.664257,car drive house seat road driving drove home ...,182
2,0.653546,car house door home drive seat road driving d...,105
3,0.646933,team ball park dream game play line hit won r...,160
4,0.64108,game play ball playing hit played beat won sh...,157


In [4]:
# (rows, columns) of the loaded topics table.
topics_df.shape

(20100, 3)

## Processing data

In [5]:
# Minimum SIMILARITY a topic must reach (inclusive) to be kept for further processing.
SIMILARITY_THRESHOLD = 0.5
# Two topics sharing at least this many words are treated as duplicates,
# and only the first one encountered is kept.
INTERSECTION_THRESHOLD = 5

### Topics

In [6]:
# Keep only the topics whose similarity score is at least SIMILARITY_THRESHOLD.
top_topics_df = topics_df.loc[topics_df['SIMILARITY'] >= SIMILARITY_THRESHOLD]

In [7]:
# Number of topics that passed the SIMILARITY_THRESHOLD filter.
len(top_topics_df)

1951

In [8]:
%%time

indexes = []

for i, row in top_topics_df.iterrows():
    add = True
    words = set(row.TOPIC.split())
    previous_df = top_topics_df.loc[indexes]
    for previous_topic in previous_df.TOPIC:
        previous_words = set(previous_topic.split())
        intersect = words.intersection(previous_words)
        if len(intersect) >= INTERSECTION_THRESHOLD:
            add = False
            break
    if add:
        indexes.append(i)

CPU times: user 1.97 s, sys: 30.2 ms, total: 2 s
Wall time: 1.88 s


In [9]:
# Select the rows whose index labels survived de-duplication (all columns).
unique_topics_df = top_topics_df.loc[indexes, :]

In [10]:
# Number of topics left after dropping near-duplicates.
len(unique_topics_df)

58

In [11]:
# Preview the de-duplicated topics; the index keeps the original row labels.
unique_topics_df.head()

Unnamed: 0,SIMILARITY,TOPIC,N_TOPICS
0,0.672244,car drive seat road park driving bag drove ho...,191
3,0.646933,team ball park dream game play line hit won r...,160
5,0.640202,box picture bags stairs closet open bag shoes...,80
8,0.636975,room door open floor bed table small window w...,56
9,0.633821,says say asks looks ask smiles walks head nod...,155


In [12]:
# Print every unique topic as "<rank> - <similarity>" followed by its words
# and a blank separator line. Rank is 1-based and follows frame order.
for rank, (_label, topic_row) in enumerate(unique_topics_df.iterrows(), start=1):
    print(rank, '-', topic_row.SIMILARITY)
    print(topic_row.TOPIC.strip())
    print()

1 - 0.6722436207494797
car drive seat road park driving bag drove home ride house window door inside hand bags head street taking pick

2 - 0.646932983514499
team ball park dream game play line hit won running playing run ready fast air throw shot right pass set

3 - 0.6402015347323387
box picture bags stairs closet open bag shoes pack till pull door doors walk steps turn lights putting reach upstairs

4 - 0.6369746603676049
room door open floor bed table small window wall hall voice inside doors chair stairs eyes hallway glass kitchen head

5 - 0.6338205069076265
says say asks looks ask smiles walks head nods hand pulls smile starts walk laughs takes turns look hands gets

6 - 0.6331798812868554
hair black eyes blue white brown red dark green light blonde like shirt color grey wearing door pink skin jeans

7 - 0.6330450241102841
eyes face lips smile hands hand like body voice head felt mouth hair feel feeling fingers way moment small words

8 - 0.6283909457053908
book read books readi

## Saving data

In [13]:
# Persist the de-duplicated topics. `index=False` drops the original row
# labels, and to_csv writes a header row by default. The output layout thus
# differs from the input file, which was read with explicit `names` (no
# header): re-reading this file the same way would turn the header line into
# a data row.
unique_topics_df.to_csv('data/out/lda_unique_topics.csv', index=False)