# NMF


In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Topics

In [2]:
topics_df = pd.read_csv('data/out/nmf_winners.csv', names=['SIMILARITY', 'TOPIC', 'N_TOPICS'])

In [3]:
topics_df.head()

Unnamed: 0,SIMILARITY,TOPIC,N_TOPICS
0,0.648428,look walk looks asks smile head smiles walks ...,100
1,0.645495,look walk looks smile head asks smiles walks ...,136
2,0.642189,black hair white blue red brown dark green sh...,134
3,0.640779,asks ask smiles nods nod laughs looks takes t...,104
4,0.640779,asks ask smiles looks nods nod laughs smile w...,172


In [4]:
topics_df.shape

(20100, 3)

## Processing data

In [5]:
SIMILARITY_THRESHOLD = 0.5
INTERSECTION_THRESHOLD = 5

### Topics

In [6]:
top_topics_df = topics_df[topics_df.SIMILARITY >= SIMILARITY_THRESHOLD]

In [7]:
len(top_topics_df)

2865

In [8]:
%%time

indexes = []

for i, row in top_topics_df.iterrows():
    add = True
    words = set(row.TOPIC.split())
    previous_df = top_topics_df.loc[indexes]
    for previous_topic in previous_df.TOPIC:
        previous_words = set(previous_topic.split())
        intersect = words.intersection(previous_words)
        if len(intersect) >= INTERSECTION_THRESHOLD:
            add = False
            break
    if add:
        indexes.append(i)

CPU times: user 2.73 s, sys: 44.2 ms, total: 2.78 s
Wall time: 2.56 s


In [9]:
unique_topics_df = top_topics_df.loc[indexes]

In [10]:
len(unique_topics_df)

46

In [11]:
unique_topics_df.head()

Unnamed: 0,SIMILARITY,TOPIC,N_TOPICS
0,0.648428,look walk looks asks smile head smiles walks ...,100
2,0.642189,black hair white blue red brown dark green sh...,134
66,0.629752,room living bed floor stairs kitchen table cl...,102
126,0.622152,pack alpha head line clothes bags bag run lea...,151
127,0.621926,team game ball play playing won dance hit pla...,109


In [12]:
for i, (j, row) in enumerate(unique_topics_df.iterrows()):
    print(i + 1, '-', row.SIMILARITY)
    print(row.TOPIC.strip())
    print()

1 - 0.6484281229425437
look walk looks asks smile head smiles walks turn nod ask start hear say face hand nods starts grab turns

2 - 0.6421888141867366
black hair white blue red brown dark green shirt color wearing dress jeans pink light long purple grey blonde wear

3 - 0.6297519278197948
room living bed floor stairs kitchen table clothes walls hallway hall couch chair hospital closet dinner window left door wall

4 - 0.6221522230372238
pack alpha head line clothes bags bag run leave safe rest stuff running ready attack form hurt office mate fight

5 - 0.6219258375997435
team game ball play playing won dance hit played shot join rest room group high line good beat run end

6 - 0.6145783965007526
lips kiss hands eyes hand neck kissed body felt arms pulled mouth face kissing fingers chest bed waist tongue head

7 - 0.6011015693351099
car house drive door drove road driving seat home window got ride school bag minutes street going walk pulled way

8 - 0.5922285034294336
story reading wo

## Saving data

In [13]:
unique_topics_df.to_csv('data/out/nmf_unique_topics.csv', index=False)