In [1]:
import spacy
spacy.load('en_core_web_md')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
nltk.download('wordnet') # tiene relaciones semanticas. Tiene palabras ,etc. Vale solo para ingles.

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/octaviodelsueldo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
# from nltk.stem.wordnet import WordNetLemmatizer
# def get_lemma2(word):
#     return WordNetLemmatizer().lemmatize(word)

In [4]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/octaviodelsueldo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [7]:
import random
text_data = []
with open('dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['social', 'status', 'analysis', 'palin', 'email', 'network']
['partitioning', 'ordering', 'large', 'radiosity', 'computation']
['cloudy', 'modular', 'cloud', 'storage', 'system']
['comparison', 'frequency', 'domain', 'interpolation', 'implementation', 'transmitter']
['investigation', 'partial', 'query', 'proximity', 'search']
['uniform', 'resolution', '7-bit', 'flash', 'converter', 'wideband', 'signal', 'conversion']
['analysis', 'structure']
['journey', 'cystic', 'fibrosis']
['multi', 'level', 'conversion', 'circuit', 'multi', 'system']
['string', 'haptic', 'workbench']
['restructure', 'nest', 'relations', 'partition', 'normal']
['improve', 'power', 'delay', 'product', 'circuit', 'using', 'source', 'follower', 'output', 'stage']
['indexing', 'multi', 'dimensional', 'uncertain', 'arbitrary', 'probability', 'density', 'function']
['physical', 'layer', 'security', 'inter', 'session', 'interference', 'large', 'wireless', 'network']
['hybrid', 'stereo', 'camera', 'approach', 'synthesis', 

In [8]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [9]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [10]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

### Try 5 topics

In [11]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [12]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.026*"large" + 0.026*"computation" + 0.026*"ordering" + 0.026*"partitioning"')
(1, '0.036*"image" + 0.025*"base" + 0.024*"processing" + 0.013*"layer"')
(2, '0.028*"sigmadelta" + 0.028*"order" + 0.028*"modulator" + 0.028*"87.5-db"')
(3, '0.032*"multi" + 0.032*"search" + 0.017*"system" + 0.017*"design"')
(4, '0.032*"resolution" + 0.017*"power" + 0.017*"analysis" + 0.017*"circuit"')


In [13]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.022*"image" + 0.022*"network" + 0.022*"analysis" + 0.022*"processing"')
(1, '0.033*"multi" + 0.023*"system" + 0.013*"design" + 0.013*"resolution"')
(2, '0.030*"base" + 0.021*"power" + 0.021*"search" + 0.012*"circuit"')


In [14]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.068*"multi" + 0.036*"system" + 0.036*"level" + 0.036*"viral"')
(1, '0.030*"analysis" + 0.030*"circuit" + 0.030*"product" + 0.030*"stage"')
(2, '0.041*"design" + 0.041*"cellular" + 0.041*"adder" + 0.041*"automaton"')
(3, '0.039*"lossless" + 0.039*"coding" + 0.039*"least" + 0.039*"square"')
(4, '0.041*"search" + 0.041*"power" + 0.041*"clickthroughs" + 0.041*"joint"')
(5, '0.032*"resolution" + 0.032*"conversion" + 0.032*"probability" + 0.032*"wideband"')
(6, '0.037*"large" + 0.037*"fourth" + 0.037*"order" + 0.037*"microphone"')
(7, '0.066*"image" + 0.023*"camera" + 0.023*"plane" + 0.023*"physical"')
(8, '0.055*"restructure" + 0.055*"normal" + 0.055*"partition" + 0.055*"relations"')
(9, '0.057*"base" + 0.030*"synthesis" + 0.030*"analog" + 0.030*"form"')
