In [15]:
import gensim
import string
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn.cluster import DBSCAN
from nltk.corpus import stopwords

In [2]:
def tokenize(text):
    words = [word.lower().strip(string.punctuation) for word in text.split()]
    words = [word for word in words if word]
    
    return words

In [3]:
data = pd.read_csv('train.csv')

In [4]:
data['question_text_norm'] = data['question_text'].apply(tokenize)

In [5]:
corpus = list(data['question_text_norm'].values)

In [6]:
vocab = Counter()
for sent in corpus:
    vocab.update(sent)

In [7]:
vocab = {v for v,c in vocab.most_common(10000)}

In [25]:
vocab -= set(stopwords.words('english'))

In [77]:
ft = gensim.models.Word2Vec(corpus, size=200, sg=1,max_vocab_size=300000)

In [78]:
X = np.zeros((len(vocab), 200))
id2word = {i:word for i, word in enumerate(vocab)}

for i,word in id2word.items():
    try:
        vec = ft[word]
    except (KeyError, ValueError):
        continue
    
    X[i] = vec


  


In [23]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [100]:
cluster = DBSCAN(eps=0.3, leaf_size=100, metric='cosine')
cluster.fit(X)

DBSCAN(algorithm='auto', eps=0.3, leaf_size=100, metric='cosine',
    metric_params=None, min_samples=5, n_jobs=None, p=None)

In [101]:
set(cluster.labels_)

{-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35}

In [99]:
cls = defaultdict(list)

for i, cl in enumerate(cluster.labels_):
    if cl != -1:
        cls[cl].append(id2word[i])

f = open('cluster.txt', 'w')
for cl in cls:
    f.write('### '+ str(cl) + ' ###\n')
    f.write('\n'.join(cls[cl]))
    f.write('\n\n')
f.close()

In [26]:
dictinary = gensim.corpora.Dictionary(corpus)

In [27]:
dictinary.filter_extremes(no_above=0.3, no_below=10)
dictinary.compactify()

In [28]:
print(dictinary)

Dictionary(35260 unique tokens: ['fours', 'rhinoplasty', 'ivf', 'mogul', 'wheatgrass']...)


In [40]:
'a' in dictinary.token2id

True

In [32]:
corpusbo = [dictinary.doc2bow(text) for text in corpus]

In [33]:
lda = gensim.models.LdaMulticore(corpusbo, 100, id2word=dictinary, passes=3, 
                                 chunksize=1000, iterations=10)

  diff = np.log(self.expElogbeta)


In [34]:
lda.print_topics(100)

[(0,
  '0.043*"a" + 0.039*"case" + 0.036*"ideas" + 0.033*"for" + 0.025*"in" + 0.024*"color" + 0.024*"suggest" + 0.022*"of" + 0.018*"iran" + 0.016*"replace"'),
 (1,
  '0.047*"to" + 0.037*"times" + 0.030*"it" + 0.030*"interview" + 0.028*"a" + 0.028*"difficult" + 0.026*"asked" + 0.026*"100" + 0.024*"stock" + 0.023*"in"'),
 (2,
  '0.089*"buy" + 0.074*"to" + 0.053*"a" + 0.033*"successful" + 0.025*"in" + 0.023*"least" + 0.020*"pass" + 0.020*"for" + 0.020*"can" + 0.019*"policy"'),
 (3,
  '0.063*"working" + 0.050*"medical" + 0.041*"care" + 0.039*"general" + 0.033*"yourself" + 0.031*"private" + 0.025*"large" + 0.022*"email" + 0.021*"factors" + 0.020*"for"'),
 (4,
  '0.090*"in" + 0.068*"for" + 0.055*"university" + 0.048*"study" + 0.041*"students" + 0.041*"class" + 0.024*"of" + 0.024*"technology" + 0.023*"how" + 0.022*"which"'),
 (5,
  '0.159*"other" + 0.070*"bad" + 0.046*"and" + 0.042*"car" + 0.040*"each" + 0.027*"are" + 0.025*"humans" + 0.023*"why" + 0.022*"of" + 0.021*"australia"'),
 (6,
  '0.