In [2]:
from __future__ import print_function
from time import time

import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [3]:
n_top_words = 10


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [4]:
data_samples = pd.read_csv("profiles.csv")

In [5]:
text_data = []
for i in range(len(data_samples)):
    for j in range(10):
        cur_text = data_samples['essay'+str(j)][i]
        if (type(cur_text) != str):
            cur_text = str(cur_text)
        if cur_text is not "":
            try:
                text_data[i] = text_data[i] + (cur_text)
            except IndexError:
                text_data.append(cur_text)
                
        else: 
            continue

In [6]:
len(text_data)

59946

In [7]:
text_train, text_test = train_test_split(text_data, test_size=0.2)

In [8]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=1000,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(text_train)

Extracting tf features for LDA...


In [9]:
n_components = 10
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (47956, 1000))
lda = LatentDirichletAllocation(n_components=10, learning_method = 'online')
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=47956 and n_features=1000...
done in 288.521s.

Topics in LDA model:
Topic #0: music movies like food books love favorite shows tv rock
Topic #1: new san francisco bay years friends city work area ve
Topic #2: strong em br com http www youtube target watch _blank
Topic #3: fi nbsp sci psychology nannannannannannannannannannan fantasy action philosophy comedy computers
Topic #4: love life people amp world br art open things music
Topic #5: br music like food amp making things people love books
Topic #6: love like friends just im good music want family know
Topic #7: love friends good enjoy food family life music like new
Topic #8: br like don just really people things good think know
Topic #9: interests class href ilink br games music science rock star



In [10]:
lda.score(tf)

-37003748.16514067