In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from scipy.spatial.distance import cosine, euclidean
from tqdm import tqdm_notebook as tqdm

import os,sys,inspect
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# from .. import gaussian_mixture_cotrain
from gaussian_mixture_cotrain import GaussianMixtureCotrain

from pprint import pprint

# Run GMM clustering on blog descriptions

In [2]:
# Load data
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

(4617128, 300)

In [21]:
X = desc_emb[:500000,:]
clf = GaussianMixture(n_components=50, verbose=2, warm_start=True)
clf.fit(X)

Initialization 0


KeyboardInterrupt: 

In [6]:
outpath = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'

with open(outpath, 'wb') as f:
    pickle.dump(clf, f)

## Try to continue training a model

In [3]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [30]:
X = desc_emb[:500000,:]
clf.fit(X)

Initialization 0


KeyboardInterrupt: 

# Examine trained GMM

In [2]:
# Load data
# desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
# desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_avg.npy'
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_sum.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

(3992, 300)

In [3]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
# path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'
# path = '/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc_sum.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [17]:
# Load descriptions
# path = '/usr0/home/mamille2/tumblr/data/en_blog_descriptions.pkl'
path = '/usr0/home/mamille2/tumblr/data/desc_recent5.pkl'
desc_df = pd.read_pickle(path)

# descs = desc_df['parsed_blog_description'].tolist()
desc_toks = desc_df['tokenized_blog_description'].tolist()

In [5]:
# clf.bic(desc_emb[:500000,:]) # -615M for 20 comps, -652M for 50 comps
clf.bic(desc_emb)

14416957.655118914

In [6]:
clf.lower_bound_

554.49986363703295

In [7]:
# Get highest weights
wted_comps = np.argsort(clf.weights_)[::-1]
wted_comps

array([ 6, 40,  4,  0, 27, 28, 12, 39, 13,  7, 26,  8, 23, 14, 20, 49, 17,
       46, 30,  9, 31, 25, 32,  3, 42, 19, 36, 45, 48, 44, 33, 38,  2,  5,
       15, 21,  1, 16, 18, 24, 10, 22, 37, 34, 29, 11, 43, 41, 47, 35])

## Examine datapoints with highest probabilities assigned for each cluster; examine cluster assignments

In [8]:
# probs = clf.predict_proba(desc_emb[:500000,:])
probs = clf.predict_proba(desc_emb)
probs.shape

(3992, 50)

In [9]:
top_probs = np.argsort(probs, axis=0)[::-1]
top_probs.shape

(3992, 50)

In [22]:
def top_descs(probs, descs, k, order, vocab_file=None):
    """ Prints top k descriptions for each component"""
    
    top_probs = np.argsort(probs, axis=0)[::-1]
    
    if vocab_file: # dict [n_words]: [vocab]
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    
    for i in order:
        print("Component {}".format(i))
        col = top_probs[:,i]
#     for i, c in enumerate(top_probs.T):
        
        for el in col[:k]: 
            if vocab_file:
                print('\t' + ' '.join(d if d in vocab[100000] else '<unk>' for d in descs[el])) # for tokenized
            else:
                print('\t' + ' '.join(d if d in vocab[100000] else '<unk>' for d in descs[el])) # for tokenized
#             print('\t' + descs[el])
            
        print()

In [23]:
# Top descriptions from halfday co-training, sum
top_descs(probs, desc_toks, 20, wted_comps, '/usr0/home/mamille2/tumblr/data/halfday_top5_vocab100000.pkl')

Component 6
	<unk> <unk> <unk>
	.
	background casting data
	« peace of mind comes from within »
	<unk> on yahoo
	euronews on yahoo
	<unk> on yahoo
	<unk> on yahoo
	<unk>
	<unk>
	<unk> 이야기 그리고 <unk>
	soccer <unk> m ;)
	duel me
	exhausted but trying
	<unk>
	news from <unk>
	( ͡° ͜ʖ ͡° )
	<unk> on yahoo
	sloths in space
	<unk>

Component 40
	sebastian <unk> . sniper .
	i 'm dying squirtle
	ask for new <unk>
	polyvore fashion page now
	none of this is real
	my name is alex and
	stress is my <unk> .
	<unk> <unk> <unk> <unk>
	anal in the am
	<unk> dearest one .
	here for louis tomlinson
	<unk> <unk> <unk> <unk> az <unk>
	abc news videos on yahoo
	bottom of the <unk> sound
	pensamientos <unk> sobre <unk>
	<unk> are all mad <unk>
	hoe for seventeen and exo
	my name is andrew
	moved to <unk> ! ! !
	we write plays and books !

Component 4
	god someone just bring gabriel back .
	bisexual male that loves tumblr and kindness
	lion state of <unk> <unk> cali di <unk>
	not all of me will die    ( they

In [10]:
# Top descriptions from just descriptions (50 components)
top_descs(probs, descs, 20, wted_comps)

Component 39
	15 yo disabled/sped High School student. Graduating in 2019. I thought making a studyblr would be a good idea for some reason. icon by desferal
	they/them
	This is for the lovers of Dougie Poynter's Tattoo! If you love Dougie Poynter (who doesn't) then you will love this Tumblr! Follow us on Twitter @DougiesTattoo. Thanks!
	Sam | July 23 | Leo | "Light can always be found if one just remembers to turn on the light"
	Sydney. Single. God  < 3 Drama free. Worry free. Staying classy. NOT trashy. (:
	My name is Emma. Im 17 and live in London :D
	| Sadie a begginer cosplayer | Dating the love of my life lesspale-more-pale | next con & cosplay ~ Supercon (48days) Edward Elric (Fullmetal Alchmist)
	I mainly post pictures of oli and his old girlfriend Sarah and some memes so enjoy
	Inspired by the "Humans of New York" page. Get to know more about your VCU community.
	I have a slight obsession with pineapples/thorki/ destiel/ stony / stucky / johnlock /spidypool/ Star Wars/ avenger

	Hockey player
	Dyamond Mahone  < 3
	you die if  you  try
	Eye See All and  Lose It
	All the fan girl stuff than people around me won’t understand 👽
	cute  < 3
	Celebrating .. Love .. Sex ! Women !

Component 33
	Random thoughts!
	i post things i like
	Random Random Random
	Take it.
	I like stapling shit.
	Now Im Nothing
	I am live for me not you!
	my life in pictures
	Hi there.
	Welcome to Super Thierry's World
	Thoughts on Life
	cool stuff
	The life I live
	It's the little things.
	i'm sad
	by: Lynnda Mordoch
	it's good.
	Just random stuff
	Take a time out
	Random.

Component 38
	bored af
	Just stuff I like ok?
	this is all trash
	i'm the weird girl with a blog!..
	Pizza is life
	You make me lol
	im gay
	Just a fangirl.
	little and new to tumblr
	Me. . . I guess?
	Don't mess with me
	I don't know I'm just sad.
	i just wanna draw
	Just Be You LOL
	Love is......
	Just a person who likes to draw :)
	Not an ordinary human
	I love to draw and being funny
	what a piece of fuck this shit is

In [28]:
# Top descriptions from halfday co-training, averages
top_descs(probs, descs, 20, wted_comps)

Component 19
	*SFW* Lilith Age 19 Little age 4-7 Bi-Sexual. I like cute things that make me happy. I am taken by a loving daddy. 
	They always told me to chase my dreams What if I dream of being dead?
	hi im gamzee and i have an applesauce addiction. please use they/them or he/him pronouns when referring to me :o)
	i'm olivia and harry styles owns this ass
	follow me @janecrocker413
	Hi i'm Ember and use they/them.
	I take imagine and ship requests!
	Cause my life is dope and I do dope shit - Kanye West
	love this show so much~bellarke
	20✨-🇲🇽-CA🌞Sehun is my ult but Lay won't leave me alone
	▪ currently reading What We Left Behind by Robin Talley ▪ talk to me! ▫ ask box open ▪ icon by pinklilies ▪
	▪hi I'm Hikari and I'm a multifandom geek▪anime and gay ships are my life▪I do headcanons for my fandoms and the occasional cringey jokes▪on wattpad@Grell_Smutcliff(*^o^*)▪
	hey i'm kevin and i hope u have a Good Time here at shithead central
	I wish I was as cool as the void
	Love yourself 

## Find closest words in embedding space to cluster means
Doesn't really mean anything, as are averaging embeddings across all words in a post and 'dmitry' is closest to each cluster mean

In [24]:
path = '/usr0/home/mamille2/tumblr/data/desc_ftvecs100000.pkl'

with open(path, 'rb') as f:
    wd_embs = pickle.load(f)
    
len(wd_embs[100000])

100000

In [26]:
closests = []
dist = euclidean

for m in tqdm(clf.means_):
    closest_dist = np.infty
    closest_wd = None
    
    for wd, emb in wd_embs[100000].items():
        if dist(m,emb) < closest_dist:
            closest_wd = wd
            
    closests.append(closest_wd)
    
closests




['dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry']

# Sample blog descriptions for analysis

In [3]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

  interactivity=interactivity, compiler=compiler, result=result)


4617128


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description'],
      dtype='object')

In [5]:
pd.set_option('display.max_colwidth', 999)

In [9]:
s = data.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

Unnamed: 0,tumblog_id,tumblr_blog_name,tumblr_blog_title,tumblr_blog_url,timezone,tumblr_blog_description,parsed_blog_description
3625145,331658591.0,diggybangs,,https://diggybangs.tumblr.com/,Australia/Canberra,Diggy Bangs.,Diggy Bangs.
518383,246641476.0,lollycipe-blog,❤Marïa Manuela❤,https://lollycipe-blog.tumblr.com/,US/Eastern,13 anos 💋 l Portugal 💋 l Aveiro 💋 l Andreia ❤ l Xana ❤ l Helena '26 ❤ l Rafa ❤ l Tomás ❤ l Gabi ❤ l Telmo B.❤ l Edu ❤ l Ele ❤,13 anos 💋 l Portugal 💋 l Aveiro 💋 l Andreia ❤ l Xana ❤ l Helena '26 ❤ l Rafa ❤ l Tomás ❤ l Gabi ❤ l Telmo B.❤ l Edu ❤ l Ele ❤
3414511,107954785.0,missionsandjesuslikethings-blog,MissionandJesuslikethings,http://missionsandjesuslikethings-blog.tumblr.com/,US/Mountain,My journey to the great unknown of missions.,My journey to the great unknown of missions.
2919762,351700714.0,jm-dalla,∞,https://jm-dalla.tumblr.com/,US/Eastern,17 / greece / k-pop / movies,17 / greece / k-pop / movies
2488729,246323799.0,creseselia,ˏˋ claude zhao ˎˊ,https://creseselia.tumblr.com/,US/Eastern,"<p>""To see you smile is to feel the sun.""</p><p>➳ other account: @imagineclaude</p><p>Wattpad: @piplupin</p>","""To see you smile is to feel the sun.""➳ other account: @imagineclaudeWattpad: @piplupin"
1545902,14225153.0,pechetty-blog,Apoptosis,http://pechetty-blog.tumblr.com/,US/Eastern,Fresh Anonymity,Fresh Anonymity
1506853,300915956.0,janiraaclaveer,Angus Young💥,https://janiraaclaveer.tumblr.com/,US/Eastern,AC/DC😍,AC/DC😍
679111,310173390.0,chelo-tblr-by-paby,tu y yo así piensalo,https://chelo-tblr-by-paby.tumblr.com/,US/Eastern,gay love,gay love
1454846,199337636.0,3l3phantworld,Embrace every moment,https://3l3phantworld.tumblr.com/,US/Eastern,Laugh. Breathe. Believe.,Laugh. Breathe. Believe.
578457,362108892.0,yclibra17,Libra A,https://yclibra17.tumblr.com/,US/Eastern,<p>from Taiwan</p>,from Taiwan


## Blog descriptions from blogs that have text posts in halfday

In [11]:
text_posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/halfday_text.pkl')
print(len(text_posts))
text_posts.columns

3078642


Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'restrictedtags_200freq'],
      dtype='object')

In [17]:
# Blogs that also have text descriptions
tumblogs_allposts = text_posts['tumblog_id'].unique()
len(tumblogs_allposts)

726081

In [None]:
count_series = text_posts.groupby(['tumblog_id']).size()

In [23]:
tumblogs_2posts = count_series[count_series >= 2].index

In [25]:
tumblogs_5posts = count_series[count_series >= 5].index

In [27]:
tumblogs_10posts = count_series[count_series >= 10].index

In [13]:
data_text = data[data['tumblog_id'].isin(tumblogs_allposts)]
len(data_text)

20792

In [31]:
data_text2 = data[data['tumblog_id'].isin(tumblogs_2posts)]
len(data_text2)

11547

In [26]:
data_text = data[data['tumblog_id'].isin(tumblogs_5posts)]
len(data_text)

4432

In [29]:
data_text10 = data[data['tumblog_id'].isin(tumblogs_10posts)]
len(data_text10)

1797

In [30]:
# Sample from those who have at least 10 text posts in halfday

s = data_text10.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

Unnamed: 0,tumblog_id,tumblr_blog_name,tumblr_blog_title,tumblr_blog_url,timezone,tumblr_blog_description,parsed_blog_description
1568767,149821420.0,thenerdologist,"Few Even Think To Ask ""The Question""",https://thenerdologist.tumblr.com/,US/Eastern,<p>Use the pain of loss...</p><p>instagram: john_daniel_pena</p>,Use the pain of loss...instagram: john_daniel_pena
4324756,243213825.0,audiblewince,Leo,https://audiblewince.tumblr.com/,US/Eastern,"<p>im @audiblewince on most sites</p><p>awful artist/cosplayer</p><p>he/him</p><p>i love to scream abt my sporadic interests</p><p><a href=""https://audiblewince.tumblr.com/post/165012768777/about"">about</a> <a href=""https://ko-fi.com/A3073DHJ"">Buy Me a Coffee</a></p>",im @audiblewince on most sitesawful artist/cosplayerhe/himi love to scream abt my sporadic interestsabout Buy Me a Coffee
309030,322787978.0,redpalladiin,I’d die for Mike Wheeler,http://redpalladiin.tumblr.com/,US/Pacific,<p>meme man </p>,meme man
635577,234918721.0,ghostpai,,http://ghostpai.tumblr.com/,US/Eastern,hey,hey
123339,206644517.0,mattsmithdavidtennant,Whatever I like,https://mattsmithdavidtennant.tumblr.com/,US/Eastern,personal account,personal account
3313680,222004619.0,slitherioking-daniel,･:*:✼✿ Terrific,http://slitherioking-daniel.tumblr.com/,US/Eastern,<p>Martina</p><p>16 </p>,Martina16
2534749,331408628.0,professorcactus,:'),https://professorcactus.tumblr.com/,US/Eastern,<p>My name is meel. I'm a succulent enthusiast.</p>,My name is meel. I'm a succulent enthusiast.
1339682,226535409.0,wooden-cat,Not That Nice,https://wooden-cat.tumblr.com/,US/Eastern,shit,shit
3830198,246553170.0,makenziedoughnut,Nor Nor,https://makenziedoughnut.tumblr.com/,US/Eastern,I AM THE FLAMINGO QUEEN,I AM THE FLAMINGO QUEEN
3233745,270185245.0,mostlikelylauren,,https://mostlikelylauren.tumblr.com/,US/Eastern,"<a href=""https://mostlikelylauren.tumblr.com/tagged/me"">Lauren</a> | 21 | NC</p><p>Morgan | Soccer</p><p>IG: laurenellis04</p>",Lauren | 21 | NCMorgan | SoccerIG: laurenellis04


In [32]:
# Sample from those who have at least 2 text posts in halfday

s = data_text2.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

Unnamed: 0,tumblog_id,tumblr_blog_name,tumblr_blog_title,tumblr_blog_url,timezone,tumblr_blog_description,parsed_blog_description
1253256,292040519.0,royilz,Kush&wizdom,https://royilz.tumblr.com/,US/Eastern,<p>Kush and wizdomm</p>,Kush and wizdomm
2988811,245928069.0,foreignbrat,☽,http://foreignbrat.tumblr.com/,US/Pacific,twenty // california,twenty // california
2180905,256295412.0,mihdun,I will face god and walk backwards into hell,https://mihdun.tumblr.com/,US/Eastern,hello I am Midon. science.,hello I am Midon. science.
1682082,258016116.0,25island,Cowboy Kid - King of the Worms,https://25island.tumblr.com/,US/Eastern,❤💀Leader of the sad boys club 💀❤,❤💀Leader of the sad boys club 💀❤
2449032,142460101.0,julyrod11,If.,http://julyrod11.tumblr.com/,US/Eastern,<p>Hooyah Never Quit..</p><p>CO</p>,Hooyah Never Quit..CO
2526403,314518533.0,itsthekiks,It's Kiki,https://itsthekiks.tumblr.com/,US/Eastern,Mostly about anime and novels and kitties. A little about art. Over 30. Poly Bi Cis Lady.,Mostly about anime and novels and kitties. A little about art. Over 30. Poly Bi Cis Lady.
887235,311103498.0,m00ngal,ukulele players dont interact,http://m00ngal.tumblr.com/,US/Eastern,º hi im adrian/riley and i exist sometimes • trans enby • 14 • aries sun/cancer moon/virgo rising • gay as hell • ENTP • beginner witch º ~terfs/swerfs/ddlg/nsfw blogs stay away~,º hi im adrian/riley and i exist sometimes • trans enby • 14 • aries sun/cancer moon/virgo rising • gay as hell • ENTP • beginner witch º ~terfs/swerfs/ddlg/nsfw blogs stay away~
176349,307488570.0,agayboysblog69,The sea witch,https://agayboysblog69.tumblr.com/,US/Eastern,just a gay | male witch | learning and growing in the craft,just a gay | male witch | learning and growing in the craft
1637973,270359067.0,tanubear101,Hola.,https://tanubear101.tumblr.com/,US/Eastern,<p>CW :75 kgs. GW: 60 UGW: 55.</p><p>💘</p>,CW :75 kgs. GW: 60 UGW: 55.💘
855948,48975674.0,yougotafraninmee,Laughing With Anger,http://yougotafraninmee.tumblr.com/,US/Pacific,<p>Francella. 24. Los Angeles Area. IG&amp;Twitter: Yougotafraninme</p>,Francella. 24. Los Angeles Area. IG&Twitter: Yougotafraninme


# LSA on blog descriptions

## Get blog descriptions

In [2]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

  interactivity=interactivity, compiler=compiler, result=result)


4617128


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description'],
      dtype='object')

In [4]:
blog_descs = data['parsed_blog_description'].values
blog_descs.shape

(4617128,)

## Get tfidf matrix

In [5]:
tfidf = TfidfVectorizer(max_features=100000)
tfidf_mat = tfidf.fit_transform(blog_descs)
tfidf_mat.shape

(4617128, 100000)

## Do SVD

In [6]:
svd = TruncatedSVD(n_components=300)
svd_mat = svd.fit_transform(tfidf_mat)
svd_mat.shape

(4617128, 300)

In [7]:
svd.explained_variance_ratio_.sum() 
# 17% with 100 components over full vocab 
# 22% with 100 components over top 100k words
# 34% with 300 components over top 100k words

0.33799431595227575

In [8]:
# Words x components matrix
svd.components_.shape

(300, 100000)

In [9]:
# word features
feats = tfidf.get_feature_names()
len(feats)

100000

## Get ranked word features by component

In [10]:
top = np.argsort(svd.components_)[:100]
top.shape

(100, 100000)

In [11]:
top_sub = top[:, :100]
top_sub.shape

(100, 100)

In [12]:
feats2names = np.vectorize(lambda x: feats[x])
top_feats = feats2names(top_sub)
top_feats

array([['zlatanstrophywife', 'gerardwaay', 'qwertyuiop', ...,
        '単身パックの見積もりハテナのマーク', '車査定ウルフの高額買取', 'シアリス通販'],
       ['you', 'the', 'me', ..., 'because', 'something', 'little'],
       ['you', 'page', 'me', ..., 'from', 'someone', 'look'],
       ..., 
       ['14', 'old', 'years', ..., 'doing', 'made', 'loves'],
       ['free', 'up', 'idk', ..., 'shut', 'world', 'download'],
       ['happy', 'anything', 'never', ..., 'fm', 'down', 'date']], 
      dtype='<U26')

In [14]:
for i, factor in enumerate(top_feats):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

Factor 0
array(['zlatanstrophywife', 'gerardwaay', 'qwertyuiop',
       '将全球游戏玩家转换成我们用户的娱乐经营理念', '提供多种多样休闲娱乐产品给全球超过3000万会员', '走向世界',
       '牢牢坚持面向亚欧', 'ooie', 'onlineko', 'bbbbb', 'hhhhhhhhh', 'jufjfhg',
       'clintbxrtn', 'yuer', 'giribouy', 'sajlor', 'gentlemangos', 'eoq',
       'edit_tumblelog', 'theme_id', '636545', 'ohmygodstiel',
       'tiniagoldstein', 'lesbianmonarch', 'tralalalala', '21rj',
       'unfuckwithable', 'mionamisugi', '32929', 'hellloooooo', 'hfndb',
       '취향대로', 'ooiid', '大家过来学', 'hiiiiiiiiiiiiiiii', 'philsshirts',
       '19さい', 'fuckthepopulation', '并深受广大玩家喜爱', '随时为广大玩家提供最为即时的线上游戏娱乐项目',
       '致力于打造一个方便快捷的线上游戏娱乐平台', 'iloveyounamjoo', 'rainyjelena',
       'myhotcomments', 'leesoohyukswife', 'npi', 'discription1',
       'just4fun', 'lemonmenace', '这里是联系方式', 'qq123456', '你看我的联系方式是什么',
       '薬通販', 'ディプリックスシアリス20', 'ディプリックス', 'シアリス20', 'klasno', 'シアリス',
       'セット割引', 'ディプロ', 'undescription', 'tdh', 'bet官方中文网址', 'バイあグラ',
       '13913374256货到付币', '872050

In [13]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topwords.npy', top_feats)

## Get ranked documents by component

In [15]:
top_docs_idx = np.argsort(svd_mat.T) # Select 10 highest components
top_docs_idx.shape

(300, 4617128)

In [16]:
top_docs_idx = top_docs_idx[:100]
top_docs_idx.shape

(100, 4617128)

In [17]:
top_sub = top_docs_idx[:, :100]
top_sub.shape

(100, 100)

In [18]:
idx2docs = np.vectorize(lambda x: blog_descs[x])
top_docs = idx2docs(top_sub)
top_docs

array([['asdflkasjflk', 'zlatanstrophywife', 'zlatanstrophywife', ...,
        'Ooie *³*', '@onlineko', '@onlineko'],
       ['love you to The 🌜and🔙', 'Love me and you',
        'Forevor. Love. You. And. Me.', ..., 'Me and You.', 'Me and you',
        'You and Me'],
       ['& you love me', 'me love you', 'You+me=Love', ..., 'You & Me',
        'YOU&ME<3', 'You&Me'],
       ..., 
       ['14 years old', '14 years old.', '14 years old 👍', ...,
        'Im 14 Years Old.', '14 year old♥', 'a 14 year old'],
       ['Get Free', 'GET FREE', 'get free', ..., 'free', 'free! 弱虫 排球',
        'Free.'],
       ['I never post anything ¯\\_(ツ)_/¯', 'I never post anything :-)',
        'I will never post anything.', ..., 'Happy =D', 'happy', 'b happy']], 
      dtype='<U1541')

In [19]:
for i, factor in enumerate(top_docs):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

Factor 0
array(['asdflkasjflk', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife biiiiitch', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife biiiiitch', 'zlatanstrophywife', '圖檔儲存blog',
       'moepic17\u3000moepic18\u3000萌え連\u3000ネタ連\u3000詳細希望\u3000壁連\u3000和連\u3000詳細希望裏\u3000萌え裏\u3000詳細漫画裏\u3000壁裏',
       'asfasfasfsf', '@gerardwaay', '@gerardwaay ', '@gerardwaay',
       '@gerardwaay', '@gerardwaay ', '@gerardwaay', '@gerardwaay',
       '@gerardwaay', '@ge

In [20]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topdocs.npy', top_docs)