In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook as tqdm

from pprint import pprint

# Try modified co-training GMM clustering on blog descriptions, blog posts

In [8]:
import sklearn
sklearn.mixture.__file__

'/usr0/home/mamille2/anaconda3/lib/python3.6/site-packages/sklearn/mixture/__init__.py'

# Run GMM clustering on blog descriptions

In [2]:
# Load data
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

(4617128, 300)

In [21]:
X = desc_emb[:500000,:]
clf = GaussianMixture(n_components=50, verbose=2, warm_start=True)
clf.fit(X)

Initialization 0


KeyboardInterrupt: 

In [6]:
outpath = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'

with open(outpath, 'wb') as f:
    pickle.dump(clf, f)

## Try to continue training a model

In [3]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [30]:
X = desc_emb[:500000,:]
clf.fit(X)

Initialization 0


KeyboardInterrupt: 

# Examine trained GMM

In [2]:
# Load data
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

(4617128, 300)

In [3]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [21]:
# Load descriptions
path = '/usr0/home/mamille2/tumblr/data/en_blog_descriptions.pkl'

desc_df = pd.read_pickle(path)

descs = desc_df['parsed_blog_description'].tolist()

In [5]:
clf.bic(desc_emb[:500000,:]) # -615M for 20 comps, -652M for 50 comps

-652362305.68936038

In [6]:
clf.lower_bound_ #

682.17966700032866

In [25]:
# Get highest weights
wted_comps = np.argsort(clf.weights_)[::-1]
wted_comps

array([39, 10,  3, 21, 30, 28, 32, 16, 19, 37, 47,  0, 27, 41,  2, 20,  1,
       11, 12,  9,  4, 44, 13,  7, 15, 17, 34, 38, 43, 36, 29, 31, 25, 40,
        6, 33, 49,  5, 24, 42, 46,  8, 14, 48, 35, 18, 26, 22, 23, 45])

In [29]:
clf.means_

array([[-0.11212465, -0.11403156, -0.12360644, ..., -0.11187172,
        -0.10297821, -0.04798643],
       [-0.08569848, -0.11787281, -0.0838162 , ...,  0.07069382,
        -0.00814412, -0.14190927],
       [ 1.6362772 , -0.49236873,  0.44092634, ...,  0.40811399,
         0.03749909, -0.41353515],
       ..., 
       [-0.05246725, -0.14640116, -0.0706309 , ...,  0.05178944,
        -0.00546304, -0.1176688 ],
       [-0.09030975, -0.12794083, -0.0739253 , ..., -0.03673233,
        -0.05611355, -0.06036458],
       [-0.17816107, -0.12885563, -0.02081199, ...,  0.22268526,
        -0.02361585, -0.10965976]])

## Examine datapoints with highest probabilities assigned for each cluster; examine cluster assignments

In [5]:
probs = clf.predict_proba(desc_emb[:500000,:])
probs.shape

(500000, 50)

In [6]:
probs[0]

array([  1.20976052e-042,   0.00000000e+000,   1.52360335e-092,
         1.00000000e+000,   8.12129314e-036,   1.93793547e-069,
         0.00000000e+000,   7.75550953e-169,   0.00000000e+000,
         3.61510708e-125,   0.00000000e+000,   3.30080715e-115,
         6.68643809e-138,   5.76839621e-092,   0.00000000e+000,
         0.00000000e+000,   2.44262597e-117,   4.29854236e-114,
         0.00000000e+000,   1.96113941e-020,   0.00000000e+000,
         3.60823176e-010,   0.00000000e+000,   1.55309242e-245,
         3.32514947e-093,   0.00000000e+000,   0.00000000e+000,
         1.31511156e-077,   7.82321204e-056,   2.36434898e-065,
         1.01534780e-047,   0.00000000e+000,   1.86277688e-024,
         0.00000000e+000,   3.27560623e-028,   0.00000000e+000,
         0.00000000e+000,   1.74109317e-077,   0.00000000e+000,
         3.24621128e-020,   0.00000000e+000,   4.63822488e-172,
         3.14343716e-106,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+

In [11]:
top_probs = np.argsort(probs, axis=0)[::-1]
top_probs.shape

(500000, 50)

In [13]:
top_probs[0]

array([421963, 176385, 396035,  86066, 109848, 427292, 126971, 277704,
       295281, 177176,  64296, 112864, 311406, 238769, 103584, 360115,
       129957, 126491, 210507, 439597, 455651, 192403, 460845, 495616,
       442880, 134176, 295014, 342284, 475962,  72640, 249999, 464254,
       371641,  69367, 218296, 444975, 387219, 348249,  53492, 170092,
        32949, 274243, 320553, 359427, 412489, 334584, 499999,  80175,
        18167,  34078])

In [18]:
print(probs[421963,0])
print(probs[40067,0])
print(probs[176385,1])

1.0
1.0
1.0


In [28]:
def top_descs(probs, descs, k, order):
    """ Prints top k descriptions for each component"""
    
    top_probs = np.argsort(probs, axis=0)[::-1]
    
    for i in order:
        print("Component {}".format(i))
        col = top_probs[:,i]
#     for i, c in enumerate(top_probs.T):
        
        for el in col[:k]: 
            print('\t' + descs[el])
            
        print()

In [29]:
top_descs(probs, descs, 20, wted_comps)

Component 39
	cole | tanaka (ajin) fict. | he/him
	••PORN BLOG +18•• Basically I'm a fucking queen who loves lesbian sex ••always touching myself ••
	Alejandra/Mexico City/Bisexual/Egalitarian (not a Feminist)Just a blog where I post whatever I like
	A you tuber at:sevencutegirls123 Instagram:giveearthtomimi
	a blog for me to show my affection for certain ppl and things.                    @maknaecher is my main
	Younique Make-up Presenter from Kent. Follow me for make-up tips and tricks. Join Younique today! :)...https://www.facebook.com/YouniquelyYoursbyMariaAdamshttps://www.youtube.com/user/greatwhiteadams https://instagram.com/youniquelymaria78/https://twitter.com/youniquelymaria
	L 医者. River Healer. Art lover. Sleeping painter. Emotionally in flare. Half dead poet.  I like beauty and chaos. Mostly chaos.
	I am an 19 year old girl who loves pussy!  bi-sexual!  i want all the pussy i can get!!!  ALL AGES R WELCOME!!!!!
	26 years old // Canadian // FTM - Little Man BORN Via CSection 

## Find closest words (cosine distance) in embedding space to cluster means
Doesn't really mean anything, as are averaging embeddings across all words in a post and 'dmitry' is closest to each cluster mean

In [8]:
path = '/usr0/home/mamille2/tumblr/data/desc_ftvecs100000.pkl'

with open(path, 'rb') as f:
    wd_embs = pickle.load(f)
    
len(wd_embs[100000])

100000

In [10]:
closests = []

for m in tqdm(clf.means_):
    closest_dist = np.infty
    closest_wd = None
    
    for wd, emb in wd_embs[100000].items():
        if cosine(m,emb) < closest_dist:
            closest_wd = wd
            
    closests.append(closest_wd)




In [11]:
closests

['dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry',
 'dmitry']

# Sample blog descriptions for analysis

In [3]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

  interactivity=interactivity, compiler=compiler, result=result)


4617128


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description'],
      dtype='object')

In [5]:
pd.set_option('display.max_colwidth', 999)

In [9]:
s = data.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

Unnamed: 0,tumblog_id,tumblr_blog_name,tumblr_blog_title,tumblr_blog_url,timezone,tumblr_blog_description,parsed_blog_description
3625145,331658591.0,diggybangs,,https://diggybangs.tumblr.com/,Australia/Canberra,Diggy Bangs.,Diggy Bangs.
518383,246641476.0,lollycipe-blog,❤Marïa Manuela❤,https://lollycipe-blog.tumblr.com/,US/Eastern,13 anos 💋 l Portugal 💋 l Aveiro 💋 l Andreia ❤ l Xana ❤ l Helena '26 ❤ l Rafa ❤ l Tomás ❤ l Gabi ❤ l Telmo B.❤ l Edu ❤ l Ele ❤,13 anos 💋 l Portugal 💋 l Aveiro 💋 l Andreia ❤ l Xana ❤ l Helena '26 ❤ l Rafa ❤ l Tomás ❤ l Gabi ❤ l Telmo B.❤ l Edu ❤ l Ele ❤
3414511,107954785.0,missionsandjesuslikethings-blog,MissionandJesuslikethings,http://missionsandjesuslikethings-blog.tumblr.com/,US/Mountain,My journey to the great unknown of missions.,My journey to the great unknown of missions.
2919762,351700714.0,jm-dalla,∞,https://jm-dalla.tumblr.com/,US/Eastern,17 / greece / k-pop / movies,17 / greece / k-pop / movies
2488729,246323799.0,creseselia,ˏˋ claude zhao ˎˊ,https://creseselia.tumblr.com/,US/Eastern,"<p>""To see you smile is to feel the sun.""</p><p>➳ other account: @imagineclaude</p><p>Wattpad: @piplupin</p>","""To see you smile is to feel the sun.""➳ other account: @imagineclaudeWattpad: @piplupin"
1545902,14225153.0,pechetty-blog,Apoptosis,http://pechetty-blog.tumblr.com/,US/Eastern,Fresh Anonymity,Fresh Anonymity
1506853,300915956.0,janiraaclaveer,Angus Young💥,https://janiraaclaveer.tumblr.com/,US/Eastern,AC/DC😍,AC/DC😍
679111,310173390.0,chelo-tblr-by-paby,tu y yo así piensalo,https://chelo-tblr-by-paby.tumblr.com/,US/Eastern,gay love,gay love
1454846,199337636.0,3l3phantworld,Embrace every moment,https://3l3phantworld.tumblr.com/,US/Eastern,Laugh. Breathe. Believe.,Laugh. Breathe. Believe.
578457,362108892.0,yclibra17,Libra A,https://yclibra17.tumblr.com/,US/Eastern,<p>from Taiwan</p>,from Taiwan


## Blog descriptions from blogs that have text posts in halfday

In [11]:
text_posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/halfday_text.pkl')
print(len(text_posts))
text_posts.columns

3078642


Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'restrictedtags_200freq'],
      dtype='object')

In [17]:
# Blogs that also have text descriptions
tumblogs_allposts = text_posts['tumblog_id'].unique()
len(tumblogs_allposts)

726081

In [None]:
count_series = text_posts.groupby(['tumblog_id']).size()

In [23]:
tumblogs_2posts = count_series[count_series >= 2].index

In [25]:
tumblogs_5posts = count_series[count_series >= 5].index

In [27]:
tumblogs_10posts = count_series[count_series >= 10].index

In [13]:
data_text = data[data['tumblog_id'].isin(tumblogs_allposts)]
len(data_text)

20792

In [31]:
data_text2 = data[data['tumblog_id'].isin(tumblogs_2posts)]
len(data_text2)

11547

In [26]:
data_text = data[data['tumblog_id'].isin(tumblogs_5posts)]
len(data_text)

4432

In [29]:
data_text10 = data[data['tumblog_id'].isin(tumblogs_10posts)]
len(data_text10)

1797

In [30]:
# Sample from those who have at least 10 text posts in halfday

s = data_text10.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

Unnamed: 0,tumblog_id,tumblr_blog_name,tumblr_blog_title,tumblr_blog_url,timezone,tumblr_blog_description,parsed_blog_description
1568767,149821420.0,thenerdologist,"Few Even Think To Ask ""The Question""",https://thenerdologist.tumblr.com/,US/Eastern,<p>Use the pain of loss...</p><p>instagram: john_daniel_pena</p>,Use the pain of loss...instagram: john_daniel_pena
4324756,243213825.0,audiblewince,Leo,https://audiblewince.tumblr.com/,US/Eastern,"<p>im @audiblewince on most sites</p><p>awful artist/cosplayer</p><p>he/him</p><p>i love to scream abt my sporadic interests</p><p><a href=""https://audiblewince.tumblr.com/post/165012768777/about"">about</a> <a href=""https://ko-fi.com/A3073DHJ"">Buy Me a Coffee</a></p>",im @audiblewince on most sitesawful artist/cosplayerhe/himi love to scream abt my sporadic interestsabout Buy Me a Coffee
309030,322787978.0,redpalladiin,I’d die for Mike Wheeler,http://redpalladiin.tumblr.com/,US/Pacific,<p>meme man </p>,meme man
635577,234918721.0,ghostpai,,http://ghostpai.tumblr.com/,US/Eastern,hey,hey
123339,206644517.0,mattsmithdavidtennant,Whatever I like,https://mattsmithdavidtennant.tumblr.com/,US/Eastern,personal account,personal account
3313680,222004619.0,slitherioking-daniel,･:*:✼✿ Terrific,http://slitherioking-daniel.tumblr.com/,US/Eastern,<p>Martina</p><p>16 </p>,Martina16
2534749,331408628.0,professorcactus,:'),https://professorcactus.tumblr.com/,US/Eastern,<p>My name is meel. I'm a succulent enthusiast.</p>,My name is meel. I'm a succulent enthusiast.
1339682,226535409.0,wooden-cat,Not That Nice,https://wooden-cat.tumblr.com/,US/Eastern,shit,shit
3830198,246553170.0,makenziedoughnut,Nor Nor,https://makenziedoughnut.tumblr.com/,US/Eastern,I AM THE FLAMINGO QUEEN,I AM THE FLAMINGO QUEEN
3233745,270185245.0,mostlikelylauren,,https://mostlikelylauren.tumblr.com/,US/Eastern,"<a href=""https://mostlikelylauren.tumblr.com/tagged/me"">Lauren</a> | 21 | NC</p><p>Morgan | Soccer</p><p>IG: laurenellis04</p>",Lauren | 21 | NCMorgan | SoccerIG: laurenellis04


In [32]:
# Sample from those who have at least 2 text posts in halfday

s = data_text2.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

Unnamed: 0,tumblog_id,tumblr_blog_name,tumblr_blog_title,tumblr_blog_url,timezone,tumblr_blog_description,parsed_blog_description
1253256,292040519.0,royilz,Kush&wizdom,https://royilz.tumblr.com/,US/Eastern,<p>Kush and wizdomm</p>,Kush and wizdomm
2988811,245928069.0,foreignbrat,☽,http://foreignbrat.tumblr.com/,US/Pacific,twenty // california,twenty // california
2180905,256295412.0,mihdun,I will face god and walk backwards into hell,https://mihdun.tumblr.com/,US/Eastern,hello I am Midon. science.,hello I am Midon. science.
1682082,258016116.0,25island,Cowboy Kid - King of the Worms,https://25island.tumblr.com/,US/Eastern,❤💀Leader of the sad boys club 💀❤,❤💀Leader of the sad boys club 💀❤
2449032,142460101.0,julyrod11,If.,http://julyrod11.tumblr.com/,US/Eastern,<p>Hooyah Never Quit..</p><p>CO</p>,Hooyah Never Quit..CO
2526403,314518533.0,itsthekiks,It's Kiki,https://itsthekiks.tumblr.com/,US/Eastern,Mostly about anime and novels and kitties. A little about art. Over 30. Poly Bi Cis Lady.,Mostly about anime and novels and kitties. A little about art. Over 30. Poly Bi Cis Lady.
887235,311103498.0,m00ngal,ukulele players dont interact,http://m00ngal.tumblr.com/,US/Eastern,º hi im adrian/riley and i exist sometimes • trans enby • 14 • aries sun/cancer moon/virgo rising • gay as hell • ENTP • beginner witch º ~terfs/swerfs/ddlg/nsfw blogs stay away~,º hi im adrian/riley and i exist sometimes • trans enby • 14 • aries sun/cancer moon/virgo rising • gay as hell • ENTP • beginner witch º ~terfs/swerfs/ddlg/nsfw blogs stay away~
176349,307488570.0,agayboysblog69,The sea witch,https://agayboysblog69.tumblr.com/,US/Eastern,just a gay | male witch | learning and growing in the craft,just a gay | male witch | learning and growing in the craft
1637973,270359067.0,tanubear101,Hola.,https://tanubear101.tumblr.com/,US/Eastern,<p>CW :75 kgs. GW: 60 UGW: 55.</p><p>💘</p>,CW :75 kgs. GW: 60 UGW: 55.💘
855948,48975674.0,yougotafraninmee,Laughing With Anger,http://yougotafraninmee.tumblr.com/,US/Pacific,<p>Francella. 24. Los Angeles Area. IG&amp;Twitter: Yougotafraninme</p>,Francella. 24. Los Angeles Area. IG&Twitter: Yougotafraninme


# LSA on blog descriptions

## Get blog descriptions

In [2]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

  interactivity=interactivity, compiler=compiler, result=result)


4617128


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description'],
      dtype='object')

In [4]:
blog_descs = data['parsed_blog_description'].values
blog_descs.shape

(4617128,)

## Get tfidf matrix

In [5]:
tfidf = TfidfVectorizer(max_features=100000)
tfidf_mat = tfidf.fit_transform(blog_descs)
tfidf_mat.shape

(4617128, 100000)

## Do SVD

In [6]:
svd = TruncatedSVD(n_components=300)
svd_mat = svd.fit_transform(tfidf_mat)
svd_mat.shape

(4617128, 300)

In [7]:
svd.explained_variance_ratio_.sum() 
# 17% with 100 components over full vocab 
# 22% with 100 components over top 100k words
# 34% with 300 components over top 100k words

0.33799431595227575

In [8]:
# Words x components matrix
svd.components_.shape

(300, 100000)

In [9]:
# word features
feats = tfidf.get_feature_names()
len(feats)

100000

## Get ranked word features by component

In [10]:
top = np.argsort(svd.components_)[:100]
top.shape

(100, 100000)

In [11]:
top_sub = top[:, :100]
top_sub.shape

(100, 100)

In [12]:
feats2names = np.vectorize(lambda x: feats[x])
top_feats = feats2names(top_sub)
top_feats

array([['zlatanstrophywife', 'gerardwaay', 'qwertyuiop', ...,
        '単身パックの見積もりハテナのマーク', '車査定ウルフの高額買取', 'シアリス通販'],
       ['you', 'the', 'me', ..., 'because', 'something', 'little'],
       ['you', 'page', 'me', ..., 'from', 'someone', 'look'],
       ..., 
       ['14', 'old', 'years', ..., 'doing', 'made', 'loves'],
       ['free', 'up', 'idk', ..., 'shut', 'world', 'download'],
       ['happy', 'anything', 'never', ..., 'fm', 'down', 'date']], 
      dtype='<U26')

In [14]:
for i, factor in enumerate(top_feats):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

Factor 0
array(['zlatanstrophywife', 'gerardwaay', 'qwertyuiop',
       '将全球游戏玩家转换成我们用户的娱乐经营理念', '提供多种多样休闲娱乐产品给全球超过3000万会员', '走向世界',
       '牢牢坚持面向亚欧', 'ooie', 'onlineko', 'bbbbb', 'hhhhhhhhh', 'jufjfhg',
       'clintbxrtn', 'yuer', 'giribouy', 'sajlor', 'gentlemangos', 'eoq',
       'edit_tumblelog', 'theme_id', '636545', 'ohmygodstiel',
       'tiniagoldstein', 'lesbianmonarch', 'tralalalala', '21rj',
       'unfuckwithable', 'mionamisugi', '32929', 'hellloooooo', 'hfndb',
       '취향대로', 'ooiid', '大家过来学', 'hiiiiiiiiiiiiiiii', 'philsshirts',
       '19さい', 'fuckthepopulation', '并深受广大玩家喜爱', '随时为广大玩家提供最为即时的线上游戏娱乐项目',
       '致力于打造一个方便快捷的线上游戏娱乐平台', 'iloveyounamjoo', 'rainyjelena',
       'myhotcomments', 'leesoohyukswife', 'npi', 'discription1',
       'just4fun', 'lemonmenace', '这里是联系方式', 'qq123456', '你看我的联系方式是什么',
       '薬通販', 'ディプリックスシアリス20', 'ディプリックス', 'シアリス20', 'klasno', 'シアリス',
       'セット割引', 'ディプロ', 'undescription', 'tdh', 'bet官方中文网址', 'バイあグラ',
       '13913374256货到付币', '872050

In [13]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topwords.npy', top_feats)

## Get ranked documents by component

In [15]:
top_docs_idx = np.argsort(svd_mat.T) # Select 10 highest components
top_docs_idx.shape

(300, 4617128)

In [16]:
top_docs_idx = top_docs_idx[:100]
top_docs_idx.shape

(100, 4617128)

In [17]:
top_sub = top_docs_idx[:, :100]
top_sub.shape

(100, 100)

In [18]:
idx2docs = np.vectorize(lambda x: blog_descs[x])
top_docs = idx2docs(top_sub)
top_docs

array([['asdflkasjflk', 'zlatanstrophywife', 'zlatanstrophywife', ...,
        'Ooie *³*', '@onlineko', '@onlineko'],
       ['love you to The 🌜and🔙', 'Love me and you',
        'Forevor. Love. You. And. Me.', ..., 'Me and You.', 'Me and you',
        'You and Me'],
       ['& you love me', 'me love you', 'You+me=Love', ..., 'You & Me',
        'YOU&ME<3', 'You&Me'],
       ..., 
       ['14 years old', '14 years old.', '14 years old 👍', ...,
        'Im 14 Years Old.', '14 year old♥', 'a 14 year old'],
       ['Get Free', 'GET FREE', 'get free', ..., 'free', 'free! 弱虫 排球',
        'Free.'],
       ['I never post anything ¯\\_(ツ)_/¯', 'I never post anything :-)',
        'I will never post anything.', ..., 'Happy =D', 'happy', 'b happy']], 
      dtype='<U1541')

In [19]:
for i, factor in enumerate(top_docs):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

Factor 0
array(['asdflkasjflk', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife biiiiitch', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife', 'zlatanstrophywife',
       'zlatanstrophywife biiiiitch', 'zlatanstrophywife', '圖檔儲存blog',
       'moepic17\u3000moepic18\u3000萌え連\u3000ネタ連\u3000詳細希望\u3000壁連\u3000和連\u3000詳細希望裏\u3000萌え裏\u3000詳細漫画裏\u3000壁裏',
       'asfasfasfsf', '@gerardwaay', '@gerardwaay ', '@gerardwaay',
       '@gerardwaay', '@gerardwaay ', '@gerardwaay', '@gerardwaay',
       '@gerardwaay', '@ge

In [20]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topdocs.npy', top_docs)