In [59]:
import pytumblr
import pandas as pd
import re
import random
from tqdm import tqdm_notebook as tqdm
import pickle

In [3]:
# OAuth

with open('../../oauth.txt') as f:
    lines = f.read().splitlines()
    
client = pytumblr.TumblrRestClient(lines[0], lines[1], lines[2], lines[3])

# Sample user ids

In [6]:
datapath = '../../data/halfday_text.pkl'

data = pd.read_pickle(datapath)
data.columns

Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'restrictedtags_200freq'],
      dtype='object')

In [9]:
data['source_url']

0                                                        NaN
1          http://thisbeautifulmelody.tumblr.com/post/127...
2          http://zoiekravitz.tumblr.com/post/26012912538...
3                                                        NaN
4                                                        NaN
5                                                        NaN
6                                                        NaN
7                                                        NaN
8          http://imagineyouricon.tumblr.com/post/4972798...
9                                                        NaN
10                        http://braddocksgirl14.tumblr.com/
11                                                       NaN
12                                                       NaN
13         http://perchu.tumblr.com/post/61024362215/fini...
14                                                       NaN
15                                                       NaN
16                      

In [35]:
p = extract_username(data.loc[8, 'source_url'])
p

'imagineyouricon'

In [43]:
data['username'] = data['source_url'].map(extract_username)
data['username']

0                         None
1          thisbeautifulmelody
2                  zoiekravitz
3                         None
4                         None
5                         None
6                         None
7                         None
8              imagineyouricon
9                         None
10             braddocksgirl14
11                        None
12                        None
13                      perchu
14                        None
15                        None
16                        None
17                        None
18             obsessive-ninja
19                        None
20                butt-requiem
21                        None
22                        None
23                        None
24                        None
25                        None
26                        None
27             deanprincesster
28                        None
29                        None
                  ...         
3078612                   None
3078613 

In [44]:
data.to_pickle('../../data/halfday_text_usernames.pkl')

In [45]:
usernames = data['username'].unique()
len(usernames)

77573

In [49]:
u1k = random.sample(list(usernames), 1000)
len(u1k)

1000

In [57]:
u10k = random.sample(list(usernames), 10000)
len(u10k)

10000

In [15]:
[u for u in data['source_url'] if (isinstance(u, str) and u.startswith('https://'))]

['https://askku-chan.tumblr.com/post/100773635251/those-eyes-she-had-no-idea-what-colour-they-were',
 'https://askku-chan.tumblr.com/post/100673104371/pxrtinglass-skomentowała-twój-postdo-i',
 'https://askku-chan.tumblr.com/post/105801252731/pxrtinglass-skomentowała-twój-postattaches-a',
 'https://askku-chan.tumblr.com/post/105806043481/pxrtinglass-skomentowała-twój-postmew-dear',
 'https://askku-chan.tumblr.com/post/106056057476/pxrtinglass-skomentowała-twój-postmew',
 'https://askku-chan.tumblr.com/post/109983318976/pxrtinglass-skomentowała-twój-postcasually',
 'https://askku-chan.tumblr.com/post/109983318976/pxrtinglass-skomentowała-twój-postcasually',
 'https://askku-chan.tumblr.com/post/123197769846/pxrtinglass-skomentowała-twój-postdelicately',
 'https://askku-chan.tumblr.com/post/125602836911/pxrtinglass-skomentowała-twój-postmew-luv',
 'https://allisonfedder.tumblr.com',
 'https://dumbshitwhocares.tumblr.com/post/49150690209/i-get-caught-off-guard-when-people-acknowledge-my',
 

In [41]:
u_p = re.compile(r'https?:\/\/(.*?)\.', re.IGNORECASE)

In [42]:
def extract_username(url):
    if isinstance(url, str):
        m = re.match(u_p, url)
        if not m:
            print(url)
        else:
            return m.group(1)
    
    else:
        return None

# Querying blog descriptions

In [60]:
desc = {}

# for name in ['otherkinfashionunder20', 'other-otherkin', 'kiramii']:
for name in tqdm(u1k[:100]):
    info = client.blog_info(name)
    if 'blog' in info:
        desc[name] = info['blog']['description']
    
print(len(desc))

outpath = '../../blog_descriptions.pkl'

with open(outpath, 'wb') as f:
    pickle.dump(list(desc.values()), f)
    
print("Wrote blog descriptions to {}".format(outpath))


87
Wrote blog descriptions to ../../blog_descriptions.pkl


# Querying tags (terf)

In [4]:
posts = client.tagged('terf', filter='text')

In [6]:
[posts[i]['body'] for i in range(20)]

KeyError: 'body'