# Inspect test and training set for user/row overlap

## Row overlap
There is overlap between reblogs and nonreblogs in training/test sets, but not overlap on individual reblog-nonreblog pairs (instances).

In [32]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
reblog_train, reblog_test = train_test_split(reblog_features, test_size=0.1, random_state=12345)
print(len(reblog_test))
# print(len(reblog_train))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
nonreblog_train, nonreblog_test = train_test_split(nonreblog_features, test_size=0.1, random_state=12345)
print(len(nonreblog_test))

print(len(reblog_train))

71267
71267
641403


In [33]:
# Search for duplicates
print(len(reblog_features))
print(len(reblog_features.drop_duplicates()))

712670
151491


In [56]:
# Search for duplicates
print(len(nonreblog_features))
print(len(nonreblog_features.drop_duplicates()))

712670
326610


In [37]:
# Search for intersection between training and test set (reblogs)
reblog_train_nodups = reblog_train.drop_duplicates()
reblog_test_nodups = reblog_test.drop_duplicates()
print(len(reblog_test_nodups))
print(len(reblog_train_nodups.merge(reblog_test_nodups, how='inner')))

58752
58432


In [39]:
# Search for intersection between training and test set (nonreblogs)
nonreblog_train_nodups = nonreblog_train.drop_duplicates()
nonreblog_test_nodups = nonreblog_test.drop_duplicates()
print(len(nonreblog_test_nodups))
print(len(nonreblog_train_nodups.merge(nonreblog_test_nodups, how='inner')))

58879
38078


In [57]:
# Search for exact intersection between reblog-nonreblog pairs

selected_cols = ['post_id', 'tumblog_id_follower', 'tumblog_id_followee']
selected_reblog_train = reblog_train[selected_cols]
selected_reblog_train.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']
selected_reblog_test = reblog_test[selected_cols]
selected_reblog_test.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']

selected_nonreblog_train = nonreblog_train[selected_cols]
selected_nonreblog_train.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']
selected_nonreblog_test = nonreblog_test[selected_cols]
selected_nonreblog_test.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']

train_matched = pd.concat([selected_reblog_train, selected_nonreblog_train], axis=1)
test_matched = pd.concat([selected_reblog_test, selected_nonreblog_test], axis=1)
print(train_matched.shape)
print(train_matched.columns)

(641403, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')


In [59]:
# Exact row overlap
train_matched_nodups = train_matched.drop_duplicates()
test_matched_nodups = test_matched.drop_duplicates()
print(len(train_matched_nodups))
print(len(test_matched_nodups))
print(len(train_matched_nodups.merge(test_matched_nodups, how='inner')))

641403
71267
0


In [31]:
# Count followers, followees in each
train_followers = set(reblog_train['tumblog_id_follower']).union(set(nonreblog_train['tumblog_id_follower']))
print(f'Number of followers in training set: {len(train_followers)}')

test_followers = set(reblog_test['tumblog_id_follower']).union(set(nonreblog_test['tumblog_id_follower']))
print(f'Number of followers in test set: {len(test_followers)}')

print(f'Overlap in followers in training set: {len(train_followers.intersection(test_followers))}')

Number of followers in training set: 705
Number of followers in test set: 599
Overlap in followers in training set: 598


In [32]:
# Count followees, followees in each
train_followees = set(reblog_train['tumblog_id_followee']).union(set(nonreblog_train['tumblog_id_followee']))
print(f'Number of followees in training set: {len(train_followees)}')

test_followees = set(reblog_test['tumblog_id_followee']).union(set(nonreblog_test['tumblog_id_followee']))
print(f'Number of followees in test set: {len(test_followees)}')

print(f'Overlap in followees in training set: {len(train_followees.intersection(test_followees))}')

Number of followees in training set: 33715
Number of followees in test set: 20414
Overlap in followees in training set: 20034


In [34]:
# See how many unique tuples of (follower, followee1, followee2) there are

train_tuples = set(zip(reblog_train['tumblog_id_follower'], reblog_train['tumblog_id_followee'], nonreblog_train['tumblog_id_followee']))
print(f'Number of unique tuples in training set: {len(train_tuples)}')

test_tuples = set(zip(reblog_test['tumblog_id_follower'], reblog_test['tumblog_id_followee'], nonreblog_test['tumblog_id_followee']))
print(f'Number of unique tuples in testing set: {len(test_tuples)}')


Number of unique tuples in training set: 252969
Number of unique tuples in testing set: 52317


# Create subsets of the profile-image included dataset for Yansen

In [60]:
# Check for overlap train/test
import pandas as pd
import os

# Load test set
test_set_dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', 'test_set', 'feature_tables')
test_set = (
    pd.read_csv(os.path.join(test_set_dirpath, 'reblog_features.csv')),
    pd.read_csv(os.path.join(test_set_dirpath, 'nonreblog_features.csv')),
    pd.read_csv(os.path.join(test_set_dirpath, 'ranking_labels.csv')),
)

len(test_set[0])

22843

In [61]:
# Load subsets of training set
subsets = {}

for n in [50000, 100000, 150000, 200000]:
    dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', str(n), 'feature_tables')
    subsets[n] = (
        pd.read_csv(os.path.join(dirpath, 'reblog_features.csv')),
        pd.read_csv(os.path.join(dirpath, 'nonreblog_features.csv')),
        pd.read_csv(os.path.join(dirpath, 'ranking_labels.csv')),
    )
    
print(len(subsets[50000][0]))

50000


In [19]:
# Check for reblog overlap

test_keys = {}
test_keys['reblog'] = set(zip(test_set[0]['tumblog_id_follower'], test_set[0]['tumblog_id_followee'], test_set[0]['post_id']))

for n in [50000, 100000, 150000, 200000]:
    sample_keys = set(zip(subsets[n][0]['tumblog_id_follower'], subsets[n][0]['tumblog_id_followee'], subsets[n][0]['post_id']))
    overlap = len(test_keys['reblog'].intersection(sample_keys))
    print(f'{n}: {overlap}')

50000: 10626
100000: 16083
150000: 18391
200000: 19253


In [65]:
# Search for exact intersection between reblog-nonreblog pairs

for n in [50000, 100000, 150000, 200000]:

    reblog_train = subsets[n][0]
    nonreblog_train = subsets[n][1]
    reblog_test = test_set[0]
    nonreblog_test = test_set[1]

    selected_cols = ['post_id', 'tumblog_id_follower', 'tumblog_id_followee']
    selected_reblog_train = reblog_train[selected_cols]
    selected_reblog_train.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']
    selected_reblog_test = reblog_test[selected_cols]
    selected_reblog_test.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']

    selected_nonreblog_train = nonreblog_train[selected_cols]
    selected_nonreblog_train.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']
    selected_nonreblog_test = nonreblog_test[selected_cols]
    selected_nonreblog_test.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']

    train_matched = pd.concat([selected_reblog_train, selected_nonreblog_train], axis=1)
    test_matched = pd.concat([selected_reblog_test, selected_nonreblog_test], axis=1)
    print(train_matched.shape)
    print(train_matched.columns)

    # Exact row overlap
    train_matched_nodups = train_matched.drop_duplicates()
    test_matched_nodups = test_matched.drop_duplicates()
    print(len(train_matched_nodups))
    print(len(test_matched_nodups))
    print(len(train_matched_nodups.merge(test_matched_nodups, how='inner')))
    print()

(50000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
50000
22843
0

(100000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
100000
22843
0

(150000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
150000
22843
0

(200000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
200000
22843
0



In [20]:
# Load profile-only dataset
import pandas as pd
import os

profile_dataset_dirpath = '/data/websci2020_yansen/icwsm2020_sample1k_profile_images/feature_tables/'
reblog_fpath = os.path.join(profile_dataset_dirpath, 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(profile_dataset_dirpath, 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

ranking_labels_fpath = os.path.join(profile_dataset_dirpath, 'ranking_labels.csv')
ranking_labels = pd.read_csv(ranking_labels_fpath)
print(len(ranking_labels))

228424
228424
228424


In [21]:
# Make test set for learning curve analysis
# test_set = (
#     reblog_features.sample(n=28424, random_state=50),
#     nonreblog_features.sample(n=28424, random_state=50),
#     ranking_labels.sample(n=28424, random_state=50)
# )

from sklearn.model_selection import train_test_split

test_set = {}
training_set = {}
training_set['reblog'], test_set['reblog'] = train_test_split(reblog_features, test_size=0.1, random_state=12345)
training_set['nonreblog'], test_set['nonreblog'] = train_test_split(nonreblog_features, test_size=0.1, random_state=12345)
training_set['ranking_labels'], test_set['ranking_labels'] = train_test_split(ranking_labels, test_size=0.1, random_state=12345)

# test_set = (
#     train_test_split(reblog_features, test_size=0.1, random_state=12345)[1],
#     train_test_split(nonreblog_features, test_size=0.1, random_state=12345)[1],
#     train_test_split(ranking_labels, test_size=0.1, random_state=12345)[1],
# )

# Save out
out_dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', 'test_set', 'feature_tables')
test_set['reblog'].to_csv(os.path.join(out_dirpath, 'reblog_features.csv'), index=False)
test_set['nonreblog'].to_csv(os.path.join(out_dirpath, 'nonreblog_features.csv'), index=False)
test_set['ranking_labels'].to_csv(os.path.join(out_dirpath, 'ranking_labels.csv'), index=False)

In [25]:
print(len(training_set['reblog']))
print(len(set(zip(
    training_set['reblog']['tumblog_id_follower'], 
    training_set['reblog']['tumblog_id_followee'],
    training_set['reblog']['post_id'],
))))

205581
58829


In [26]:
print(len(reblog_features))
print(len(set(zip(
    reblog_features['tumblog_id_follower'], 
    reblog_features['tumblog_id_followee'],
    reblog_features['post_id'],
))))

228424
59075


In [28]:
len(reblog_features)

228424

In [27]:
len(reblog_features.drop_duplicates())

59075

In [23]:
# Check for overlap

test_keys = {}
train_keys = {}

test_keys['reblog'] = set(zip(test_set['reblog']['tumblog_id_follower'], test_set['reblog']['tumblog_id_followee'], test_set['reblog']['post_id']))
train_keys['reblog'] = set(zip(training_set['reblog']['tumblog_id_follower'], training_set['reblog']['tumblog_id_followee'], training_set['reblog']['post_id']))
overlap = len(test_keys['reblog'].intersection(train_keys['reblog']))
overlap

# for n in [50000, 100000, 150000, 200000]:
#     sample_keys = set(zip(subsets[n][0]['tumblog_id_follower'], subsets[n][0]['tumblog_id_followee'], subsets[n][0]['post_id']))
#     overlap = len(test_keys['reblog'].intersection(sample_keys))
#     print(f'{n}: {overlap}')

19291

In [14]:
# Sample from the rest of the training set
samp = {}

for n in [50000, 100000, 150000, 200000]:
    samp[n] = (
        training_set['reblog'].sample(n=n, random_state=50),
        training_set['nonreblog'].sample(n=n, random_state=50),
        training_set['ranking_labels'].sample(n=n, random_state=50)
    )

In [15]:
# Save out

for n in [50000, 100000, 150000, 200000]:
    out_dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', str(n), 'feature_tables')
#     os.makedirs(out_dirpath)
    samp[n][0].to_csv(os.path.join(out_dirpath, 'reblog_features.csv'), index=False)
    samp[n][1].to_csv(os.path.join(out_dirpath, 'nonreblog_features.csv'), index=False)
    samp[n][2].to_csv(os.path.join(out_dirpath, 'ranking_labels.csv'), index=False)

# Change blog names to IDs on profile images for Yansen

In [1]:
# Load transformation
import pickle

with open('/data/websci2020_tumblr_identity/tmp/blogname2id.pkl', 'rb') as f:
    blogname2id = pickle.load(f)
    
len(blogname2id)

82169

In [6]:
# Load images
from IPython.core.debugger import set_trace
import os, shutil
from tqdm import tqdm_notebook as tqdm

current_imagepath = '/usr0/home/yansenwa/tumblr/data/processed/'
out_imagepath = '/data/websci2020_yansen/processed_profile_images/'

failures = []

for dirname in tqdm(os.listdir(current_imagepath)):
    dirpath = os.path.join(current_imagepath, dirname)
    
    # Create directory paths
    out_dirpath = os.path.join(out_imagepath, dirname)
    if not os.path.exists(out_dirpath):
        os.mkdir(out_dirpath)
            
    for fname in os.listdir(dirpath):
        src_fpath = os.path.join(dirpath, fname)
        blogname = fname.split('.')[0]
        if not blogname in blogname2id:
            set_trace()
        blogid = blogname2id[blogname]
        
       # Save out
        out_fpath = os.path.join(out_dirpath, f'{blogid}.png')
            
        try:
            shutil.copy(src_fpath, out_fpath)
        except OSError as e:
            failures.append((src_fapth, out_fpath))
            
len(failures)

HBox(children=(IntProgress(value=0, max=768), HTML(value='')))




0

# Create experimental dataset for Yansen

In [9]:
# Load data
import pandas as pd
import os

data_dirpath = '/data/websci2020_yansen'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

print(reblog_features.columns)
print(nonreblog_features.columns)

712670
712670
Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_terms_followee', 'sexual orientation_terms_follower',
       'sexual orie

In [6]:
drop_cols = [
    'processed_blog_description_follower',
 'processed_blog_description_followee',
]
reduced_reblogs = reblog_features.drop(columns=drop_cols)
reduced_reblogs.columns.tolist()

['post_id',
 'tumblog_id_follower',
 'tumblog_id_followee',
 'post_tags',
 'post_type',
 'post_note_count',
 'age_terms_follower',
 'age_terms_followee',
 'ethnicity/nationality_terms_follower',
 'ethnicity/nationality_terms_followee',
 'fandoms_terms_follower',
 'fandoms_terms_followee',
 'gender_terms_follower',
 'gender_terms_followee',
 'gender/sexuality_terms_follower',
 'gender/sexuality_terms_followee',
 'interests_terms_follower',
 'interests_terms_followee',
 'location_terms_follower',
 'location_terms_followee',
 'personality type_terms_follower',
 'personality type_terms_followee',
 'pronouns_terms_follower',
 'pronouns_terms_followee',
 'relationship status_terms_follower',
 'relationship status_terms_followee',
 'roleplay_terms_follower',
 'roleplay_terms_followee',
 'roleplay/fandoms_terms_follower',
 'roleplay/fandoms_terms_followee',
 'sexual orientation_terms_follower',
 'sexual orientation_terms_followee',
 'weight_terms_follower',
 'weight_terms_followee',
 'zodiac_t

In [7]:
drop_cols = [
    'processed_blog_description_follower',
 'processed_blog_description_followee',
]
reduced_nonreblogs = nonreblog_features.drop(columns=drop_cols)
reduced_nonreblogs.columns.tolist()

['post_id',
 'tumblog_id_follower',
 'tumblog_id_followee',
 'post_tags',
 'post_type',
 'post_note_count',
 'age_terms_follower',
 'age_terms_followee',
 'ethnicity/nationality_terms_follower',
 'ethnicity/nationality_terms_followee',
 'fandoms_terms_follower',
 'fandoms_terms_followee',
 'gender_terms_follower',
 'gender_terms_followee',
 'gender/sexuality_terms_follower',
 'gender/sexuality_terms_followee',
 'interests_terms_follower',
 'interests_terms_followee',
 'location_terms_follower',
 'location_terms_followee',
 'personality type_terms_follower',
 'personality type_terms_followee',
 'pronouns_terms_follower',
 'pronouns_terms_followee',
 'relationship status_terms_follower',
 'relationship status_terms_followee',
 'roleplay_terms_follower',
 'roleplay_terms_followee',
 'roleplay/fandoms_terms_follower',
 'roleplay/fandoms_terms_followee',
 'sexual orientation_terms_follower',
 'sexual orientation_terms_followee',
 'weight_terms_follower',
 'weight_terms_followee',
 'zodiac_t

In [8]:
# Save out
reduced_reblogs.to_csv(reblog_fpath, index=False)
reduced_nonreblogs.to_csv(nonreblog_fpath, index=False)

# Get examples from ICWSM 2020 dataset reblogs, nonreblogs

In [1]:
# Match blog names to tumblog IDs
import pandas as pd

# Load reblog info
reblog_info = pd.read_csv('/usr2/mamille2/tumblr/data/sample1k/reblogs_descs_annotated/reblogs_descs.tsv', sep='\t')
reblog_info.columns

# Load pickle
nonreblog_info = pd.read_pickle('/data/websci2020_tumblr_identity/tmp/nonreblog_info.pkl')
print(nonreblog_info.columns)
len(nonreblog_info)

Index(['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower',
       'blog_name_followee'],
      dtype='object')


30765997

In [5]:
list(reblog_info.columns)

['blog_description_followee',
 'blog_name_followee',
 'blog_title_followee',
 'blog_url_followee',
 'is_group_blog_followee',
 'is_private_followee',
 'created_time_epoch_followee',
 'updated_time_epoch_followee',
 'timezone_followee',
 'language_followee',
 'blog_classifier_followee',
 'tumblog_id_follower',
 'blog_description_follower',
 'blog_name_follower',
 'blog_title_follower',
 'blog_url_follower',
 'is_group_blog_follower',
 'is_private_follower',
 'created_time_epoch_follower',
 'updated_time_epoch_follower',
 'timezone_follower',
 'language_follower',
 'blog_classifier_follower',
 'post_id_follower',
 'activity_time_epoch_post_follower',
 'is_private_post_follower',
 'post_title_follower',
 'post_short_url_follower',
 'post_slug_follower',
 'post_type_follower',
 'post_caption_follower',
 'post_format_follower',
 'post_note_count_follower',
 'post_tags_follower',
 'post_content_follower',
 'reblogged_from_post_id',
 'reblogged_from_metadata',
 'created_time_epoch_post_follow

In [3]:
pd.set_option('display.max_rows', 10000)

In [6]:
# Load ICWSM 2020 dataset
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

712670
712670


In [7]:
reblog_features.columns

Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_te

In [10]:
pd.set_option('display.max_colwidth', -1)

In [12]:
selected_columns = ['post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']
has_tags = reblog_features[reblog_features['post_tags'].map(lambda x: len(x) > 2)]
has_tags.sample(200).loc[:, selected_columns]

Unnamed: 0,post_tags,processed_blog_description_followee,processed_blog_description_follower
710215,"['muscle', 'jock']","Permanent Chastity, Physical Modifications and Chained Captivity are just realities that ObjectD must learn accept as part of ITs life... IT has no other option now. - Master C The Blog: NO OPTIONS - NO WAY OUT - EXIST TO SERVE is about the journey ObjectD will experience into absolute Objectification and Dehumanization. This Blog is curated by ObjectD and the intended purpose of these pages is to allow ObjectD an outlet to express ITs fears, difficulties in adjustment (to ITs future reality) as well as ideas of how ObjectD (and others) could be TRAINED and then USED. Thus, it is hoped that insight will be gained from these pages (and ObjectD’s own thoughts) that will help in their own quests to become or to consider such property themselves. ObjectD does experience fear and dread about ITs future... enduring pain and suffering as well, while waiting for ITs destiny to begin... when the reality happens, the actual life ObjectD once had... will irrevocably vanish! Upon that transition, ObjectD shall never again possess the freedom to choose in ITs lifetime. And ITs memories of those days gone by are forfeit and forbidden for the rest of ITs days... ObjectD is simply a thing... IT is NOT REGARDED AS A PERSON, IT does NOT HAVE OPTIONS, IT is NOT OWED ANYTHING, IT does NOT HAVE ANY RIGHTS, IT is OWNED CHATTEL - PROPERTY! ObjectD will do whatever is demanded of IT. ITs identity and humanity will be forfeit. NO OPTIONS - NO WAY OUT - IT EXISTS TO SERVE - IT SERVES TO EXIST. ObjectD cannot avoid going down this path... IT is simply EQUIPMENT, LIVESTOCK and PROPERTY... nothing more... and IT will be treated as such. It will take time for ObjectD to accept that ITs life is based upon simple rules: OBEY, TOIL, and BE USED. ObjectD has a long hard road ahead of IT now. Enjoy the blog! - Master C",Kinky fucker that loves a wide variety of kinks from vanilla to heavy BDSM
327639,['witches'],mari | 26 | victorianist society6 | redbubble,revoltingly kind
16460,"['Lesbian', 'gay', 'same love', 'gaygirls', 'homo', 'girls', 'girls who kiss girls', 'boobs', 'lovewins', 'lgbtyouth', 'butch', 'lgbtq', 'girls who love girls']",Insta: Kell_sss// Snapchat: Kell_sss//Gay as fuck//This is my PERSONAL blog//27 year old//The Netherlands//,sophia. 19. nz
385871,['tattoos'],"bean | 22, 5’2 | stoner woc w an ed hw/sw: 250 | cw: 151.6 ugw: < 110 | pro recovery, not in it days binge free: 122 my posts / personal / faq / food diary",20 Aus
656358,['trophy boy'],"Permanent Chastity, Physical Modifications and Chained Captivity are just realities that ObjectD must learn accept as part of ITs life... IT has no other option now. - Master C The Blog: NO OPTIONS - NO WAY OUT - EXIST TO SERVE is about the journey ObjectD will experience into absolute Objectification and Dehumanization. This Blog is curated by ObjectD and the intended purpose of these pages is to allow ObjectD an outlet to express ITs fears, difficulties in adjustment (to ITs future reality) as well as ideas of how ObjectD (and others) could be TRAINED and then USED. Thus, it is hoped that insight will be gained from these pages (and ObjectD’s own thoughts) that will help in their own quests to become or to consider such property themselves. ObjectD does experience fear and dread about ITs future... enduring pain and suffering as well, while waiting for ITs destiny to begin... when the reality happens, the actual life ObjectD once had... will irrevocably vanish! Upon that transition, ObjectD shall never again possess the freedom to choose in ITs lifetime. And ITs memories of those days gone by are forfeit and forbidden for the rest of ITs days... ObjectD is simply a thing... IT is NOT REGARDED AS A PERSON, IT does NOT HAVE OPTIONS, IT is NOT OWED ANYTHING, IT does NOT HAVE ANY RIGHTS, IT is OWNED CHATTEL - PROPERTY! ObjectD will do whatever is demanded of IT. ITs identity and humanity will be forfeit. NO OPTIONS - NO WAY OUT - IT EXISTS TO SERVE - IT SERVES TO EXIST. ObjectD cannot avoid going down this path... IT is simply EQUIPMENT, LIVESTOCK and PROPERTY... nothing more... and IT will be treated as such. It will take time for ObjectD to accept that ITs life is based upon simple rules: OBEY, TOIL, and BE USED. ObjectD has a long hard road ahead of IT now. Enjoy the blog! - Master C",Kinky fucker that loves a wide variety of kinks from vanilla to heavy BDSM
283867,"['stim', 'sensory', 'recipe', 'not vegan', 'music']",im Alexander and im made out of HUNDREDS of small lizards. gay guy. 17.,"Kayla, 23, USA. She/Her. Biromantic asexual. Criminal Justice grad student. Amateur artist and writer. Feminist. Dedicated Sam!girl. Proud Hufflepuff. Die-hard Marvel fan."
509737,['i love you'],[thuhn-der] a loud rumbling or crashing noise heard after a lightning flash due to the expansion of rapidly heated air,Dreamer | Mother | Wife | Photographer | Nature Lover | Green Witch | Collector of Memories
561367,['love'],A suggestion blog,"Pastel | She/Her | Icon by Reuska | Header by animemes420 | Read the ""About Me"" to see what I post"
56976,"['misc: random', 'the world is hard right now', 'and if something makes me happy', ""i'm done feeling like i need to explain it or apologize for it to anyone""]",amanda. 30. hufflepuff. actual employed adult. entirely uninteresting person.,revoltingly kind
695363,"['gouache painting', 'painting']",There are no rules of architecture for a castle in the clouds.,"Things I’m into. Anime, cats, 🍨."


In [28]:
# cond = (has_tags['processed_blog_description_followee'].map(lambda x: isinstance(x, str) and 'cis' in x)) && (has_tags['processed_blog_description_follower'].map(lambda x: isinstance(x, str) and 'cis' in x))
# cond = [(isinstance(tup[0], str) and 'cis' in tup[0]) and (isinstance(tup[1], str) and 'cis' in tup[1]) for tup in zip(has_tags['processed_blog_description_follower'], has_tags['processed_blog_description_followee'])]
cond = [(isinstance(tup[0], str) and 'cis' in tup[0]) and (isinstance(tup[1], str) and 'cis' in tup[1]) for tup in zip(has_tags['processed_blog_description_follower'], has_tags['processed_blog_description_followee'])]
cis = has_tags[cond]
print(len(cis))
cis.sample(20).loc[:, selected_columns]

0


ValueError: 'a' must be greater than 0 unless no samples are taken

In [40]:
reblog_features.loc[[678786], ['post_id', 'post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']]

Unnamed: 0,post_id,post_tags,processed_blog_description_followee,processed_blog_description_follower
678786,180215628534,"['pokemon', 'type']","pokemom sun and moon, guzma, ssalbulre, ssalbug.","Hello there, I pretty much just reblog what I like on here, mainly things I find humorous, Pokemon stuff, and biology. I hope you enjoy your visit and have a wonderful day!"


In [41]:
reblog_info.loc[reblog_info['post_id_follower']==180215628534, ['post_id_follower','post_tags_follower', 'post_short_url_follower']]

Unnamed: 0,post_id_follower,post_tags_follower,post_short_url_follower
39310,180215628534,{},https://tmblr.co/ZItvBt2dri4Bs


In [50]:
selected_rows = [691129, 224020, 521221, 465263]
reblog_features.loc[selected_rows, ['post_id', 'post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']]

Unnamed: 0,post_id,post_tags,processed_blog_description_followee,processed_blog_description_follower
691129,179816353851,"['caylee you’ve been reading my fics since day 1 i think', 'plus you’re insanely talented!!!!!!!']","Kara, 19, Ravenclaw. Multi-fandom trash. I love Tom Holland. Sometimes I write.","Kaity. Main blog for @the-winchester-gospels-and-cas, where I write SPN and Marvel fanfics."
224020,174892897307,['mod jay'],"a blog focused on spreading lgbtqia+ positivity, providing a safe space, and fighting the stigma against us queer folks. byf | faq | get help | about us posts sorted by tag get involved in activism","• Caitlin • 25 year old alien living in the suburbs of Minnesota. very happy, very sad, very confused, very gay. doing my best!"
521221,175442981889,"['upper antelope canyon', 'US', 'usa', 'na', 'travel', 'traveling', 'explore', 'destinations', 'road trip', 'places', 'summer', 'wanderlust', 'landscape', 'photographers', 'travel photography']",? World through the lens of a traveler. | World | Travel | Journey | Vacation | Road Trip | Nature | Adventure | Wanderlust | Landscape | Photography | ?,Travel Blogger - Photography - Web Design - Virtual Assistant
465263,178055187751,"['c la reprise', 'gobelins', 'crfa21', 'green', 'swimming', 'fish', 'perso', 'back again']",animation student at Gobelins (CRFA21),wHY ARE YOU HERE My Art Blog v http://zelulae.tumblr.com/


In [52]:
post_id = 178055187751
reblog_info.loc[reblog_info['post_id_follower']==post_id, ['post_id_follower','post_tags_follower', 'post_short_url_follower']]

Unnamed: 0,post_id_follower,post_tags_follower,post_short_url_follower
7048,178055187751,{},https://tmblr.co/Z3_Mbk2bqweqd


In [33]:
[el for el in reblog_info.columns if 'url' in el]

['blog_url_followee',
 'blog_url_follower',
 'post_short_url_follower',
 'source_url_follower',
 'post_short_url_followee']

In [39]:
nonreblog_features.loc[[664073], ['post_id', 'post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']]

Unnamed: 0,post_id,post_tags,processed_blog_description_followee,processed_blog_description_follower
664073,179921063762,[],"My eyes are full of raindrops, I'm not crying, I'm taking care of the plants",Sarah. 19. she/hers. Trying to be friends with the local crows. I hoard queer media and pretty rocks.


# See how much of the ICWSM 2020 dataset has all 3 users with profile images

In [3]:
# Load which usernames have default images, etc

log_fpath = '/projects/websci2020_tumblr_identity/logs/scrape_info_2020-01-17T2037.txt'
with open(log_fpath) as f:
    lines = f.read().splitlines()
    
# Find separator line indices
default_idx = lines.index("Blog names with default images:")
other_idx = lines.index("Blog names with other images:")
# print(default_idx)
# print(other_idx)

default_blognames = lines[default_idx:other_idx]
other_blognames = lines[other_idx:]

print(len(default_blognames))
print(len(other_blognames))

9163
19863


In [1]:
# Load set of blog names

outpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/blog_names.txt'
with open(outpath, 'r') as f:
    blognames = f.read().splitlines()

In [4]:
error_blognames = set(blognames) - set(default_blognames) - set(other_blognames)
len(error_blognames)

5775

In [1]:
# Match blog names to tumblog IDs
import pandas as pd

# Load reblog info
reblog_info = pd.read_csv('/usr2/mamille2/tumblr/data/sample1k/reblogs_descs_annotated/reblogs_descs.tsv', sep='\t')
reblog_info.columns

# Load pickle
nonreblog_info = pd.read_pickle('/data/websci2020_tumblr_identity/tmp/nonreblog_info.pkl')
print(nonreblog_info.columns)
len(nonreblog_info)

# Select blog names from blog IDs
selected_columns = ['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower', 'blog_name_followee']
concatenated = pd.concat([reblog_info[selected_columns], nonreblog_info])

blog_ids_names = pd.DataFrame()
blog_ids_names['tumblog_id'] = concatenated['tumblog_id_follower'].tolist() + concatenated['tumblog_id_followee'].tolist()
blog_ids_names['blog_name'] = concatenated['blog_name_follower'].astype(str).tolist() + concatenated['blog_name_followee'].astype(str).tolist()
blog_ids_names.drop_duplicates(inplace=True)
len(blog_ids_names)

Index(['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower',
       'blog_name_followee'],
      dtype='object')


83276

In [3]:
blogid2name = blog_ids_names.set_index('tumblog_id')['blog_name'].to_dict()
blogname2id = {val: key for key, val in blogid2name.items()}
len(blogname2id)

82169

In [4]:
# Save out
import pickle

with open('/data/websci2020_tumblr_identity/tmp/blogname2id_yansen.pkl', 'wb') as f:
    pickle.dump(blogname2id, f)

In [29]:
# Load ICWSM 2020 dataset
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

712670
712670


In [30]:
len(reblog_features.drop_duplicates())

151491

In [12]:
# See how many rows have follower and both followees with profile images

from tqdm import tqdm_notebook as tqdm

n_blogs_profile_images = []

for i in tqdm(range(len(nonreblog_features))):
    
    follower_name = blogid2name[reblog_features.iloc[i]['tumblog_id_follower']]
    reblog_followee_name = blogid2name[reblog_features.iloc[i]['tumblog_id_followee']]
    nonreblog_followee_name = blogid2name[nonreblog_features.iloc[i]['tumblog_id_followee']]
    
    n_blogs = sum(1 for name in [follower_name, reblog_followee_name, nonreblog_followee_name] if name in other_blognames)
    n_blogs_profile_images.append(n_blogs)
    
print(len(n_blogs_profile_images))

HBox(children=(IntProgress(value=0, max=712670), HTML(value='')))


712670


In [13]:
for i in range(4):
    print(f"{i}: {n_blogs_profile_images.count(i)}")

0: 182156
1: 110906
2: 191184
3: 228424


In [15]:
len(n_blogs_profile_images)

712670

In [16]:
228424/712670

0.32051861310283863

## Save dataset with filled out profile images

In [17]:
reblog_features['blogname_follower'] = reblog_features['tumblog_id_follower'].map(lambda x: blogid2name[x])
reblog_features['blogname_followee'] = reblog_features['tumblog_id_followee'].map(lambda x: blogid2name[x])
reblog_features['n_blogs_profile_images'] = n_blogs_profile_images

nonreblog_features['blogname_follower'] = nonreblog_features['tumblog_id_follower'].map(lambda x: blogid2name[x])
nonreblog_features['blogname_followee'] = nonreblog_features['tumblog_id_followee'].map(lambda x: blogid2name[x])
nonreblog_features['n_blogs_profile_images'] = n_blogs_profile_images

In [18]:
# Load ranking labels

labels_fpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/feature_tables/ranking_labels.csv'
ranking_labels = pd.read_csv(labels_fpath)
ranking_labels

Unnamed: 0,ranking_label
0,1
1,0
2,0
3,1
4,1
5,0
6,1
7,0
8,0
9,0


In [20]:
ranking_labels = ranking_labels.iloc[:len(reblog_features)]
len(ranking_labels)

712670

In [21]:
ranking_labels['n_blogs_profile_images'] = n_blogs_profile_images
ranking_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ranking_label,n_blogs_profile_images
0,1,0
1,0,0
2,0,1
3,1,0
4,1,0
5,0,0
6,1,0
7,0,0
8,0,0
9,0,2


In [23]:
# Select profile image dataset
selected_reblog_features = reblog_features[reblog_features['n_blogs_profile_images']==3]
selected_nonreblog_features = nonreblog_features[nonreblog_features['n_blogs_profile_images']==3]
selected_ranking_labels = ranking_labels[ranking_labels['n_blogs_profile_images']==3]

In [24]:
# Save out

out_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k_profile_images/feature_tables/'

selected_reblog_features.to_csv(os.path.join(out_dirpath, 'reblog_features.csv'), index=False)
selected_nonreblog_features.to_csv(os.path.join(out_dirpath, 'nonreblog_features.csv'), index=False)
selected_ranking_labels.to_csv(os.path.join(out_dirpath, 'ranking_labels.csv'), index=False)

In [26]:
print(len(selected_reblog_features))
print(len(selected_nonreblog_features))
print(len(selected_ranking_labels))

228424
228424
228424


In [3]:
# Load
import pandas as pd
import os

dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k_profile_images/feature_tables/'

selected_reblog_features = pd.read_csv(os.path.join(dirpath, 'reblog_features.csv'))
selected_nonreblog_features = pd.read_csv(os.path.join(dirpath, 'nonreblog_features.csv'))

tumblog_ids = set(
    selected_reblog_features['tumblog_id_follower'].tolist() + selected_reblog_features['tumblog_id_followee'].tolist() + \
    selected_nonreblog_features['tumblog_id_follower'].tolist() + selected_nonreblog_features['tumblog_id_followee'].tolist()
)

len(tumblog_ids)

14177

In [4]:
selected_reblog_features.columns

Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_te

# Get usernames in ICWSM 2020 dataset (for profile image scraping)

In [1]:
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

# Get all tumblog_ids
tumblog_ids = set(
    reblog_features['tumblog_id_follower'].tolist() + reblog_features['tumblog_id_followee'].tolist() + \
    nonreblog_features['tumblog_id_follower'].tolist() + nonreblog_features['tumblog_id_followee'].tolist()
)

print(len(tumblog_ids))

712670
712670
34798


## Match tumblog IDs with names

In [2]:
# Load reblog info
reblog_info = pd.read_csv('/usr2/mamille2/tumblr/data/sample1k/reblogs_descs_annotated/reblogs_descs.tsv', sep='\t')
reblog_info.columns

# Load pickle
nonreblog_info = pd.read_pickle('/data/websci2020_tumblr_identity/tmp/nonreblog_info.pkl')
print(nonreblog_info.columns)
len(nonreblog_info)

Index(['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower',
       'blog_name_followee'],
      dtype='object')


30765997

In [3]:
# Select blog names from blog IDs
selected_columns = ['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower', 'blog_name_followee']
concatenated = pd.concat([reblog_info[selected_columns], nonreblog_info])

blog_ids_names = pd.DataFrame()
blog_ids_names['tumblog_id'] = concatenated['tumblog_id_follower'].tolist() + concatenated['tumblog_id_followee'].tolist()
blog_ids_names['blog_name'] = concatenated['blog_name_follower'].astype(str).tolist() + concatenated['blog_name_followee'].astype(str).tolist()
blog_ids_names.drop_duplicates(inplace=True)
len(blog_ids_names)

# Check that intersect
print(len(tumblog_ids))
len(tumblog_ids.intersection(set(blog_ids_names['tumblog_id'].tolist())))

blogid2name = blog_ids_names.set_index('tumblog_id')['blog_name'].to_dict()
blogname2id = {val: key for key, val in blogid2name.items()}

# Check that intersect
print(len(tumblog_ids))
len(tumblog_ids.intersection(set(blogid2name.keys())))

# Lookup tumblog ids
blog_names = [blogid2name[tid] for tid in tumblog_ids]
len(blog_names)

34798
34798


34798

In [5]:
# Save blog names out

outpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/blog_names.txt'
with open(outpath, 'w') as f:
    for name in blog_names:
        f.write(f'{name}\n')