# Investigate unique_reblog_pairs dataset

In [20]:
# Load unique_reblogs
import os
import pandas as pd

websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'
dataset_name = 'unique_reblog_pairs'
dataset_train_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'train', 'feature_tables')
dataset_test_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'test', 'feature_tables')

reblog_train_fpath = os.path.join(dataset_train_fpath, 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(dataset_train_fpath, 'nonreblog_features.csv')
labels_train_fpath = os.path.join(dataset_train_fpath, 'ranking_labels.csv')
train_reblogs = pd.read_csv(reblog_train_fpath)
train_nonreblogs = pd.read_csv(nonreblog_train_fpath)
train_labels = pd.read_csv(labels_train_fpath)

reblog_test_fpath = os.path.join(dataset_test_fpath, 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(dataset_test_fpath, 'nonreblog_features.csv')
labels_test_fpath = os.path.join(dataset_test_fpath, 'ranking_labels.csv')
test_reblogs = pd.read_csv(reblog_test_fpath)
test_nonreblogs = pd.read_csv(nonreblog_test_fpath)
test_labels = pd.read_csv(labels_test_fpath)

In [23]:
# Stats

# Dataset size
print(f'Train set size: {len(train_reblogs)}')
print(f'Test set size: {len(test_reblogs)}')
print()

# reblog overlap training/test
reblog_overlap = len(set(train_reblogs['post_id']).intersection(set(test_reblogs['post_id'])))
print(f'Reblog overlap: {reblog_overlap}')
print()

# follower overlap training/test
print(f'Number of train set followers: {len(set(train_reblogs["tumblog_id_follower"]))}')
print(f'Number of test set followers: {len(set(test_reblogs["tumblog_id_follower"]))}')
follower_overlap = len(set(train_reblogs['tumblog_id_follower']).intersection(set(test_reblogs["tumblog_id_follower"])))
print(f'Follower overlap: {follower_overlap}')
print()

# Check follower-followee pair overlap
train_reblog_user_pairs = set(zip(train_reblogs['tumblog_id_follower'], train_reblogs['tumblog_id_followee']))
test_reblog_user_pairs = set(zip(test_reblogs['tumblog_id_follower'], test_reblogs['tumblog_id_followee']))
print(f"Train reblog user pairs: {len(train_reblog_user_pairs)}")
print(f"Test reblog user pairs: {len(test_reblog_user_pairs)}")
print(f"Overlap: {len(train_reblog_user_pairs.intersection(test_reblog_user_pairs))}")

Train set size: 23211
Test set size: 2580

Reblog overlap: 0

Number of train set followers: 692
Number of test set followers: 433
Follower overlap: 419

Train reblog user pairs: 23211
Test reblog user pairs: 2580
Overlap: 0


# Create unique_reblog_pairs dataset

In [14]:
# Load ICWSM 2020 dataset

import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

ranking_labels = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'ranking_labels.csv')).iloc[:len(reblog_features)]
print(len(ranking_labels))

712670
712670
712670


In [16]:
# Drop reblog pair duplicates

print(len(reblog_features))
unique_pair_reblogs = reblog_features.drop_duplicates(subset=['tumblog_id_follower', 'tumblog_id_followee'], keep='first')
print(len(unique_pair_reblogs))

712670
25791


In [17]:
# Get matching nonreblogs
unique_pair_nonreblogs = nonreblog_features.loc[unique_pair_reblogs.index]
unique_pair_labels = ranking_labels.loc[unique_pair_reblogs.index]
print(len(unique_pair_nonreblogs))
print(len(unique_pair_labels))

25791
25791


In [18]:
# Split training/test
from sklearn.model_selection import train_test_split

train_reblogs, test_reblogs = train_test_split(unique_pair_reblogs, test_size=0.1, random_state=12345)
train_nonreblogs, test_nonreblogs = train_test_split(unique_pair_nonreblogs, test_size=0.1, random_state=12345)
train_labels, test_labels = train_test_split(unique_pair_labels, test_size=0.1, random_state=12345)
print(len(train_reblogs))
print(len(train_nonreblogs))
print(len(train_labels))
print(len(test_reblogs))
print(len(test_nonreblogs))
print(len(test_labels))

23211
23211
23211
2580
2580
2580


In [19]:
# Save out

websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'
dataset_name = 'unique_reblog_pairs'
dataset_train_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'train', 'feature_tables')
dataset_test_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'test', 'feature_tables')

if not os.path.exists(dataset_train_fpath):
    os.makedirs(dataset_train_fpath)
if not os.path.exists(dataset_test_fpath):
    os.makedirs(dataset_test_fpath)

reblog_train_fpath = os.path.join(dataset_train_fpath, 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(dataset_train_fpath, 'nonreblog_features.csv')
labels_train_fpath = os.path.join(dataset_train_fpath, 'ranking_labels.csv')
train_reblogs.to_csv(reblog_train_fpath, index=False)
train_nonreblogs.to_csv(nonreblog_train_fpath, index=False)
train_labels.to_csv(labels_train_fpath, index=False)

reblog_test_fpath = os.path.join(dataset_test_fpath, 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(dataset_test_fpath, 'nonreblog_features.csv')
labels_test_fpath = os.path.join(dataset_test_fpath, 'ranking_labels.csv')
test_reblogs.to_csv(reblog_test_fpath, index=False)
test_nonreblogs.to_csv(nonreblog_test_fpath, index=False)
test_labels.to_csv(labels_test_fpath, index=False)

# Investigate unique_reblogs dataset

In [2]:
# Load unique_reblogs
import os
import pandas as pd

websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'
dataset_name = 'unique_reblogs'
dataset_train_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'train', 'feature_tables')
dataset_test_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'test', 'feature_tables')

reblog_train_fpath = os.path.join(dataset_train_fpath, 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(dataset_train_fpath, 'nonreblog_features.csv')
labels_train_fpath = os.path.join(dataset_train_fpath, 'ranking_labels.csv')
train_reblogs = pd.read_csv(reblog_train_fpath)
train_nonreblogs = pd.read_csv(nonreblog_train_fpath)
train_labels = pd.read_csv(labels_train_fpath)

reblog_test_fpath = os.path.join(dataset_test_fpath, 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(dataset_test_fpath, 'nonreblog_features.csv')
labels_test_fpath = os.path.join(dataset_test_fpath, 'ranking_labels.csv')
test_reblogs = pd.read_csv(reblog_test_fpath)
test_nonreblogs = pd.read_csv(nonreblog_test_fpath)
test_labels = pd.read_csv(labels_test_fpath)

In [8]:
# Stats

# reblog overlap training/test
reblog_overlap = len(set(train_reblogs['post_id']).intersection(set(test_reblogs['post_id'])))
print(f'Reblog overlap: {reblog_overlap}')

# follower overlap training/test
print(f'Number of train set followers: {len(set(train_reblogs["tumblog_id_follower"]))}')
print(f'Number of test set followers: {len(set(test_reblogs["tumblog_id_follower"]))}')
follower_overlap = len(set(train_reblogs['tumblog_id_follower']).intersection(set(test_reblogs["tumblog_id_follower"])))
print(f'Follower overlap: {follower_overlap}')

Reblog overlap: 0
Number of train set followers: 702
Number of test set followers: 508
Follower overlap: 504


In [7]:
print(len(test_reblogs))
print(len(train_reblogs))

15150
136341


In [12]:
# Check follower-followee pair overlap

train_reblog_user_pairs = set(zip(train_reblogs['tumblog_id_follower'], train_reblogs['tumblog_id_followee']))
test_reblog_user_pairs = set(zip(test_reblogs['tumblog_id_follower'], test_reblogs['tumblog_id_followee']))
print(f"Train reblog user pairs: {len(train_reblog_user_pairs)}")
print(f"Test reblog user pairs: {len(test_reblog_user_pairs)}")
print(f"Overlap: {len(train_reblog_user_pairs.intersection(test_reblog_user_pairs))}")

Train reblog user pairs: 24655
Test reblog user pairs: 7462
Overlap: 6326


# Dataset statistics

In [1]:
# Load profile-only dataset
import pandas as pd
import os

profile_dataset_dirpath = '/data/websci2020_yansen/icwsm2020_sample1k_profile_images/feature_tables/'
reblog_fpath = os.path.join(profile_dataset_dirpath, 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(profile_dataset_dirpath, 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

# Unique followers
len(set(reblog_features['tumblog_id_follower']).union(set(nonreblog_features['tumblog_id_follower'])))

228424
228424


386

In [3]:
len(set(reblog_features['tumblog_id_follower']).union(set(nonreblog_features['tumblog_id_follower']))) + \
len(set(reblog_features['tumblog_id_followee']).union(set(nonreblog_features['tumblog_id_followee'])))

14179

In [5]:
# Load ICWSM 2020 dataset
profile_dataset_dirpath = '/data/websci2020_yansen/icwsm2020_sample1k/feature_tables/'
reblog_fpath = os.path.join(profile_dataset_dirpath, 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(profile_dataset_dirpath, 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

# Unique followers
n_unique_followers = len(set(reblog_features['tumblog_id_follower']).union(set(nonreblog_features['tumblog_id_follower'])))
print(f"Number of unique followers: {n_unique_followers}")
n_unique_followees = len(set(reblog_features['tumblog_id_followee']).union(set(nonreblog_features['tumblog_id_followee'])))
print(f"Total users: {n_unique_followers + n_unique_followees}")

712670
712670
Number of unique followers: 706
Total users: 34801


# Try removing all duplicate reblogs entirely

In [12]:
# Load ICWSM 2020 dataset

import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

ranking_labels = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'ranking_labels.csv')).iloc[:len(reblog_features)]
print(len(ranking_labels))

712670
712670
712670


In [13]:
# Drop reblog duplicates

unique_reblogs = reblog_features.drop_duplicates(keep='first')
len(unique_reblogs)

151491

In [14]:
unique_reblogs.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            712013, 712019, 712023, 712025, 712037, 712039, 712041, 712213,
            712356, 712606],
           dtype='int64', length=151491)

In [16]:
# Get matching nonreblogs
unique_nonreblogs = nonreblog_features.loc[unique_reblogs.index]
unique_labels = ranking_labels.loc[unique_reblogs.index]
print(len(unique_nonreblogs))
print(len(unique_labels))

151491
151491


In [17]:
# Split training/test
from sklearn.model_selection import train_test_split

train_reblogs, test_reblogs = train_test_split(unique_reblogs, test_size=0.1, random_state=12345)
train_nonreblogs, test_nonreblogs = train_test_split(unique_nonreblogs, test_size=0.1, random_state=12345)
train_labels, test_labels = train_test_split(unique_labels, test_size=0.1, random_state=12345)
print(len(train_reblogs))
print(len(train_nonreblogs))
print(len(train_labels))
print(len(test_reblogs))
print(len(test_nonreblogs))
print(len(test_labels))

136341
136341
136341
15150
15150
15150


In [20]:
# Save out

websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'
dataset_name = 'unique_reblogs'
dataset_train_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'train', 'feature_tables')
dataset_test_fpath = os.path.join(websci2020_data_dirpath, dataset_name, 'test', 'feature_tables')

if not os.path.exists(dataset_train_fpath):
    os.makedirs(dataset_train_fpath)
if not os.path.exists(dataset_test_fpath):
    os.makedirs(dataset_test_fpath)

reblog_train_fpath = os.path.join(websci2020_data_dirpath, 'unique_reblogs', 'train', 'feature_tables', 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(websci2020_data_dirpath, 'unique_reblogs', 'train', 'feature_tables', 'nonreblog_features.csv')
labels_train_fpath = os.path.join(websci2020_data_dirpath, 'unique_reblogs', 'train', 'feature_tables', 'ranking_labels.csv')
train_reblogs.to_csv(reblog_train_fpath, index=False)
train_nonreblogs.to_csv(nonreblog_train_fpath, index=False)
train_labels.to_csv(labels_train_fpath, index=False)

reblog_test_fpath = os.path.join(websci2020_data_dirpath, 'unique_reblogs', 'test', 'feature_tables', 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(websci2020_data_dirpath, 'unique_reblogs', 'test', 'feature_tables', 'nonreblog_features.csv')
labels_test_fpath = os.path.join(websci2020_data_dirpath, 'unique_reblogs', 'test', 'feature_tables', 'ranking_labels.csv')
test_reblogs.to_csv(reblog_test_fpath, index=False)
test_nonreblogs.to_csv(nonreblog_test_fpath, index=False)
test_labels.to_csv(labels_test_fpath, index=False)

# Create/load original ICWSM dataset split to test

In [2]:
# Load split that gave good results
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/websci2020_submission/'

# Load feature info
reblog_train_fpath = os.path.join(data_dirpath, 'train', 'feature_tables', 'reblog_features.csv')
reblog_train = pd.read_csv(reblog_train_fpath)
reblog_test_fpath = os.path.join(data_dirpath, 'test', 'feature_tables', 'reblog_features.csv')
reblog_test = pd.read_csv(reblog_test_fpath)

nonreblog_train_fpath = os.path.join(data_dirpath, 'train', 'feature_tables', 'nonreblog_features.csv')
nonreblog_train = pd.read_csv(nonreblog_train_fpath)
nonreblog_test_fpath = os.path.join(data_dirpath, 'test', 'feature_tables', 'nonreblog_features.csv')
nonreblog_test = pd.read_csv(nonreblog_test_fpath)

labels_train_fpath = os.path.join(data_dirpath, 'train', 'feature_tables', 'ranking_labels.csv')
labels_train = pd.read_csv(labels_train_fpath)
labels_test_fpath = os.path.join(data_dirpath, 'test', 'feature_tables', 'ranking_labels.csv')
labels_test = pd.read_csv(labels_test_fpath)

In [3]:
# Examine columns, etc

reblog_train.columns

Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_te

In [6]:
# Compare with reblog split data
train_reblog_features.columns

Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_te

In [9]:
all(reblog_train.columns == train_reblog_features.columns)

True

In [19]:
# Save out
data_dirpath = '/data/websci2020_tumblr_identity/websci2020_submission/'

# Load feature info
reblog_train_fpath = os.path.join(data_dirpath, 'train', 'feature_tables', 'reblog_features.csv')
reblog_train.to_csv(reblog_train_fpath, index=False)
reblog_test_fpath = os.path.join(data_dirpath, 'test', 'feature_tables', 'reblog_features.csv')
reblog_test.to_csv(reblog_test_fpath, index=False)

nonreblog_train_fpath = os.path.join(data_dirpath, 'train', 'feature_tables', 'nonreblog_features.csv')
nonreblog_train.to_csv(nonreblog_train_fpath, index=False)
nonreblog_test_fpath = os.path.join(data_dirpath, 'test', 'feature_tables', 'nonreblog_features.csv')
nonreblog_test.to_csv(nonreblog_test_fpath, index=False)

labels_train_fpath = os.path.join(data_dirpath, 'train', 'feature_tables', 'ranking_labels.csv')
labels_train.to_csv(labels_train_fpath, index=False)
labels_test_fpath = os.path.join(data_dirpath, 'test', 'feature_tables', 'ranking_labels.csv')
labels_test.to_csv(labels_test_fpath, index=False)

In [5]:
# Load reblog split dataset

websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'

reblog_train_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'train', 'feature_tables', 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'train', 'feature_tables', 'nonreblog_features.csv')
labels_train_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'train', 'feature_tables', 'ranking_labels.csv')
train_reblog_features = pd.read_csv(reblog_train_fpath)
train_nonreblog_features = pd.read_csv(nonreblog_train_fpath)
train_ranking_labels = pd.read_csv(labels_train_fpath)

reblog_test_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'test', 'feature_tables', 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'test', 'feature_tables', 'nonreblog_features.csv')
labels_test_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'test', 'feature_tables', 'ranking_labels.csv')
test_reblog_features = pd.read_csv(reblog_test_fpath)
test_nonreblog_features = pd.read_csv(nonreblog_test_fpath)
test_ranking_labels = pd.read_csv(labels_test_fpath)

# Create dataset with no reblog overlap (but user overlap allowed)

In [4]:
# Load ICWSM 2020 dataset

import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

ranking_labels = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'ranking_labels.csv')).iloc[:len(reblog_features)]
print(len(ranking_labels))

712670
712670
712670


In [12]:
ranking_labels = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'ranking_labels.csv'))
ranking_labels

Unnamed: 0,ranking_label
0,1
1,0
2,0
3,1
4,1
5,0
6,1
7,0
8,0
9,0


In [15]:
ranking_labels.loc[712659:]

Unnamed: 0,ranking_label
712659,0
712660,1
712661,0
712662,1
712663,0
712664,0
712665,1
712666,1
712667,0
712668,1


In [14]:
truncated = ranking_labels.iloc[:len(reblog_features)]
truncated
# print(len(ranking_labels))

Unnamed: 0,ranking_label
0,1
1,0
2,0
3,1
4,1
5,0
6,1
7,0
8,0
9,0


In [2]:
# Split dataset on reblogged posts
reblogs = set(tuple(line) for line in reblog_features.loc[:,['post_id', 'tumblog_id_follower']].values.tolist())
print(len(reblogs))

train_reblogs, test_reblogs = train_test_split(list(reblogs), test_size=0.1, random_state=12345) # already unique
print(len(train_reblogs))
print(len(test_reblogs))

# Make a dict for O(1) lookup of keys
train_reblogs = {k: 0 for k in train_reblogs}
len(train_reblogs)

151491
136341
15150


136341

In [4]:
# Create train, test datasets with no overlapping reblogs
from tqdm import tqdm_notebook as tqdm

train_row_mask = [(post_id, tumblog_id) in train_reblogs for post_id, tumblog_id in tqdm(list(zip(reblog_features['post_id'], reblog_features['tumblog_id_follower'])))]
train_reblog_features = reblog_features[train_row_mask]
train_nonreblog_features = nonreblog_features[train_row_mask]
print(len(train_reblog_features))
print(len(train_nonreblog_features))

test_row_mask = [not el for el in train_row_mask] # inverse of train_row_mask
test_reblog_features = reblog_features[test_row_mask]
test_nonreblog_features = nonreblog_features[test_row_mask]
print(len(test_reblog_features))
print(len(test_nonreblog_features))

train_ranking_labels = ranking_labels[train_row_mask]
print(len(train_ranking_labels))

test_ranking_labels = ranking_labels[test_row_mask]
print(len(test_ranking_labels))

HBox(children=(IntProgress(value=0, max=712670), HTML(value='')))


641332
641332
71338
71338
641332
71338


In [7]:
train_row_mask

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True

In [6]:
train_reblog_features

Unnamed: 0,post_id,tumblog_id_follower,tumblog_id_followee,post_tags,post_type,post_note_count,processed_blog_description_follower,processed_blog_description_followee,age_terms_follower,age_terms_followee,...,roleplay_terms_follower,roleplay_terms_followee,roleplay/fandoms_terms_follower,roleplay/fandoms_terms_followee,sexual orientation_terms_follower,sexual orientation_terms_followee,weight_terms_follower,weight_terms_followee,zodiac_terms_follower,zodiac_terms_followee
0,174799868299,154808845,297185146,[],photo,768.0,just for fun,18+ ONLY!!!!!!!!! - Man - 22 years old - Just ...,[],['22'],...,[],[],[],[],[],[],[],['fat'],[],[]
1,174547128114,154808845,78626941,[],photo,123559.0,just for fun,515k+ followers want nudity. This blog is NSFW...,[],['20'],...,[],[],[],['fan'],[],[],[],[],[],[]
2,176674124344,154808845,14046287,[],photo,113.0,just for fun,"This blog is a hobby, put here for your enjoym...",[],[],...,[],[],[],[],[],[],[],[],[],[]
3,174547693599,154808845,86613938,"['Angelica', 'Anjelica', 'Chuck', 'Shower', 's...",photo,855.0,just for fun,"You'd lick that. 28, taken. I like making gifs.",[],['28'],...,[],[],[],[],[],[],[],[],[],[]
4,174799787254,154808845,297185146,[],photo,7873.0,just for fun,18+ ONLY!!!!!!!!! - Man - 22 years old - Just ...,[],['22'],...,[],[],[],[],[],[],[],['fat'],[],[]
5,174799570434,154808845,256555152,[],photo,2415.0,just for fun,,[],[],...,[],[],[],[],[],[],[],[],[],[]
6,174799593264,154808845,197133618,[],photo,0.0,just for fun,The same guy with a different Blog - Simply Bl...,[],[],...,[],[],[],[],[],[],[],[],[],[]
7,174799933319,154808845,297185146,[],photo,81.0,just for fun,18+ ONLY!!!!!!!!! - Man - 22 years old - Just ...,[],['22'],...,[],[],[],[],[],[],[],['fat'],[],[]
8,176674249489,154808845,38599714,[],photo,69.0,just for fun,,[],[],...,[],[],[],[],[],[],[],[],[],[]
9,174796965366,196669962,360462369,[],photo,1007.0,There's no more beauty in this world! ***IF Y...,TRIGGER WARNING. This is my story about my men...,[],"['18', '25']",...,[],[],[],[],[],['Pansexual'],[],"['eating disorder', 'eating disorders']",[],[]


In [8]:
train_nonreblog_features

Unnamed: 0,post_id,tumblog_id_follower,tumblog_id_followee,post_tags,post_type,post_note_count,processed_blog_description_follower,processed_blog_description_followee,age_terms_follower,age_terms_followee,...,roleplay_terms_follower,roleplay_terms_followee,roleplay/fandoms_terms_follower,roleplay/fandoms_terms_followee,sexual orientation_terms_follower,sexual orientation_terms_followee,weight_terms_follower,weight_terms_followee,zodiac_terms_follower,zodiac_terms_followee
0,174800187072,154808845,157637220,[],photo,465.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
1,174547890587,154808845,157637220,[],photo,18267.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
2,176673811947,154808845,157637220,[],photo,731.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
3,174547892457,154808845,157637220,[],photo,45548.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
4,174800187072,154808845,157637220,[],photo,465.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
5,174800187072,154808845,157637220,[],photo,465.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
6,174800187072,154808845,157637220,[],photo,465.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
7,174800187072,154808845,157637220,[],photo,465.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
8,176673796747,154808845,157637220,[],photo,0.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
9,174797498102,196669962,165078771,[],photo,15493.0,There's no more beauty in this world! ***IF Y...,Damien | 18 | German | he / him | FtM | Fandom...,[],['18'],...,[],[],[],['Fandom'],[],[],[],[],[],[]


In [9]:
test_reblog_features

Unnamed: 0,post_id,tumblog_id_follower,tumblog_id_followee,post_tags,post_type,post_note_count,processed_blog_description_follower,processed_blog_description_followee,age_terms_follower,age_terms_followee,...,roleplay_terms_follower,roleplay_terms_followee,roleplay/fandoms_terms_follower,roleplay/fandoms_terms_followee,sexual orientation_terms_follower,sexual orientation_terms_followee,weight_terms_follower,weight_terms_followee,zodiac_terms_follower,zodiac_terms_followee
12,179594873032,345633829,172643227,[],quote,5333.0,,| 18 | sorozatfüggő | rocker | könyvmoly | bol...,[],['18'],...,[],[],[],[],[],[],[],[],[],[]
23,176674326539,154808845,108973656,[],photo,16960.0,just for fun,Straight • male • 22 18+ NSFW Ask me anything ...,[],['22'],...,[],[],[],[],[],['Straight'],[],[],[],[]
41,176674237909,154808845,14728649,[],photo,931.0,just for fun,Welcome to Sexual Feelings! I'm here to answer...,[],[],...,[],[],[],[],[],[],[],[],[],[]
43,174547175999,154808845,14726187,[],photo,1352.0,just for fun,"James Deen-Free Since November 29, 2015 Adults...",[],[],...,[],[],[],[],[],[],[],[],[],[]
46,176674707204,154808845,127457940,[],photo,500.0,just for fun,"Primarily Erotic Art. If you are under 18 yo,...",[],[],...,[],[],[],[],[],[],[],[],[],[]
47,176674329214,154808845,139393467,[],photo,14.0,just for fun,18+ // NSFW // adult content 26 • chubby • fem...,[],['26'],...,[],[],[],[],[],[],[],[],[],[]
50,180491120522,345633829,317337335,[],quote,5991.0,,(szín)játék az élet. Írj ha szükséged van val...,[],[],...,[],[],[],[],[],[],[],[],[],[]
63,176996120992,345633829,350681517,[],text,2798.0,,bye,[],[],...,[],[],[],[],[],[],[],[],[],[]
66,180491608882,345633829,392415833,[],quote,820.0,,~Bàrmi is legyen én mindig szeretni foglak 04....,[],[],...,[],[],[],[],[],[],[],[],[],[]
79,179048679117,345633829,251782546,[],quote,2083.0,,//.16yrs.//.D?ra.//-^^-?Rock lover.?-^^-// Sna...,[],['16'],...,[],[],[],[],[],[],[],[],[],[]


In [10]:
test_nonreblog_features

Unnamed: 0,post_id,tumblog_id_follower,tumblog_id_followee,post_tags,post_type,post_note_count,processed_blog_description_follower,processed_blog_description_followee,age_terms_follower,age_terms_followee,...,roleplay_terms_follower,roleplay_terms_followee,roleplay/fandoms_terms_follower,roleplay/fandoms_terms_followee,sexual orientation_terms_follower,sexual orientation_terms_followee,weight_terms_follower,weight_terms_followee,zodiac_terms_follower,zodiac_terms_followee
12,179594866029,345633829,167252595,[],quote,59.0,,A🔒💕,[],[],...,[],[],[],[],[],[],[],[],[],[]
23,176673796747,154808845,157637220,[],photo,0.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
41,176673796747,154808845,157637220,[],photo,0.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
43,174547892457,154808845,157637220,[],photo,45548.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
46,176673811947,154808845,157637220,[],photo,731.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
47,176673796747,154808845,157637220,[],photo,0.0,just for fun,formerly theact-of - still NSFW,[],[],...,[],[],[],[],[],[],[],[],[],[]
50,180490391904,345633829,167252595,[],photo,4913.0,,A🔒💕,[],[],...,[],[],[],[],[],[],[],[],[],[]
63,176996131069,345633829,167252595,[],photo,421320.0,,A🔒💕,[],[],...,[],[],[],[],[],[],[],[],[],[]
66,180491238099,345633829,167252595,[],text,19604.0,,A🔒💕,[],[],...,[],[],[],[],[],[],[],[],[],[]
79,179049402189,345633829,167252595,[],quote,8539.0,,A🔒💕,[],[],...,[],[],[],[],[],[],[],[],[],[]


In [11]:
# Check follower train/test overlap

train_followers = set(train_reblog_features['tumblog_id_follower'])
test_followers = set(test_reblog_features['tumblog_id_follower'])
print(len(test_followers))
print(len(test_followers.intersection(train_followers)))

505
498


In [42]:
# Save out

websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'

reblog_train_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'train', 'feature_tables', 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'train', 'feature_tables', 'nonreblog_features.csv')
labels_train_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'train', 'feature_tables', 'ranking_labels.csv')
train_reblog_features.to_csv(reblog_train_fpath, index=False)
train_nonreblog_features.to_csv(nonreblog_train_fpath, index=False)
train_ranking_labels.to_csv(labels_train_fpath, index=False)

reblog_test_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'test', 'feature_tables', 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'test', 'feature_tables', 'nonreblog_features.csv')
labels_test_fpath = os.path.join(websci2020_data_dirpath, 'reblog_split', 'test', 'feature_tables', 'ranking_labels.csv')
test_reblog_features.to_csv(reblog_test_fpath, index=False)
test_nonreblog_features.to_csv(nonreblog_test_fpath, index=False)
test_ranking_labels.to_csv(labels_test_fpath, index=False)

# Create dataset with no train-test user overlap

In [1]:
# Load ICWSM 2020 dataset

import pandas as pd
import os
from sklearn.model_selection import train_test_split

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

712670
712670


In [23]:
ranking_labels = pd.read_csv(os.path.join('/data/websci2020_tumblr_identity/icwsm2020_sample1k/feature_tables', 'ranking_labels.csv')).iloc[:len(reblog_features)]
len(ranking_labels)

712670

In [9]:
# Split dataset on followers
followers = reblog_features['tumblog_id_follower'].unique() # should be the same as those in nonreblogs
print(len(followers))

# Double-check that reblog followers same as nonreblog
nonreblog_followers = nonreblog_features['tumblog_id_follower'].unique()
len(set(followers).intersection(set(nonreblog_followers)))

706

In [10]:
train_followers, test_followers = train_test_split(followers, test_size=0.1, random_state=12345)
print(len(train_followers))
print(len(test_followers))

635
71


In [11]:
# Create train, test datasets with no overlapping followers
train_row_mask = reblog_features['tumblog_id_follower'].isin(train_followers) # this should be in order
train_reblogs = reblog_features[train_row_mask]
train_nonreblogs = nonreblog_features[train_row_mask]
print(len(train_reblogs))
print(len(train_nonreblogs))

test_row_mask = reblog_features['tumblog_id_follower'].isin(test_followers) # should be inverse of train_row_mask
test_reblogs = reblog_features[test_row_mask]
test_nonreblogs = nonreblog_features[test_row_mask]
print(len(test_reblogs))
print(len(test_nonreblogs))

657844
657844
54826
54826


In [24]:
train_ranking_labels = ranking_labels[train_row_mask.tolist()]
print(len(train_ranking_labels))

test_ranking_labels = ranking_labels[test_row_mask.tolist()]
print(len(test_ranking_labels))

657844
54826


In [14]:
# Save out

websci2020_websci2020_data_dirpath = '/data/websci2020_tumblr_identity/'

reblog_train_fpath = os.path.join(websci2020_data_dirpath, 'websci2020_train', 'feature_tables', 'reblog_features.csv')
nonreblog_train_fpath = os.path.join(websci2020_data_dirpath, 'websci2020_train', 'feature_tables', 'nonreblog_features.csv')
train_reblogs.to_csv(reblog_train_fpath, index=False)
train_nonreblogs.to_csv(nonreblog_train_fpath, index=False)

reblog_test_fpath = os.path.join(websci2020_data_dirpath, 'websci2020_test', 'feature_tables', 'reblog_features.csv')
nonreblog_test_fpath = os.path.join(websci2020_data_dirpath, 'websci2020_test', 'feature_tables', 'nonreblog_features.csv')
test_reblogs.to_csv(reblog_test_fpath, index=False)
test_nonreblogs.to_csv(nonreblog_test_fpath, index=False)

In [25]:
labels_train_fpath = os.path.join(websci2020_data_dirpath, 'websci2020', 'train', 'feature_tables', 'ranking_labels.csv')
train_ranking_labels.to_csv(labels_train_fpath, index=False)

labels_test_fpath = os.path.join(websci2020_data_dirpath, 'websci2020', 'test', 'feature_tables', 'ranking_labels.csv')
test_ranking_labels.to_csv(labels_test_fpath, index=False)

In [16]:
# Save out old ICWSM 2020 data after messed accidentally overwrote the files

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features.to_csv(reblog_fpath, index=False)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features.to_csv(nonreblog_fpath, index=False)
print(len(nonreblog_features))

712670
712670


# Inspect test and training set for user/row overlap

## Row overlap
There is overlap between reblogs and nonreblogs in training/test sets, but not overlap on individual reblog-nonreblog pairs (instances).

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
reblog_train, reblog_test = train_test_split(reblog_features, test_size=0.1, random_state=12345)
print(len(reblog_test))
# print(len(reblog_train))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
nonreblog_train, nonreblog_test = train_test_split(nonreblog_features, test_size=0.1, random_state=12345)
print(len(nonreblog_test))

print(len(reblog_train))

71267
71267
641403


In [33]:
# Search for duplicates
print(len(reblog_features))
print(len(reblog_features.drop_duplicates()))

712670
151491


In [56]:
# Search for duplicates
print(len(nonreblog_features))
print(len(nonreblog_features.drop_duplicates()))

712670
326610


In [2]:
# Search for intersection between training and test set (reblogs)
reblog_train_nodups = reblog_train.drop_duplicates()
reblog_test_nodups = reblog_test.drop_duplicates()
print(len(reblog_test_nodups))
print(len(reblog_train_nodups.merge(reblog_test_nodups, how='inner')))

58752
58432


In [5]:
reblog_test.columns

Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_te

In [9]:
train_reblogs = set(tuple(line) for line in reblog_train.loc[:,['post_id', 'tumblog_id_follower']].values.tolist())
train_reblogs

{(175801888191, 373830812),
 (177581510928, 147385488),
 (177325822255, 253308707),
 (180410274603, 216821993),
 (175315673122, 198137822),
 (178105229005, 92761434),
 (178853627353, 191248972),
 (178246237983, 281404113),
 (174716192668, 118557948),
 (179976237872, 328371677),
 (180140811834, 375582244),
 (180452229984, 9223044),
 (177240815686, 252117914),
 (177704732201, 28708966),
 (176157725968, 34908236),
 (178767286963, 233199282),
 (178298548711, 260692064),
 (178819187415, 23372159),
 (177485378767, 69758563),
 (180289443024, 244035848),
 (180443647841, 242685316),
 (179534951659, 82226842),
 (178418517386, 402194613),
 (177708122257, 69758563),
 (175545748029, 154808845),
 (175378549411, 197386162),
 (179768083466, 131748961),
 (176507442397, 79655929),
 (177725300936, 14629968),
 (179359689165, 112843253),
 (179529240329, 258489292),
 (176200198831, 34015569),
 (177374459787, 14629968),
 (177868252865, 23372159),
 (179807750933, 444855732),
 (178004977166, 281404113),
 (1768

In [10]:
# Search for intersection between training and test set (reblogs)
print(len(reblog_test))
reblog_test['reblog_in_train'] = [pair in train_reblogs for pair in list(zip(reblog_test['post_id'], reblog_test['tumblog_id_follower']))]
reblog_test['reblog_in_train'].sum()

71267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


70903

In [3]:
# Search for intersection between training and test set (nonreblogs)
nonreblog_train_nodups = nonreblog_train.drop_duplicates()
nonreblog_test_nodups = nonreblog_test.drop_duplicates()
print(len(nonreblog_test_nodups))
print(len(nonreblog_train_nodups.merge(nonreblog_test_nodups, how='inner')))

58879
38078


In [57]:
# Search for exact intersection between reblog-nonreblog pairs

selected_cols = ['post_id', 'tumblog_id_follower', 'tumblog_id_followee']
selected_reblog_train = reblog_train[selected_cols]
selected_reblog_train.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']
selected_reblog_test = reblog_test[selected_cols]
selected_reblog_test.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']

selected_nonreblog_train = nonreblog_train[selected_cols]
selected_nonreblog_train.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']
selected_nonreblog_test = nonreblog_test[selected_cols]
selected_nonreblog_test.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']

train_matched = pd.concat([selected_reblog_train, selected_nonreblog_train], axis=1)
test_matched = pd.concat([selected_reblog_test, selected_nonreblog_test], axis=1)
print(train_matched.shape)
print(train_matched.columns)

(641403, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')


In [59]:
# Exact row overlap
train_matched_nodups = train_matched.drop_duplicates()
test_matched_nodups = test_matched.drop_duplicates()
print(len(train_matched_nodups))
print(len(test_matched_nodups))
print(len(train_matched_nodups.merge(test_matched_nodups, how='inner')))

641403
71267
0


In [31]:
# Count followers, followees in each
train_followers = set(reblog_train['tumblog_id_follower']).union(set(nonreblog_train['tumblog_id_follower']))
print(f'Number of followers in training set: {len(train_followers)}')

test_followers = set(reblog_test['tumblog_id_follower']).union(set(nonreblog_test['tumblog_id_follower']))
print(f'Number of followers in test set: {len(test_followers)}')

print(f'Overlap in followers in training set: {len(train_followers.intersection(test_followers))}')

Number of followers in training set: 705
Number of followers in test set: 599
Overlap in followers in training set: 598


In [32]:
# Count followees, followees in each
train_followees = set(reblog_train['tumblog_id_followee']).union(set(nonreblog_train['tumblog_id_followee']))
print(f'Number of followees in training set: {len(train_followees)}')

test_followees = set(reblog_test['tumblog_id_followee']).union(set(nonreblog_test['tumblog_id_followee']))
print(f'Number of followees in test set: {len(test_followees)}')

print(f'Overlap in followees in training set: {len(train_followees.intersection(test_followees))}')

Number of followees in training set: 33715
Number of followees in test set: 20414
Overlap in followees in training set: 20034


In [34]:
# See how many unique tuples of (follower, followee1, followee2) there are

train_tuples = set(zip(reblog_train['tumblog_id_follower'], reblog_train['tumblog_id_followee'], nonreblog_train['tumblog_id_followee']))
print(f'Number of unique tuples in training set: {len(train_tuples)}')

test_tuples = set(zip(reblog_test['tumblog_id_follower'], reblog_test['tumblog_id_followee'], nonreblog_test['tumblog_id_followee']))
print(f'Number of unique tuples in testing set: {len(test_tuples)}')


Number of unique tuples in training set: 252969
Number of unique tuples in testing set: 52317


# Create subsets of the profile-image included dataset for Yansen

In [60]:
# Check for overlap train/test
import pandas as pd
import os

# Load test set
test_set_dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', 'test_set', 'feature_tables')
test_set = (
    pd.read_csv(os.path.join(test_set_dirpath, 'reblog_features.csv')),
    pd.read_csv(os.path.join(test_set_dirpath, 'nonreblog_features.csv')),
    pd.read_csv(os.path.join(test_set_dirpath, 'ranking_labels.csv')),
)

len(test_set[0])

22843

In [61]:
# Load subsets of training set
subsets = {}

for n in [50000, 100000, 150000, 200000]:
    dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', str(n), 'feature_tables')
    subsets[n] = (
        pd.read_csv(os.path.join(dirpath, 'reblog_features.csv')),
        pd.read_csv(os.path.join(dirpath, 'nonreblog_features.csv')),
        pd.read_csv(os.path.join(dirpath, 'ranking_labels.csv')),
    )
    
print(len(subsets[50000][0]))

50000


In [19]:
# Check for reblog overlap

test_keys = {}
test_keys['reblog'] = set(zip(test_set[0]['tumblog_id_follower'], test_set[0]['tumblog_id_followee'], test_set[0]['post_id']))

for n in [50000, 100000, 150000, 200000]:
    sample_keys = set(zip(subsets[n][0]['tumblog_id_follower'], subsets[n][0]['tumblog_id_followee'], subsets[n][0]['post_id']))
    overlap = len(test_keys['reblog'].intersection(sample_keys))
    print(f'{n}: {overlap}')

50000: 10626
100000: 16083
150000: 18391
200000: 19253


In [65]:
# Search for exact intersection between reblog-nonreblog pairs

for n in [50000, 100000, 150000, 200000]:

    reblog_train = subsets[n][0]
    nonreblog_train = subsets[n][1]
    reblog_test = test_set[0]
    nonreblog_test = test_set[1]

    selected_cols = ['post_id', 'tumblog_id_follower', 'tumblog_id_followee']
    selected_reblog_train = reblog_train[selected_cols]
    selected_reblog_train.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']
    selected_reblog_test = reblog_test[selected_cols]
    selected_reblog_test.columns = ['post_id_reblog', 'tumblog_id_follower_reblog', 'tumblog_id_followee_reblog']

    selected_nonreblog_train = nonreblog_train[selected_cols]
    selected_nonreblog_train.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']
    selected_nonreblog_test = nonreblog_test[selected_cols]
    selected_nonreblog_test.columns = ['post_id_nonreblog', 'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog']

    train_matched = pd.concat([selected_reblog_train, selected_nonreblog_train], axis=1)
    test_matched = pd.concat([selected_reblog_test, selected_nonreblog_test], axis=1)
    print(train_matched.shape)
    print(train_matched.columns)

    # Exact row overlap
    train_matched_nodups = train_matched.drop_duplicates()
    test_matched_nodups = test_matched.drop_duplicates()
    print(len(train_matched_nodups))
    print(len(test_matched_nodups))
    print(len(train_matched_nodups.merge(test_matched_nodups, how='inner')))
    print()

(50000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
50000
22843
0

(100000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
100000
22843
0

(150000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
150000
22843
0

(200000, 6)
Index(['post_id_reblog', 'tumblog_id_follower_reblog',
       'tumblog_id_followee_reblog', 'post_id_nonreblog',
       'tumblog_id_follower_nonreblog', 'tumblog_id_followee_nonreblog'],
      dtype='object')
200000
22843
0



In [20]:
# Load profile-only dataset
import pandas as pd
import os

profile_dataset_dirpath = '/data/websci2020_yansen/icwsm2020_sample1k_profile_images/feature_tables/'
reblog_fpath = os.path.join(profile_dataset_dirpath, 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(profile_dataset_dirpath, 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

ranking_labels_fpath = os.path.join(profile_dataset_dirpath, 'ranking_labels.csv')
ranking_labels = pd.read_csv(ranking_labels_fpath)
print(len(ranking_labels))

228424
228424
228424


In [21]:
# Make test set for learning curve analysis
# test_set = (
#     reblog_features.sample(n=28424, random_state=50),
#     nonreblog_features.sample(n=28424, random_state=50),
#     ranking_labels.sample(n=28424, random_state=50)
# )

from sklearn.model_selection import train_test_split

test_set = {}
training_set = {}
training_set['reblog'], test_set['reblog'] = train_test_split(reblog_features, test_size=0.1, random_state=12345)
training_set['nonreblog'], test_set['nonreblog'] = train_test_split(nonreblog_features, test_size=0.1, random_state=12345)
training_set['ranking_labels'], test_set['ranking_labels'] = train_test_split(ranking_labels, test_size=0.1, random_state=12345)

# test_set = (
#     train_test_split(reblog_features, test_size=0.1, random_state=12345)[1],
#     train_test_split(nonreblog_features, test_size=0.1, random_state=12345)[1],
#     train_test_split(ranking_labels, test_size=0.1, random_state=12345)[1],
# )

# Save out
out_dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', 'test_set', 'feature_tables')
test_set['reblog'].to_csv(os.path.join(out_dirpath, 'reblog_features.csv'), index=False)
test_set['nonreblog'].to_csv(os.path.join(out_dirpath, 'nonreblog_features.csv'), index=False)
test_set['ranking_labels'].to_csv(os.path.join(out_dirpath, 'ranking_labels.csv'), index=False)

In [25]:
print(len(training_set['reblog']))
print(len(set(zip(
    training_set['reblog']['tumblog_id_follower'], 
    training_set['reblog']['tumblog_id_followee'],
    training_set['reblog']['post_id'],
))))

205581
58829


In [26]:
print(len(reblog_features))
print(len(set(zip(
    reblog_features['tumblog_id_follower'], 
    reblog_features['tumblog_id_followee'],
    reblog_features['post_id'],
))))

228424
59075


In [28]:
len(reblog_features)

228424

In [27]:
len(reblog_features.drop_duplicates())

59075

In [23]:
# Check for overlap

test_keys = {}
train_keys = {}

test_keys['reblog'] = set(zip(test_set['reblog']['tumblog_id_follower'], test_set['reblog']['tumblog_id_followee'], test_set['reblog']['post_id']))
train_keys['reblog'] = set(zip(training_set['reblog']['tumblog_id_follower'], training_set['reblog']['tumblog_id_followee'], training_set['reblog']['post_id']))
overlap = len(test_keys['reblog'].intersection(train_keys['reblog']))
overlap

# for n in [50000, 100000, 150000, 200000]:
#     sample_keys = set(zip(subsets[n][0]['tumblog_id_follower'], subsets[n][0]['tumblog_id_followee'], subsets[n][0]['post_id']))
#     overlap = len(test_keys['reblog'].intersection(sample_keys))
#     print(f'{n}: {overlap}')

19291

In [14]:
# Sample from the rest of the training set
samp = {}

for n in [50000, 100000, 150000, 200000]:
    samp[n] = (
        training_set['reblog'].sample(n=n, random_state=50),
        training_set['nonreblog'].sample(n=n, random_state=50),
        training_set['ranking_labels'].sample(n=n, random_state=50)
    )

In [15]:
# Save out

for n in [50000, 100000, 150000, 200000]:
    out_dirpath = os.path.join('/data/websci2020_yansen/icwsm2020_sample1k_profile_images/', str(n), 'feature_tables')
#     os.makedirs(out_dirpath)
    samp[n][0].to_csv(os.path.join(out_dirpath, 'reblog_features.csv'), index=False)
    samp[n][1].to_csv(os.path.join(out_dirpath, 'nonreblog_features.csv'), index=False)
    samp[n][2].to_csv(os.path.join(out_dirpath, 'ranking_labels.csv'), index=False)

# Change blog names to IDs on profile images for Yansen

In [1]:
# Load transformation
import pickle

with open('/data/websci2020_tumblr_identity/tmp/blogname2id.pkl', 'rb') as f:
    blogname2id = pickle.load(f)
    
len(blogname2id)

82169

In [6]:
# Load images
from IPython.core.debugger import set_trace
import os, shutil
from tqdm import tqdm_notebook as tqdm

current_imagepath = '/usr0/home/yansenwa/tumblr/data/processed/'
out_imagepath = '/data/websci2020_yansen/processed_profile_images/'

failures = []

for dirname in tqdm(os.listdir(current_imagepath)):
    dirpath = os.path.join(current_imagepath, dirname)
    
    # Create directory paths
    out_dirpath = os.path.join(out_imagepath, dirname)
    if not os.path.exists(out_dirpath):
        os.mkdir(out_dirpath)
            
    for fname in os.listdir(dirpath):
        src_fpath = os.path.join(dirpath, fname)
        blogname = fname.split('.')[0]
        if not blogname in blogname2id:
            set_trace()
        blogid = blogname2id[blogname]
        
       # Save out
        out_fpath = os.path.join(out_dirpath, f'{blogid}.png')
            
        try:
            shutil.copy(src_fpath, out_fpath)
        except OSError as e:
            failures.append((src_fapth, out_fpath))
            
len(failures)

HBox(children=(IntProgress(value=0, max=768), HTML(value='')))




0

# Create experimental dataset for Yansen

In [9]:
# Load data
import pandas as pd
import os

data_dirpath = '/data/websci2020_yansen'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

print(reblog_features.columns)
print(nonreblog_features.columns)

712670
712670
Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_terms_followee', 'sexual orientation_terms_follower',
       'sexual orie

In [6]:
drop_cols = [
    'processed_blog_description_follower',
 'processed_blog_description_followee',
]
reduced_reblogs = reblog_features.drop(columns=drop_cols)
reduced_reblogs.columns.tolist()

['post_id',
 'tumblog_id_follower',
 'tumblog_id_followee',
 'post_tags',
 'post_type',
 'post_note_count',
 'age_terms_follower',
 'age_terms_followee',
 'ethnicity/nationality_terms_follower',
 'ethnicity/nationality_terms_followee',
 'fandoms_terms_follower',
 'fandoms_terms_followee',
 'gender_terms_follower',
 'gender_terms_followee',
 'gender/sexuality_terms_follower',
 'gender/sexuality_terms_followee',
 'interests_terms_follower',
 'interests_terms_followee',
 'location_terms_follower',
 'location_terms_followee',
 'personality type_terms_follower',
 'personality type_terms_followee',
 'pronouns_terms_follower',
 'pronouns_terms_followee',
 'relationship status_terms_follower',
 'relationship status_terms_followee',
 'roleplay_terms_follower',
 'roleplay_terms_followee',
 'roleplay/fandoms_terms_follower',
 'roleplay/fandoms_terms_followee',
 'sexual orientation_terms_follower',
 'sexual orientation_terms_followee',
 'weight_terms_follower',
 'weight_terms_followee',
 'zodiac_t

In [7]:
drop_cols = [
    'processed_blog_description_follower',
 'processed_blog_description_followee',
]
reduced_nonreblogs = nonreblog_features.drop(columns=drop_cols)
reduced_nonreblogs.columns.tolist()

['post_id',
 'tumblog_id_follower',
 'tumblog_id_followee',
 'post_tags',
 'post_type',
 'post_note_count',
 'age_terms_follower',
 'age_terms_followee',
 'ethnicity/nationality_terms_follower',
 'ethnicity/nationality_terms_followee',
 'fandoms_terms_follower',
 'fandoms_terms_followee',
 'gender_terms_follower',
 'gender_terms_followee',
 'gender/sexuality_terms_follower',
 'gender/sexuality_terms_followee',
 'interests_terms_follower',
 'interests_terms_followee',
 'location_terms_follower',
 'location_terms_followee',
 'personality type_terms_follower',
 'personality type_terms_followee',
 'pronouns_terms_follower',
 'pronouns_terms_followee',
 'relationship status_terms_follower',
 'relationship status_terms_followee',
 'roleplay_terms_follower',
 'roleplay_terms_followee',
 'roleplay/fandoms_terms_follower',
 'roleplay/fandoms_terms_followee',
 'sexual orientation_terms_follower',
 'sexual orientation_terms_followee',
 'weight_terms_follower',
 'weight_terms_followee',
 'zodiac_t

In [8]:
# Save out
reduced_reblogs.to_csv(reblog_fpath, index=False)
reduced_nonreblogs.to_csv(nonreblog_fpath, index=False)

# Get examples from ICWSM 2020 dataset reblogs, nonreblogs

In [1]:
# Match blog names to tumblog IDs
import pandas as pd

# Load reblog info
reblog_info = pd.read_csv('/usr2/mamille2/tumblr/data/sample1k/reblogs_descs_annotated/reblogs_descs.tsv', sep='\t')
reblog_info.columns

# Load pickle
nonreblog_info = pd.read_pickle('/data/websci2020_tumblr_identity/tmp/nonreblog_info.pkl')
print(nonreblog_info.columns)
len(nonreblog_info)

Index(['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower',
       'blog_name_followee'],
      dtype='object')


30765997

In [2]:
pd.set_option('display.max_rows', 10000)

# Load ICWSM 2020 dataset
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

712670
712670


In [4]:
reblog_features.columns.tolist()

['post_id',
 'tumblog_id_follower',
 'tumblog_id_followee',
 'post_tags',
 'post_type',
 'post_note_count',
 'processed_blog_description_follower',
 'processed_blog_description_followee',
 'age_terms_follower',
 'age_terms_followee',
 'ethnicity/nationality_terms_follower',
 'ethnicity/nationality_terms_followee',
 'fandoms_terms_follower',
 'fandoms_terms_followee',
 'gender_terms_follower',
 'gender_terms_followee',
 'gender/sexuality_terms_follower',
 'gender/sexuality_terms_followee',
 'interests_terms_follower',
 'interests_terms_followee',
 'location_terms_follower',
 'location_terms_followee',
 'personality type_terms_follower',
 'personality type_terms_followee',
 'pronouns_terms_follower',
 'pronouns_terms_followee',
 'relationship status_terms_follower',
 'relationship status_terms_followee',
 'roleplay_terms_follower',
 'roleplay_terms_followee',
 'roleplay/fandoms_terms_follower',
 'roleplay/fandoms_terms_followee',
 'sexual orientation_terms_follower',
 'sexual orientation

In [11]:
pd.set_option('display.max_colwidth', -1)

selected_columns = ['post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']
reblog_has_tags = reblog_features[reblog_features['post_tags'].map(lambda x: len(x) > 2)]
reblog_has_tags.sample(200).loc[:, selected_columns]
nonreblog_has_tags = nonreblog_features[nonreblog_features['post_tags'].map(lambda x: len(x) > 2)]
nonreblog_has_tags.sample(200).loc[:, selected_columns]

Unnamed: 0,post_tags,processed_blog_description_followee,processed_blog_description_follower
596268,['hello <3'],"And now I'm gone, but it don't hurt me anymore.",
366676,"['gastrictank', 'boyfriend', 'paperdoll', 'mixed media', 'traditional']",ABOUT ME MY CHARACTERS BOYFRIEND'S ART BLOG DEVIANTART / INSTAGRAM STORENVY / SOCIETY6 COMMISSIONS- filled SLOTS TAGS #ART #FANART #ORIGINAL #POKEMON #HOMESTUCK #PUNCH-OUT!! #GASTRICTANK,"This is the same as my old blog, I'll reblog my commissions and gift art, but won't sort much, and will reblog other stuff. Commission/gift blog over here"
266061,['abortion'],"Chinese-Canadian, panromantic ace, she/her pronouns. People of all races, faiths, nationalities, and orientations are welcome here. Anti-choicers and exclusionists of any kind will be called out and blocked. Rest in peace Anton Yelchin. #BlackLivesMatter. If you need me to tag something for you just ask!",sophia. 19. nz
666640,"['Roy Lavi', 'underwear', 'ABS', 'small']","Not literally. Although you never know. Please be old enough to legally look at naked men. Name Matthew Location New England Stats 33, 6'4"", hairy, bearded Photos here Profession homoerotic art magazine editor FAQ here Twitter here PG-rated Tumblr here email mixtapesforhookers@gmail.com Hot people I've Interviewed: Bennett Anthony Derek Anthony Boomer Banks Dario Beck Sailor Blue Aleks Buldoček Dirk Caber (part 2) Dirk Caber (again) Nick Capra Kennedy Carter Dale Cooper Race Cooper Adam Dacre Jake Deckard Bravo Delta Mike De Marko Diphallic Dude Ben Driver Landon El Mike Enders Darius Ferdyd Dominic Ford Brayden Forrester Mike Gaite Tayte Hanson Chad Hunter Jesse Jackman (part 2) (part 3) Andrew Justice Colby Keller Kevin Lee Shay Michaels Seamus O'Reilly Deviant Otter David Pevsner Steven Ponce JP Richards Adam Russo Bryan Slater Aaron Tone (part 2) Tyson Tyler Austin Wilde",Love a man with body hair. Don't care too much for the shaved look.... hot or not. :)
607228,['aimee'],✩ kyy. 19. female. ✩ stranger things + it. ✩ would die for finn wolfhard. tracking #userkyy.,karina. 25. mostly just a multifandom blog
316392,['Cow Chop Fan Art'],Welcome to my blog here you will mostly find J2 & SPN with a few NSFW posts. There is NO destiel or cockles here. My inbox is always open. ALWAYS KEEP FIGHTING.,"Kayla, 23, USA. She/Her. Biromantic asexual. Criminal Justice grad student. Amateur artist and writer. Feminist. Dedicated Sam!girl. Proud Hufflepuff. Die-hard Marvel fan."
415105,['dm me for cheap promos!'],dm me for cheap promos get paid for walking,
653004,"['Kiss', 'Romance', 'Gif', 'Romantic', 'Cute', 'Relationship', 'Relationships']",You and me forever please ?,
21391,['music'],"Images that capture my eye and make me smile... or.. more. Pics are NSFW and +18. For those of you who prefer color images, I have a color blog that is NSFW, shes-sweet-in-color.tumblr.com. All pics are reposted unless I state otherwise. I also have a SFW blog https://giddyaboutpics.tumblr.com. Any audio clips can be found if you do a search on my page by typing in #erotica. The voice you hear is yours truly. I hope you enjoy.",
204581,"['league of legends', 'lamb', 'kindred lamb', 'sleeping lamb', 'cute kindred', 'maskless lamb']",NSFW Artblog by Lamb of Anubis. Kindred artist.,"This is the same as my old blog, I'll reblog my commissions and gift art, but won't sort much, and will reblog other stuff. Commission/gift blog over here"


In [10]:
# Eth/nat reblogged matches

cond = [len(tup[0]) > 2 and len(tup[1]) > 2 for tup in zip(has_tags['ethnicity/nationality_terms_follower'], has_tags['ethnicity/nationality_terms_followee'])]
selected = has_tags[cond]
print(len(selected))
additional_columns = ['ethnicity/nationality_terms_follower', 'ethnicity/nationality_terms_followee']
selected.sample(20).loc[:, additional_columns + selected_columns]

107


Unnamed: 0,ethnicity/nationality_terms_follower,ethnicity/nationality_terms_followee,post_tags,processed_blog_description_followee,processed_blog_description_follower
407860,['Latina'],"['English', 'German', 'Japanese']",['not hetalia'],"I post about Hetalia (mostly UsUk) & Other Random stuff. I can fluently speak English, German and my own language Farsi \(^^ ) Sometimes I Translate short Comics. I'm studying Japanese by myself, so I can't take requests. I tag everything~*",ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
679135,['mexican'],['English'],"['Tokyo Ghoul:re', 'Kaneki Ken', 'Kaneki Ichika', 'so cute!!']","Currently going through my backlog of translations owo Tokyo Ghoul blog. NOT SPOILER FREE, but I will tag new :re chapters as ""TG spoilers"" until the English translated chapter comes out. I translate TG:re for Jaimini's Box, and do lots of colouring and edits.",24-year-old mexican Architect
172180,['Latina'],"['English', 'German', 'Japanese']","[""lmao x'D"", 'Marukaite Chiqueue']","I post about Hetalia (mostly UsUk) & Other Random stuff. I can fluently speak English, German and my own language Farsi \(^^ ) Sometimes I Translate short Comics. I'm studying Japanese by myself, so I can't take requests. I tag everything~*",ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
673899,"['Chinese', 'Vietnamese']",['white'],['james baldwin'],"Hi, I'm Ian (they/them)! I'm a white nb wlw and I love books and bad puns. Let me know if you need anything tagged!","I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
707130,['Filipino'],['Filipino'],"['GHGHGHGOODDD', 'q']",hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online. Icon by jasontddd"
198440,['mexican'],['English'],"['Tokyo Ghoul:re', 'Kaneki Ken', 'Kaneki Ichika', 'so cute!!']","Currently going through my backlog of translations owo Tokyo Ghoul blog. NOT SPOILER FREE, but I will tag new :re chapters as ""TG spoilers"" until the English translated chapter comes out. I translate TG:re for Jaimini's Box, and do lots of colouring and edits.",24-year-old mexican Architect
311641,['Filipino'],['Filipino'],"['deltarune', 'q']",hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online. Icon by jasontddd"
370337,['Latina'],"['English', 'German', 'Japanese']","['imagine your otp', 'Marukaite Chiqueue']","I post about Hetalia (mostly UsUk) & Other Random stuff. I can fluently speak English, German and my own language Farsi \(^^ ) Sometimes I Translate short Comics. I'm studying Japanese by myself, so I can't take requests. I tag everything~*",ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
595185,"['Chinese', 'Vietnamese']",['white'],['check please'],"Hi, I'm Ian (they/them)! I'm a white nb wlw and I love books and bad puns. Let me know if you need anything tagged!","I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
332920,['Latina'],"['English', 'German', 'Japanese']",['not hetalia'],"I post about Hetalia (mostly UsUk) & Other Random stuff. I can fluently speak English, German and my own language Farsi \(^^ ) Sometimes I Translate short Comics. I'm studying Japanese by myself, so I can't take requests. I tag everything~*",ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹


In [12]:
# Eth/nat non-reblogged follower presents, followee doesn't present

cond = [len(tup[0]) > 2 and len(tup[1]) <= 2 for tup in zip(nonreblog_has_tags['ethnicity/nationality_terms_follower'], nonreblog_has_tags['ethnicity/nationality_terms_followee'])]
selected = nonreblog_has_tags[cond]
print(len(selected))
additional_columns = ['ethnicity/nationality_terms_follower', 'ethnicity/nationality_terms_followee']
selected.sample(20).loc[:, additional_columns + selected_columns]

3027


Unnamed: 0,ethnicity/nationality_terms_follower,ethnicity/nationality_terms_followee,post_tags,processed_blog_description_followee,processed_blog_description_follower
549970,['Canadian'],[],"['the ghost of lunch tomorrow', 'classic hits']","screencaps of the hit pbs kid's show arthur. out of context, except when they're not. why not suss some great classic hits ? all questions and queries are welcome, but be sure to read the FAQ ! and please let me know if you'd like me to respond to a message privately! also, can i interest you in some fine apparel ?",23 | Canadian | Loves Bees
538461,['Canadian'],[],"['lol', 'garfield minus garfield', 'webcomic', 'monster']",,23 | Canadian | Loves Bees
276839,"['Chinese', 'Vietnamese']",[],"['I NEED TO FIND OUT WHO THE CVS ARE', 'bnha spoilers']",Cathy | she/her | cis woman | 25 | panro ace this blog is on indefinite hiatus. pls follow my side blog! @pugszler,"I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
393210,['Latina'],[],['rhps'],"gothiethefairy's the name, but I'm cool if ya call me Gothie. I'm 27-years old. I'll just be blogging about random shit. Lots of random fandoms. I also write! Mostly fanfics and drabbles. icon and background was created by @mabuhaylyn",ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
566736,['Latina'],[],"['catholic', 'sacred heart of jesus']","And I will be a father to you and you shall be sons and daughters to Me, says the Lord Almighty. - 2 Cor 6:18",ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
276870,"['Chinese', 'Vietnamese']",[],['!!!!'],Cathy | she/her | cis woman | 25 | panro ace this blog is on indefinite hiatus. pls follow my side blog! @pugszler,"I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
602130,['Latina'],[],"['aww', 'photos']",,ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
520074,"['Chinese', 'Vietnamese']",[],"['linguistics', 'uvula', 'alan wrench', 'ultrasound', 'body horror']","A daily blog about all things linguistic by Gretchen McCulloch. I also cohost a linguistics podcast, Lingthusiasm.","I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
602432,['Latina'],[],"['aww', 'photos']",,ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹
283219,['Latina'],[],"[""{i feel ya anon... please don't let anything of the sort damage your self worth though... as hard as it is}"", 'jyushimatsu matsuno', 'juushimatsu', 'jyushimatsu']",Hi! Name's AKHTS. Over 25 now – holy shit. I'm a Bob's Burgers... Watcher or something. From France. I'm useless. You probably don't know about me. You shouldn't. Feel free to ask me stuff I guess! I like to talk. Usually. ASK MY ART DEVIANTART ABOUT,ヾ(*´∀｀*)ﾉ She/Her🌹 20🌹Christian🌹Latina🌹 Hopeless Romantic🌹 INFJ 🌹Libertarian🌹Meme King 🌹Bisexual🌹


In [13]:
# Eth/nat non-reblogged both present

cond = [len(tup[0]) > 2 and len(tup[1]) > 2 for tup in zip(nonreblog_has_tags['ethnicity/nationality_terms_follower'], nonreblog_has_tags['ethnicity/nationality_terms_followee'])]
selected = nonreblog_has_tags[cond]
print(len(selected))
additional_columns = ['ethnicity/nationality_terms_follower', 'ethnicity/nationality_terms_followee']
selected.sample(20).loc[:, additional_columns + selected_columns]

177


Unnamed: 0,ethnicity/nationality_terms_follower,ethnicity/nationality_terms_followee,post_tags,processed_blog_description_followee,processed_blog_description_follower
122562,['Filipino'],['Filipino'],['q'],hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online"
582905,"['Chinese', 'Vietnamese']",['white'],['bruise'],"Too liberal for the radicals. Too radical for the liberals. She/Her Fuck off Terfs. Bi, disabled, ex-swer, abuse victim, cis, white DO NOT SCREENCAP IF I HAVE BLOCKED YOU. DON'T MESSAGE ME IF I HAVE BLOCKED YOU. DO NOT MESSAGE ME IF YOU ARE A TERF/SWERF/EXCLUSIONISTS","I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
122768,['Filipino'],['Filipino'],"['answer', 'undertale', 'undertale spoilers']",hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online"
122583,['Filipino'],['Filipino'],"['adventure time', 'doodles']",hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online"
651443,['mexican'],['Canadian'],['about me'],Canadian writer/editor/cat mama/dress addict. I write quite a lot of fic. My main fandoms are Dragon Age and Mass Effect. I'm currently head over heels for the show Lucifer. Currently working on a bunch of original fic (including a novel co-written with my bestest bestie: @w0rdinista) My avatar is by the incomparable @aelwen-art.,24-year-old mexican Architect
582986,"['Chinese', 'Vietnamese']",['white'],['ed'],"Too liberal for the radicals. Too radical for the liberals. She/Her Fuck off Terfs. Bi, disabled, ex-swer, abuse victim, cis, white DO NOT SCREENCAP IF I HAVE BLOCKED YOU. DON'T MESSAGE ME IF I HAVE BLOCKED YOU. DO NOT MESSAGE ME IF YOU ARE A TERF/SWERF/EXCLUSIONISTS","I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
122766,['Filipino'],['Filipino'],"['fdsjgkshjgkng', 'arts']",hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online"
692887,"['Chinese', 'Vietnamese']","['Asian', 'Asian', 'American']","['anon', 'tw sensitive material']","SE Asian . 20s. Sharing news, issues, and thoughts for the Asian American community. I don't own anything I share. Feel free to follow or ask me anything!","I'm Pi. She/her. Chinese & Vietnamese. Arospec/Acespec? Maybe 2xdemi? Idk??? Social justice, Overwatch, Sherlock, Legend of Zelda, Star Trek, Harry Potter, Pokémon, shitposts...etc about"
122574,['Filipino'],['Filipino'],"['ac', 'q']",hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online"
122758,['Filipino'],['Filipino'],['q'],hi i'm mon and you're the coolest!! 20 | ♍ | INFP | ICUP | Filipino say hi to your pets for me! shop ♡ patreon art tag ♡ doodle tag ♡ scribble tag commission tag ♡ art only blog about ♡ faq youtube ♡ twitter deviantart ♡ instagram ( please don't edit/repost my art without credit ) ( COMMISSION STATUS: CLOSED! ty!! ) have a nice day!,"Hi I'm KC, 20, Filipino, CisBi (She/Her), and I do not know how to interact w/ people online"


In [28]:
# cond = (has_tags['processed_blog_description_followee'].map(lambda x: isinstance(x, str) and 'cis' in x)) && (has_tags['processed_blog_description_follower'].map(lambda x: isinstance(x, str) and 'cis' in x))
# cond = [(isinstance(tup[0], str) and 'cis' in tup[0]) and (isinstance(tup[1], str) and 'cis' in tup[1]) for tup in zip(has_tags['processed_blog_description_follower'], has_tags['processed_blog_description_followee'])]
cond = [(isinstance(tup[0], str) and 'cis' in tup[0]) and (isinstance(tup[1], str) and 'cis' in tup[1]) for tup in zip(has_tags['processed_blog_description_follower'], has_tags['processed_blog_description_followee'])]
cis = has_tags[cond]
print(len(cis))
cis.sample(20).loc[:, selected_columns]

0


ValueError: 'a' must be greater than 0 unless no samples are taken

In [40]:
reblog_features.loc[[678786], ['post_id', 'post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']]

Unnamed: 0,post_id,post_tags,processed_blog_description_followee,processed_blog_description_follower
678786,180215628534,"['pokemon', 'type']","pokemom sun and moon, guzma, ssalbulre, ssalbug.","Hello there, I pretty much just reblog what I like on here, mainly things I find humorous, Pokemon stuff, and biology. I hope you enjoy your visit and have a wonderful day!"


In [41]:
reblog_info.loc[reblog_info['post_id_follower']==180215628534, ['post_id_follower','post_tags_follower', 'post_short_url_follower']]

Unnamed: 0,post_id_follower,post_tags_follower,post_short_url_follower
39310,180215628534,{},https://tmblr.co/ZItvBt2dri4Bs


In [50]:
selected_rows = [691129, 224020, 521221, 465263]
reblog_features.loc[selected_rows, ['post_id', 'post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']]

Unnamed: 0,post_id,post_tags,processed_blog_description_followee,processed_blog_description_follower
691129,179816353851,"['caylee you’ve been reading my fics since day 1 i think', 'plus you’re insanely talented!!!!!!!']","Kara, 19, Ravenclaw. Multi-fandom trash. I love Tom Holland. Sometimes I write.","Kaity. Main blog for @the-winchester-gospels-and-cas, where I write SPN and Marvel fanfics."
224020,174892897307,['mod jay'],"a blog focused on spreading lgbtqia+ positivity, providing a safe space, and fighting the stigma against us queer folks. byf | faq | get help | about us posts sorted by tag get involved in activism","• Caitlin • 25 year old alien living in the suburbs of Minnesota. very happy, very sad, very confused, very gay. doing my best!"
521221,175442981889,"['upper antelope canyon', 'US', 'usa', 'na', 'travel', 'traveling', 'explore', 'destinations', 'road trip', 'places', 'summer', 'wanderlust', 'landscape', 'photographers', 'travel photography']",? World through the lens of a traveler. | World | Travel | Journey | Vacation | Road Trip | Nature | Adventure | Wanderlust | Landscape | Photography | ?,Travel Blogger - Photography - Web Design - Virtual Assistant
465263,178055187751,"['c la reprise', 'gobelins', 'crfa21', 'green', 'swimming', 'fish', 'perso', 'back again']",animation student at Gobelins (CRFA21),wHY ARE YOU HERE My Art Blog v http://zelulae.tumblr.com/


In [52]:
post_id = 178055187751
reblog_info.loc[reblog_info['post_id_follower']==post_id, ['post_id_follower','post_tags_follower', 'post_short_url_follower']]

Unnamed: 0,post_id_follower,post_tags_follower,post_short_url_follower
7048,178055187751,{},https://tmblr.co/Z3_Mbk2bqweqd


In [33]:
[el for el in reblog_info.columns if 'url' in el]

['blog_url_followee',
 'blog_url_follower',
 'post_short_url_follower',
 'source_url_follower',
 'post_short_url_followee']

In [39]:
nonreblog_features.loc[[664073], ['post_id', 'post_tags', 'processed_blog_description_followee', 'processed_blog_description_follower']]

Unnamed: 0,post_id,post_tags,processed_blog_description_followee,processed_blog_description_follower
664073,179921063762,[],"My eyes are full of raindrops, I'm not crying, I'm taking care of the plants",Sarah. 19. she/hers. Trying to be friends with the local crows. I hoard queer media and pretty rocks.


# See how much of the ICWSM 2020 dataset has all 3 users with profile images

In [3]:
# Load which usernames have default images, etc

log_fpath = '/projects/websci2020_tumblr_identity/logs/scrape_info_2020-01-17T2037.txt'
with open(log_fpath) as f:
    lines = f.read().splitlines()
    
# Find separator line indices
default_idx = lines.index("Blog names with default images:")
other_idx = lines.index("Blog names with other images:")
# print(default_idx)
# print(other_idx)

default_blognames = lines[default_idx:other_idx]
other_blognames = lines[other_idx:]

print(len(default_blognames))
print(len(other_blognames))

9163
19863


In [1]:
# Load set of blog names

outpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/blog_names.txt'
with open(outpath, 'r') as f:
    blognames = f.read().splitlines()

In [4]:
error_blognames = set(blognames) - set(default_blognames) - set(other_blognames)
len(error_blognames)

5775

In [1]:
# Match blog names to tumblog IDs
import pandas as pd

# Load reblog info
reblog_info = pd.read_csv('/usr2/mamille2/tumblr/data/sample1k/reblogs_descs_annotated/reblogs_descs.tsv', sep='\t')
reblog_info.columns

# Load pickle
nonreblog_info = pd.read_pickle('/data/websci2020_tumblr_identity/tmp/nonreblog_info.pkl')
print(nonreblog_info.columns)
len(nonreblog_info)

# Select blog names from blog IDs
selected_columns = ['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower', 'blog_name_followee']
concatenated = pd.concat([reblog_info[selected_columns], nonreblog_info])

blog_ids_names = pd.DataFrame()
blog_ids_names['tumblog_id'] = concatenated['tumblog_id_follower'].tolist() + concatenated['tumblog_id_followee'].tolist()
blog_ids_names['blog_name'] = concatenated['blog_name_follower'].astype(str).tolist() + concatenated['blog_name_followee'].astype(str).tolist()
blog_ids_names.drop_duplicates(inplace=True)
len(blog_ids_names)

Index(['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower',
       'blog_name_followee'],
      dtype='object')


83276

In [3]:
blogid2name = blog_ids_names.set_index('tumblog_id')['blog_name'].to_dict()
blogname2id = {val: key for key, val in blogid2name.items()}
len(blogname2id)

82169

In [4]:
# Save out
import pickle

with open('/data/websci2020_tumblr_identity/tmp/blogname2id_yansen.pkl', 'wb') as f:
    pickle.dump(blogname2id, f)

In [29]:
# Load ICWSM 2020 dataset
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

712670
712670


In [30]:
len(reblog_features.drop_duplicates())

151491

In [12]:
# See how many rows have follower and both followees with profile images

from tqdm import tqdm_notebook as tqdm

n_blogs_profile_images = []

for i in tqdm(range(len(nonreblog_features))):
    
    follower_name = blogid2name[reblog_features.iloc[i]['tumblog_id_follower']]
    reblog_followee_name = blogid2name[reblog_features.iloc[i]['tumblog_id_followee']]
    nonreblog_followee_name = blogid2name[nonreblog_features.iloc[i]['tumblog_id_followee']]
    
    n_blogs = sum(1 for name in [follower_name, reblog_followee_name, nonreblog_followee_name] if name in other_blognames)
    n_blogs_profile_images.append(n_blogs)
    
print(len(n_blogs_profile_images))

HBox(children=(IntProgress(value=0, max=712670), HTML(value='')))


712670


In [13]:
for i in range(4):
    print(f"{i}: {n_blogs_profile_images.count(i)}")

0: 182156
1: 110906
2: 191184
3: 228424


In [15]:
len(n_blogs_profile_images)

712670

In [16]:
228424/712670

0.32051861310283863

## Save dataset with filled out profile images

In [17]:
reblog_features['blogname_follower'] = reblog_features['tumblog_id_follower'].map(lambda x: blogid2name[x])
reblog_features['blogname_followee'] = reblog_features['tumblog_id_followee'].map(lambda x: blogid2name[x])
reblog_features['n_blogs_profile_images'] = n_blogs_profile_images

nonreblog_features['blogname_follower'] = nonreblog_features['tumblog_id_follower'].map(lambda x: blogid2name[x])
nonreblog_features['blogname_followee'] = nonreblog_features['tumblog_id_followee'].map(lambda x: blogid2name[x])
nonreblog_features['n_blogs_profile_images'] = n_blogs_profile_images

In [18]:
# Load ranking labels

labels_fpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/feature_tables/ranking_labels.csv'
ranking_labels = pd.read_csv(labels_fpath)
ranking_labels

Unnamed: 0,ranking_label
0,1
1,0
2,0
3,1
4,1
5,0
6,1
7,0
8,0
9,0


In [20]:
ranking_labels = ranking_labels.iloc[:len(reblog_features)]
len(ranking_labels)

712670

In [21]:
ranking_labels['n_blogs_profile_images'] = n_blogs_profile_images
ranking_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ranking_label,n_blogs_profile_images
0,1,0
1,0,0
2,0,1
3,1,0
4,1,0
5,0,0
6,1,0
7,0,0
8,0,0
9,0,2


In [23]:
# Select profile image dataset
selected_reblog_features = reblog_features[reblog_features['n_blogs_profile_images']==3]
selected_nonreblog_features = nonreblog_features[nonreblog_features['n_blogs_profile_images']==3]
selected_ranking_labels = ranking_labels[ranking_labels['n_blogs_profile_images']==3]

In [24]:
# Save out

out_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k_profile_images/feature_tables/'

selected_reblog_features.to_csv(os.path.join(out_dirpath, 'reblog_features.csv'), index=False)
selected_nonreblog_features.to_csv(os.path.join(out_dirpath, 'nonreblog_features.csv'), index=False)
selected_ranking_labels.to_csv(os.path.join(out_dirpath, 'ranking_labels.csv'), index=False)

In [26]:
print(len(selected_reblog_features))
print(len(selected_nonreblog_features))
print(len(selected_ranking_labels))

228424
228424
228424


In [3]:
# Load
import pandas as pd
import os

dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k_profile_images/feature_tables/'

selected_reblog_features = pd.read_csv(os.path.join(dirpath, 'reblog_features.csv'))
selected_nonreblog_features = pd.read_csv(os.path.join(dirpath, 'nonreblog_features.csv'))

tumblog_ids = set(
    selected_reblog_features['tumblog_id_follower'].tolist() + selected_reblog_features['tumblog_id_followee'].tolist() + \
    selected_nonreblog_features['tumblog_id_follower'].tolist() + selected_nonreblog_features['tumblog_id_followee'].tolist()
)

len(tumblog_ids)

14177

In [4]:
selected_reblog_features.columns

Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'roleplay/fandoms_te

# Get usernames in ICWSM 2020 dataset (for profile image scraping)

In [1]:
import pandas as pd
import os

data_dirpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/'

# Load feature info
reblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv')
reblog_features = pd.read_csv(reblog_fpath)
print(len(reblog_features))

nonreblog_fpath = os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv')
nonreblog_features = pd.read_csv(nonreblog_fpath)
print(len(nonreblog_features))

# Get all tumblog_ids
tumblog_ids = set(
    reblog_features['tumblog_id_follower'].tolist() + reblog_features['tumblog_id_followee'].tolist() + \
    nonreblog_features['tumblog_id_follower'].tolist() + nonreblog_features['tumblog_id_followee'].tolist()
)

print(len(tumblog_ids))

712670
712670
34798


## Match tumblog IDs with names

In [2]:
# Load reblog info
reblog_info = pd.read_csv('/usr2/mamille2/tumblr/data/sample1k/reblogs_descs_annotated/reblogs_descs.tsv', sep='\t')
reblog_info.columns

# Load pickle
nonreblog_info = pd.read_pickle('/data/websci2020_tumblr_identity/tmp/nonreblog_info.pkl')
print(nonreblog_info.columns)
len(nonreblog_info)

Index(['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower',
       'blog_name_followee'],
      dtype='object')


30765997

In [3]:
# Select blog names from blog IDs
selected_columns = ['tumblog_id_follower', 'tumblog_id_followee', 'blog_name_follower', 'blog_name_followee']
concatenated = pd.concat([reblog_info[selected_columns], nonreblog_info])

blog_ids_names = pd.DataFrame()
blog_ids_names['tumblog_id'] = concatenated['tumblog_id_follower'].tolist() + concatenated['tumblog_id_followee'].tolist()
blog_ids_names['blog_name'] = concatenated['blog_name_follower'].astype(str).tolist() + concatenated['blog_name_followee'].astype(str).tolist()
blog_ids_names.drop_duplicates(inplace=True)
len(blog_ids_names)

# Check that intersect
print(len(tumblog_ids))
len(tumblog_ids.intersection(set(blog_ids_names['tumblog_id'].tolist())))

blogid2name = blog_ids_names.set_index('tumblog_id')['blog_name'].to_dict()
blogname2id = {val: key for key, val in blogid2name.items()}

# Check that intersect
print(len(tumblog_ids))
len(tumblog_ids.intersection(set(blogid2name.keys())))

# Lookup tumblog ids
blog_names = [blogid2name[tid] for tid in tumblog_ids]
len(blog_names)

34798
34798


34798

In [5]:
# Save blog names out

outpath = '/data/websci2020_tumblr_identity/icwsm2020_sample1k/blog_names.txt'
with open(outpath, 'w') as f:
    for name in blog_names:
        f.write(f'{name}\n')