# Split dataset based on community breakdown of instances

## Make a community match-only dataset for training

In [13]:
# Load communities: every line of the file holds the tumblog IDs of one community
import pandas as pd

commpath = '/data/tumblr_community_identity/dataset114k/louvain_communities.txt'
with open(commpath) as f:
    # Communities are numbered from 1 in file order
    comms = {
        comm_num: [int(token) for token in row.split()]
        for comm_num, row in enumerate(f.read().splitlines(), start=1)
    }
print(len(comms))

# Invert the mapping: tumblog ID -> community number
# (if an ID appears in several communities, the last one listed wins)
id2comm = {
    member_id: comm_num
    for comm_num, members in comms.items()
    for member_id in members
}

# Load reblog prediction dataset
data_fpath = '/data/tumblr_community_identity/dataset114k/matched_reblogs_nonreblogs_dataset114k.csv'
data = pd.read_csv(data_fpath)

# Add one community column per user role in the reblog data;
# IDs absent from id2comm get whatever dict.get returns for a miss (None)
community_roles = ['follower_reblog', 'followee_reblog', 'followee_nonreblog']
for role in community_roles:
    source_col = f'tumblog_id_{role}'
    data[f'community_{role}'] = data[source_col].map(id2comm.get)
print(data.shape)

# Filter dataset into complementary subsets using DataFrame.query expressions
split_defs = [  # (name, description, query)
    (
        'all3_match',
        'All 3 match',
        'community_follower_reblog == community_followee_reblog == community_followee_nonreblog',
    ),
    (
        'not_all3_match',
        "At least one doesn't match",
        'community_follower_reblog != community_followee_reblog or '
        'community_follower_reblog != community_followee_nonreblog or '
        'community_followee_reblog != community_followee_nonreblog',
    ),
]
splits = {name: data.query(query) for name, _desc, query in split_defs}

splits['all3_match'].shape

82


  interactivity=interactivity, compiler=compiler, result=result)


(110922, 79)


(77547, 79)

In [14]:
# Shape (rows, columns) of the split where at least one of the three communities differs
splits['not_all3_match'].shape

(33375, 79)

In [15]:
# Save out the split in which at least one community doesn't match
# (writing of the all-match split is currently disabled)
# splits['all3_match'].to_csv('/data/tumblr_community_identity/dataset114k/matched_reblogs_variants/matched_reblogs_communities_match.csv')
no_match_path = '/data/tumblr_community_identity/dataset114k/matched_reblogs_variants/matched_reblogs_communities_no_match.csv'
splits['not_all3_match'].to_csv(no_match_path)

## Performance on existing dev set

In [1]:
# Load dev set instances
from sklearn.model_selection import train_test_split
import pandas as pd

# Load reblog prediction dataset
data_fpath = '/data/tumblr_community_identity/dataset114k/matched_reblogs_nonreblogs_dataset114k.csv'
data = pd.read_csv(data_fpath)

# Load communities: every line of the file holds the tumblog IDs of one community,
# and communities are numbered from 1 in file order
commpath = '/data/tumblr_community_identity/dataset114k/louvain_communities.txt'
with open(commpath) as f:
    comms = {
        comm_num: [int(token) for token in row.split()]
        for comm_num, row in enumerate(f.read().splitlines(), start=1)
    }
print(len(comms))

# Invert the mapping: tumblog ID -> community number
# (if an ID appears in several communities, the last one listed wins)
id2comm = {
    member_id: comm_num
    for comm_num, members in comms.items()
    for member_id in members
}

# Add one community column per user role in the reblog data
community_roles = ['follower_reblog', 'followee_reblog', 'followee_nonreblog']
for role in community_roles:
    source_col = f'tumblog_id_{role}'
    data[f'community_{role}'] = data[source_col].map(id2comm.get)
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


82


(110922, 79)

In [2]:
# Load gold labels
import numpy as np

def add_random_labels(data):
    """ Add random 0 and 1 labels for ordering reblog/nonreblogs
        for learning-to-rank organization.

        The labels are as balanced as possible: half of the rows (rounded
        down) get 0 and the remaining rows get 1, so a frame with an odd
        number of rows receives one extra 1 instead of raising a
        length-mismatch error. The assignment is deterministic because the
        global NumPy RNG is re-seeded with 9 on every call.

        Args:
            data: pandas DataFrame; it is modified in place by adding (or
                overwriting) a 'label' column.

        Returns:
            The same DataFrame object, now carrying the 'label' column.
    """
    n_rows = len(data)
    n_zeros = n_rows // 2
    n_ones = n_rows - n_zeros  # absorbs the leftover row when n_rows is odd
    # NOTE: seeding resets NumPy's *global* random state as a side effect;
    # kept so that labels stay identical to earlier runs.
    np.random.seed(9)
    labels = [0]*n_zeros + [1]*n_ones
    np.random.shuffle(labels)
    data['label'] = labels
    return data

# Attach the label column (the frame is also modified in place), then display it
data = add_random_labels(data)
data['label']

0         0
1         1
2         0
3         1
4         1
         ..
110917    0
110918    1
110919    1
110920    1
110921    1
Name: label, Length: 110922, dtype: int64

In [3]:
# Split into train, dev and test sets: hold out 20% as test, then carve a
# dev set of the same size as test out of the remainder
train_and_dev, test = train_test_split(data, test_size=0.2, random_state=9)
train, dev = train_test_split(train_and_dev, test_size=len(test), random_state=9)
for subset in (train, dev, test):
    print(subset.shape)

(66552, 80)
(22185, 80)
(22185, 80)


In [4]:
# Load predictions
# NOTE(review): presumably one prediction per dev instance, in the same row
# order as `dev` (they are attached to `dev` positionally later) — confirm
# against the run that produced this file.
preds_path = '/projects/tumblr_community_identity/output/post+text_unigrams_lr/dev_preds.txt'
preds = np.loadtxt(preds_path)
preds.shape

(22185,)

In [6]:
# Compare predictions with labels
# `preds` is attached positionally, so it must have exactly one entry per dev row.
if len(preds) != len(dev):
    raise ValueError(
        f'Got {len(preds)} predictions for {len(dev)} dev instances')
# Build a new frame rather than assigning columns onto the train_test_split
# output in place: this avoids pandas' SettingWithCopyWarning and makes the
# cell safe to re-run.
dev = dev.assign(
    pred=preds,
    correct=lambda frame: frame['pred'] == frame['label'],
)

# Get dev set community makeup
# Split when communities do and don't match
# NOTE(review): the 'nonreblogger_match' description reads oddly; its query
# selects rows where the follower's community equals the non-reblogged
# followee's community but not the reblogged followee's. Text left unchanged
# so that printed output stays comparable with earlier runs.
split_defs = [  # (name, description, query)
    (
        'all3_match',
        'All 3 match',
        'community_follower_reblog == community_followee_reblog == community_followee_nonreblog',
    ),
    (
        'not_all3_match',
        "At least one doesn't match",
        'community_follower_reblog != community_followee_reblog or '
        'community_follower_reblog != community_followee_nonreblog or '
        'community_followee_reblog != community_followee_nonreblog',
    ),
    (
        'follower_matches_someone',
        "Followees don't match, but follower matches one",
        'community_followee_reblog != community_followee_nonreblog and '
        '(community_follower_reblog == community_followee_reblog or '
        'community_follower_reblog == community_followee_nonreblog)',
    ),
    (
        'reblogger_match',
        'Reblog followee matches only follower',
        'community_follower_reblog == community_followee_reblog '
        'and community_follower_reblog != community_followee_nonreblog',
    ),
    (
        'nonreblogger_match',
        'Reblog followee matches only non-follower',
        'community_follower_reblog != community_followee_reblog '
        'and community_follower_reblog == community_followee_nonreblog',
    ),
]
splits = {}

n_dev = len(dev)
for name, desc, query in split_defs:
    splits[name] = dev.query(query)
    n_split = len(splits[name])
    # Report each split's size and its share of the dev set
    print(f'{desc}: {n_split}, {n_split/n_dev: .1%} ({n_split}/{n_dev})')

All 3 match: 15467,  69.7% (15467/22185)
At least one doesn't match: 6718,  30.3% (6718/22185)
Followees don't match, but follower matches one: 5160,  23.3% (5160/22185)
Reblog followee matches only follower: 3022,  13.6% (3022/22185)
Reblog followee matches only non-follower: 2138,  9.6% (2138/22185)


In [7]:
# Breakdown in correct proportion: share of correct predictions within each split
for name, desc, _ in split_defs:
    correct_flags = splits[name]['correct']
    n_correct, n_total = correct_flags.sum(), len(splits[name])
    print(f'{desc} correct: {n_correct/n_total: .1%} ({n_correct}/{n_total})')

All 3 match correct:  59.5% (9196/15467)
At least one doesn't match correct:  60.9% (4093/6718)
Followees don't match, but follower matches one correct:  60.8% (3139/5160)
Reblog followee matches only follower correct:  60.6% (1830/3022)
Reblog followee matches only non-follower correct:  61.2% (1309/2138)


In [9]:
# Test for significance between follower matches reblogger's community and doesn't
# Compares the per-instance `correct` flags of the 'reblogger_match' split
# with those of the 'nonreblogger_match' split using scipy's ttest_ind.
# NOTE(review): `correct` is a boolean column, so this amounts to a t-test on
# proportions; a two-proportion z-test or chi-squared test may be a better
# fit — confirm.
from scipy.stats import ttest_ind
ttest_ind(splits['reblogger_match']['correct'], splits['nonreblogger_match']['correct'])

Ttest_indResult(statistic=-0.4852741123775066, pvalue=0.6275025262877671)

In [8]:
# Test for significance between all 3 match and those that don't
from scipy.stats import ttest_ind
ttest_ind(splits['all3_match']['correct'], splits['not_all3_match']['correct'])

Ttest_indResult(statistic=-2.053164524748214, pvalue=0.04006832435079456)