# Read in tables

In [2]:
import csv
import os
import codecs

data_dirpath = '/usr2/mamille2/tumblr/data/sample1k'

feature_tables_dir = os.path.join(data_dirpath, 'feature_tables')
filenames = ['reblog_features.csv', 'nonreblog_features.csv', 'ranking_labels.csv']
joined_filenames = [os.path.join(feature_tables_dir, filename) for filename in filenames]
# csv_readers = [csv.DictReader(codecs.open(filename, 'rU', 'utf-16')) for filename in joined_filenames]
csv_readers = [csv.DictReader(x.replace('\0', '') for x in open(filename, 'r')) for filename in joined_filenames]

In [4]:
instances = []
instance_labels = []
for row in zip(*csv_readers):
    reblog_features = row[0]
    nonreblog_features = row[1]
    label = int(row[2]['ranking_label'])
    instance = (reblog_features, nonreblog_features) # reblog always first, nonreblog always second
    instances.append(instance)
    instance_labels.append(label)
    
print(len(instances), len(instance_labels))

712670 712670


# Feature Extraction

### Create tag vocabulary

In [5]:
from collections import defaultdict
from IPython.core.debugger import set_trace

def _str2list(in_str):
    return [el[1:-1] for el in in_str[1:-1].split(', ')]

def update_tag_counts(tag_counts, counted_ids, candidate): # for hashtags
#     candidate_tags = [tag.lower() for tag in eval(candidate['post_tags'])] # uses tokens provided in feature tables
    candidate_tags = [tag.lower() for tag in _str2list(candidate['post_tags'])] # uses tokens provided in feature tables
    followee_id = candidate['tumblog_id_followee']    
    for tag in candidate_tags:
        if not followee_id in counted_ids[tag]: # only counts the tag if user hasn't already used the tag
            tag_counts[tag] += 1
            counted_ids[tag].add(followee_id)
        
counted_ids = defaultdict(lambda: set()) # for each tag, a set of followees who used those tags
tag_counts = defaultdict(int) # count of unique followees who used each tag
for reblog_candidate, nonreblog_candidate in instances:
    update_tag_counts(tag_counts, counted_ids, reblog_candidate)
    update_tag_counts(tag_counts, counted_ids, nonreblog_candidate)

tag_counts_filtered = {k:v for k,v in tag_counts.items() if v > 1} # at least 2 users used the tag
tag_vocab = tag_counts_filtered.keys()
print(len(tag_vocab))

14318


In [6]:
identity_categories = ['age', 'ethnicity/nationality', 'fandoms', 'gender',
                       'interests', 'location', 'personality type', 'pronouns', 'relationship status', 'roleplay',
                       'sexual orientation', 'weight', 'zodiac']
len(identity_categories)

13

### Count category label instances

In [7]:
category_label_counts = defaultdict(lambda: defaultdict(int)) # {category: {value: count_of_unique_users}}
# counted_ids = set()
for category in identity_categories:
    counted_ids = set() # for each category, ids already considered
    for reblog_candidate, nonreblog_candidate in instances:
        category_followee = category + '_terms_followee'
        followee_id = reblog_candidate['tumblog_id_followee']
        if not followee_id in counted_ids: # only counts labels from first instance seen of a followee, since constant
            category_value = [x.lower() for x in eval(reblog_candidate[category_followee])]
            for value in category_value:
                category_label_counts[category][value] += 1
            counted_ids.add(followee_id)
            
        followee_id = nonreblog_candidate['tumblog_id_followee']
        if not followee_id in counted_ids:
            category_value = [x.lower() for x in eval(nonreblog_candidate[category_followee])]
            for value in category_value:
                category_label_counts[category][value] += 1
            counted_ids.add(followee_id)
        
        category_follower = category + '_terms_follower'
        follower_id = reblog_candidate['tumblog_id_follower']
        if not follower_id in counted_ids:
            category_value = [x.lower() for x in eval(reblog_candidate[category_follower])]
            for value in category_value:
                category_label_counts[category][value] += 1
            counted_ids.add(follower_id)

### Create category label vocabulary

In [8]:
category_vocabs = defaultdict(lambda: set())
for identity_category in category_label_counts:
    category_labels_filtered_vocab = set([k for k,v in category_label_counts[identity_category].items() if v > 1]) # min 2 users using label
    category_vocabs[identity_category] = category_labels_filtered_vocab
    print(identity_category, len(category_vocabs[identity_category]))
    print(category_vocabs[identity_category])
    print('-----------------')
    print()
    
    
    """
    sorted_category_labels = sorted(category_labels_filtered[category].items(), key=lambda x: x[1], reverse=True)    
    print(category)
    print('-----------------')
    for i in range(1, 21):
        if i == len(sorted_category_labels):
            break
        print(sorted_category_labels[i][0], sorted_category_labels[i][1])
    print('-----------------')
    """

age 65
{'12', '22', '34', '56', '47', 'forty', '13', 'twelve', '31', '29', '44', '50', '39', 'age', '55', '38', '17', '45', 'seventeen', '43', '11', 'fifty', '57', 'nineteen', '58', 'xxiv', '28', '32', '23', '25', 'y/o', 'twenty', '46', 'xix', '42', '33', '53', '37', '36', '26', '16', '48', 'fifteen', '14', '41', 'eighteen', '54', 'xxix', '10', '40', '20', '51', '24', 'fourteen', '27', '21', '49', '30', '35', 'sixteen', '15', '19', '18', '59', '52'}
-----------------

ethnicity/nationality 81
{'chilean', 'finnish', 'russian', 'mexicana', 'coeur', 'singaporean', 'thais', 'dutch', 'americans', 'austrian', 'english', 'spanish', 'armenian', 'thai', 'asian', 'jamaican', 'black', 'swedish', 'chilena', 'portuguese', 'moroccan', 'tigre', 'swiss', 'romanian', 'indian', 'canadian', 'german', 'puerto', 'indonesian', 'malay', 'greek', 'turks', 'filipino', 'serbian', 'chinese', 'cuban', 'belgian', 'southern', 'vietnamese', 'chileno', 'dakota', 'danish', 'colombian', 'american', 'malaysian', 'south 

### Post baseline

In [7]:
def extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label):
    features = defaultdict(float) # {feat: count} for each instance
    # Comparison space features
    def _extract_features_post_baseline_candidate(candidate, incr):
        candidate_tags = [tag.lower() for tag in eval(candidate['post_tags'])]
        for tag in candidate_tags:
            if tag.lower() in tag_vocab:
                feat_tag = ('tag=%s' % tag.lower())
                features[feat_tag] += incr

        post_type = candidate['post_type']
        feat_tag = ('post_type=%s' % post_type)
        features[feat_tag] += incr
        
        try:
            post_note_count = float(candidate['post_note_count'])
        except ValueError as e:
            post_note_count = 0.0
            
        features['post_note_count'] += incr * post_note_count
        
    # if randomly-generated label is 1, second candidate is reblog, so flip: -1 is whatever candidate should consider first
    if label == 1: 
        _extract_features_post_baseline_candidate(nonreblog_candidate, incr=-1)
        _extract_features_post_baseline_candidate(reblog_candidate, incr=1)
    else:
        _extract_features_post_baseline_candidate(reblog_candidate, incr=-1)
        _extract_features_post_baseline_candidate(nonreblog_candidate, incr=1)

    return features

### Experiment 1 - Identity framing, presence of variables

In [76]:
def extract_features_experiment_1(reblog_candidate, nonreblog_candidate, label, categories):
    # Baseline features
#     features = defaultdict(float)
    features = extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label)
    
    # Follower features
#     for identity_category in identity_categories:
#         identity_category_follower = eval(reblog_candidate[identity_category + '_terms_follower'])
#         follower_presence = len(identity_category_follower) > 0
#         if follower_presence:
#             feat_tag = ('follower_cat=%s' % identity_category)
#             features[feat_tag] += 1
            
    # Follower-followee comparison space features
    def _extract_features_experiment_1_candidate(candidate, incr):
        
        num_matches = 0
        num_mismatched_follower_presents = 0
        num_mismatched_followee_presents = 0
        
#         for identity_category in identity_categories:
        for identity_category in categories:
            identity_category_follower = eval(reblog_candidate[identity_category + '_terms_follower'])
            follower_presence = len(identity_category_follower) > 0
            identity_category_followee = eval(candidate[identity_category + '_terms_followee'])
            followee_presence = len(identity_category_followee) > 0
#             if followee_presence:
#                 feat_tag = ('followee_cat=%s' % identity_category)
#                 features[feat_tag] += incr

            # Alignment features
#             if ((follower_presence and followee_presence) or
#                 (not follower_presence and not followee_presence)):
            # AND
            if (follower_presence and followee_presence): # AND
                feat_tag = ('aligned_cat=%s' % identity_category)
                features[feat_tag] += incr
                num_matches += 1
                
            # XOR
            if (follower_presence and not followee_presence): # XOR
                feat_tag = ('mismatched_follower_presents_cat=%s' % identity_category)
                features[feat_tag] += incr
                feat_tag = ('xor_cat=%s' % identity_category)
                features[feat_tag] += incr
                num_mismatched_follower_presents += 1
            elif (not follower_presence and followee_presence): # XOR
                feat_tag = ('mismatched_followee_presents_cat=%s' % identity_category)
                features[feat_tag] += incr
                feat_tag = ('xor_cat=%s' % identity_category)
                features[feat_tag] += incr
                num_mismatched_followee_presents += 1
                
        # Number of matches
        features['num_matches'] += num_matches * incr
        features['num_mismatched_follower_presents'] += num_mismatched_follower_presents * incr
        features['num_mismatched_followee_presents'] += num_mismatched_followee_presents * incr
            
    if label == 1:
        _extract_features_experiment_1_candidate(nonreblog_candidate, incr=-1)
        _extract_features_experiment_1_candidate(reblog_candidate, incr=1)
    else:
        _extract_features_experiment_1_candidate(reblog_candidate, incr=-1)
        _extract_features_experiment_1_candidate(nonreblog_candidate, incr=1)

    return features

### Experiment 2 - Compatibility

In [15]:
def extract_features_experiment_2(reblog_candidate, nonreblog_candidate, label):
    # Baseline features
#     features = defaultdict(float)
#     features = extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label)
    features = extract_features_experiment_1(reblog_candidate, nonreblog_candidate, label)

    
    # Follower features
    for identity_category in identity_categories:
        identity_category_follower = [x.lower() for x in eval(reblog_candidate[identity_category + '_terms_follower'])]
        for identity_label in identity_category_follower:
            if identity_label in category_vocabs[identity_category]:
                feat_tag = ('cat=%s,follower_lab=%s' % (identity_category, identity_label))
                features[feat_tag] += 1
            
    # Comparison space features
    def _extract_features_experiment_2_candidate(candidate, incr):
        for identity_category in identity_categories:
            identity_category_follower = [x.lower() for x in eval(reblog_candidate[identity_category + '_terms_follower'])]
            identity_category_followee = [x.lower() for x in eval(reblog_candidate[identity_category + '_terms_followee'])]
            for identity_label_followee in identity_category_followee:
                if identity_label_followee in category_vocabs[identity_category]:
                    feat_tag = ('cat=%s,followee_lab=%s' % (identity_category, identity_label_followee))
                    features[feat_tag] += incr
                    
                    # Compatibility features: explicit marking of follower and followee labels together
                    for identity_label_follower in identity_category_follower:
                        if identity_label_follower in category_vocabs[identity_category]:
                            feat_tag = ('cat=%s,follower_lab=%s,followee_lab=%s' % (identity_category,
                                                                                    identity_label_follower,
                                                                                    identity_label_followee))
                            features[feat_tag] += incr
            
                
    if label == 1:
        _extract_features_experiment_2_candidate(nonreblog_candidate, incr=-1)
        _extract_features_experiment_2_candidate(reblog_candidate, incr=1)
    else:
        _extract_features_experiment_2_candidate(reblog_candidate, incr=-1)
        _extract_features_experiment_2_candidate(nonreblog_candidate, incr=1)

    return features

# Run models

In [9]:
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import svm
import numpy as np

### Post baseline

In [17]:
X = []
y = []
for (reblog_candidate, nonreblog_candidate), label in zip(instances, instance_labels):
    X.append(extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label))
    y.append(label)
    
post_features_vectorizer = feature_extraction.DictVectorizer()
post_features_scaler = preprocessing.StandardScaler(with_mean=False) # normalization standard scaler
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=12345)
X_train = post_features_vectorizer.fit_transform(X_train)
X_train = post_features_scaler.fit_transform(X_train)
X_test = post_features_vectorizer.transform(X_test)
X_test = post_features_scaler.transform(X_test)

baseline_model = linear_model.LogisticRegressionCV(cv=10, n_jobs=10, max_iter=1000, verbose=2).fit(X_train, y_train) # default 5 folds
print(baseline_model.score(X_test, y_test))
baseline_pred = baseline_model.predict(X_test)

# Save predictions
np.savetxt(os.path.join(data_dirpath, 'results', 'baseline.txt'), baseline_pred)

# Save classifier (with weights)
with open(os.path.join(data_dirpath, 'models', 'lr_baseline.pkl'), 'wb') as f:
    pickle.dump(baseline_model, f)

[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:  5.5min remaining: 12.9min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  6.4min finished


0.626980229279


### Experiment 1 - Identity framing, presence of variables

In [113]:
import pickle

category = 'zodiac'

X = []
y = []
for (reblog_candidate, nonreblog_candidate), label in zip(instances, instance_labels):
    X.append(extract_features_experiment_1(reblog_candidate, nonreblog_candidate, label, [category]))
    y.append(label)
    
features_vectorizer_experiment_1 = feature_extraction.DictVectorizer()
features_scaler_experiment_1 = preprocessing.StandardScaler(with_mean=False)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=12345)
X_train = features_vectorizer_experiment_1.fit_transform(X_train)
X_train = features_scaler_experiment_1.fit_transform(X_train)
X_test = features_vectorizer_experiment_1.transform(X_test)
X_test = features_scaler_experiment_1.transform(X_test)

experiment_1_model = linear_model.LogisticRegressionCV(cv=10, n_jobs=10, max_iter=1000, verbose=2).fit(X_train, y_train)
print(experiment_1_model.score(X_test, y_test))
experiment_1_pred = experiment_1_model.predict(X_test)

# Save predictions
np.savetxt(os.path.join(data_dirpath, 'results', f'baseline_exp1_{category.replace("/", "_").replace(" ", "_")}.txt'), experiment_1_pred)

# Save classifier (with weights)
with open(os.path.join(data_dirpath, 'models', f'lr_baseline_exp1_{category.replace("/", "_").replace(" ", "_")}.pkl'), 'wb') as f:
    pickle.dump(experiment_1_model, f)

[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:  4.4min remaining: 10.3min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  9.2min finished


0.628411466738


### Experiment 2 - Compatibility

In [18]:
X = []
y = []
for (reblog_candidate, nonreblog_candidate), label in zip(instances, instance_labels):
    X.append(extract_features_experiment_2(reblog_candidate, nonreblog_candidate, label))
    y.append(label)
    
features_vectorizer_experiment_2 = feature_extraction.DictVectorizer()
features_scaler_experiment_2 = preprocessing.StandardScaler(with_mean=False)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=12345)
X_train = features_vectorizer_experiment_2.fit_transform(X_train)
X_train = features_scaler_experiment_2.fit_transform(X_train)
X_test = features_vectorizer_experiment_2.transform(X_test)
X_test = features_scaler_experiment_2.transform(X_test)

experiment_2_model = linear_model.LogisticRegressionCV(cv=10, max_iter=1000, n_jobs=5, verbose=2).fit(X_train, y_train)
print(experiment_2_model.score(X_test, y_test))
experiment_2_pred = experiment_2_model.predict(X_test)

# Save predictions
np.savetxt(os.path.join(data_dirpath, 'results', 'baseline_exp1_exp2.txt'), experiment_2_pred)

[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed: 10.8min remaining:  4.6min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 11.5min finished


0.641678476714


# McNemar's Test (Significance)

In [114]:
a = 0
b = 0 # Baseline correct, experiment incorrect
c = 0 # Baseline incorrect, experiment correct
d = 0
for b_pred, ex_pred, true in zip(baseline_pred, experiment_1_pred, y_test):
    if b_pred == true and ex_pred == true:
        a += 1
    elif b_pred == true and ex_pred != true:
        b += 1
    elif b_pred != true and ex_pred == true:
        c += 1
    else:
        d += 1
        
table = [[a, b],
         [c, d]]
print(table)

[[88419, 947], [1151, 52017]]


In [115]:
# Example of calculating the mcnemar test
from statsmodels.stats.contingency_tables import mcnemar
# calculate mcnemar test
result = mcnemar(table, exact=False, correction=False)
# summarize the finding
print('statistic=%.3f, p-value=%.6f' % (result.statistic, result.pvalue))
# interpret the p-value
alpha = 0.05
if result.pvalue > alpha:
	print('Same proportions of errors (fail to reject H0)')
else:
	print('Different proportions of errors (reject H0)')

statistic=19.836, p-value=0.000008
Different proportions of errors (reject H0)


# Error analysis

## Informative features

In [2]:
import pickle

# Load model
cat = 'pronouns'
model_path = os.path.join(data_dirpath, 'models', 'lr_baseline+exp1_pronouns_filtered.pkl')
with open(model_path, 'rb') as f:
    model = pickle.load(f)

In [4]:
# Load feature vectorizer
path = os.path.join(data_dirpath, 'results', 'lr_baseline_pronouns_feature_vec.pkl')

with open(path, 'rb') as f:
    features_vectorizer_experiment_1 = pickle.load(f)

In [16]:
from operator import itemgetter
import numpy as np

def print_informative_features(feature_vectorizer, model, n=10000):
    feats_index2name = {v: k for k, v in features_vectorizer_experiment_1.vocabulary_.items()}
    feature_weights = model.coef_[0]
    
    top_indices = np.argsort(feature_weights)[-1*n:]
    top_weights = np.sort(feature_weights)[-1*n:]
    bottom_indices = np.argsort(feature_weights)[:n]
    bottom_weights = np.sort(feature_weights)[:n]
#     tb_weights = sorted(np.hstack([top_weights, bottom_weights], key=lambda row: np.abs(row)))
    
#     bottom_feats = set(feats_index2name[j] for j in bottom_indices)
#     top_feats = set(feats_index2name[j] for j in bottom_indices)

    lines = [] # to sort and print
    
    for i, (j, w) in enumerate(zip(reversed(top_indices), reversed(top_weights))):
        feature_name = feats_index2name[j]
        if not feature_name.startswith('tag'):
            lines.append([i, feature_name, w, abs(w)])
#             print(f"{i}\t{feature_name}\t{w: .3f}")
            
    for i, (j, w) in enumerate(zip(bottom_indices, bottom_weights)):
        feature_name = feats_index2name[j]
        if not feature_name.startswith('tag'):
            lines.append([i, feature_name, w, abs(w)])
#             print(f"{i}\t{feature_name}\t{w: .3f}")
#         print(f"{i}\t{feature_name}\t{w: .3f}")

    for l in list(reversed(sorted(lines, key=itemgetter(3)))):
        print(l)

In [17]:
print_informative_features(features_vectorizer_experiment_1, model)

[0, 'post_type=answer', -0.9529166535554745, 0.9529166535554745]
[26, 'post_type=video', 0.15112336801370957, 0.15112336801370957]
[42, 'post_type=photo', 0.14049289929989009, 0.14049289929989009]
[491, 'post_type=link', -0.063957875935738193, 0.063957875935738193]
[330, 'post_type=text', 0.060734708732589279, 0.060734708732589279]
[424, 'post_type=quote', 0.052820641574165633, 0.052820641574165633]
[655, 'post_type=chat', 0.039829447750114044, 0.039829447750114044]
[1019, 'post_note_count', 0.028999594056302563, 0.028999594056302563]
[4051, 'mismatched_follower_presents_cat=pronouns', -0.020885706207246293, 0.020885706207246293]
[4050, 'num_mismatched_follower_presents', -0.020885706207246293, 0.020885706207246293]
[1486, 'aligned_cat=pronouns', 0.020885706207246293, 0.020885706207246293]
[1485, 'num_matches', 0.020885706207246293, 0.020885706207246293]
[7236, 'post_type=audio', -0.0093910114863629598, 0.0093910114863629598]
[7052, 'post_type=audio', -0.0093910114863629598, 0.00939101

In [43]:
def print_top_features(feature_vectorizer, model, n=100):
    feats_index2name = {v: k for k, v in features_vectorizer_experiment_1.vocabulary_.items()}
    feature_weights = experiment_1_model.coef_[0]
    
    top_indices = np.argsort(feature_weights)[-1*n:]
    top_weights = np.sort(feature_weights)[-1*n:]
    
    for i, (j, w) in enumerate(zip(reversed(top_indices), reversed(top_weights))):
        feature_name = feats_index2name[j]
        if not feature_name.startswith('tag'):
            print(f"{i}\t{feature_name}\t{w: .3f}")

In [70]:
from IPython.core.debugger import set_trace

def print_bottom_features(feature_vectorizer, model, n=100):
    feats_index2name = {v: k for k, v in features_vectorizer_experiment_1.vocabulary_.items()}
    feature_weights = experiment_1_model.coef_[0]
    
    bottom_indices = np.argsort(feature_weights)[:n]
    bottom_weights = np.sort(feature_weights)[:n]
#     set_trace()
    
    for i, (j, w) in enumerate(zip(bottom_indices, bottom_weights)):
        feature_name = feats_index2name[j]
        if not feature_name.startswith('tag'):
            print(f"{i}\t{feature_name}\t{w: .3f}")
#         print(f"{i}\t{feature_name}\t{w: .3f}")

In [69]:
print_top_features(features_vectorizer_experiment_1, experiment_1_model, n=1000)

6	post_type=video	 0.143
11	post_type=photo	 0.125
166	post_type=text	 0.057
182	post_type=quote	 0.054
213	mismatched_followee_presents_cat=location	 0.052
335	post_type=chat	 0.043
381	aligned_cat=pronouns	 0.041
415	xor_cat=location	 0.039
629	post_note_count	 0.031
792	mismatched_followee_presents_cat=weight	 0.027
824	xor_cat=weight	 0.026


In [72]:
print_bottom_features(features_vectorizer_experiment_1, experiment_1_model, n=1000)

0	post_type=answer	-0.884
144	mismatched_followee_presents_cat=interests	-0.073
192	xor_cat=interests	-0.065
226	mismatched_followee_presents_cat=age	-0.062
245	post_type=link	-0.060
347	xor_cat=age	-0.052
664	mismatched_follower_presents_cat=pronouns	-0.041


## Predictions for when categories match, don't match on presence

For each category, what is accuracy when follower and followee align for either reblog or nonreblog?

In [9]:
# identity_categories = ['age']
identity_categories = ['age', 'ethnicity/nationality', 'fandoms', 'gender', 'interests', 'location', 'personality type',
                        'pronouns', 'relationship status', 'roleplay', 'sexual orientation', 'zodiac']
len(identity_categories)

12

In [7]:
import pickle

# Load themes
with open(os.path.join(data_dirpath, 'themes.pkl'), 'rb') as f:
    themes = pickle.load(f)

print(len(themes))

12


In [10]:
# Load sample1k training set

import pandas as pd
from sklearn import model_selection
import os
data_dirpath = '/usr2/mamille2/tumblr/data/sample1k'

reblog_feats = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'reblog_features.csv'))
nonreblog_feats = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'nonreblog_features.csv'))
print(len(reblog_feats))
print(len(nonreblog_feats))
print(reblog_feats.columns)

# Load instance labels
labels_df = pd.read_csv(os.path.join(data_dirpath, 'feature_tables', 'ranking_labels.csv'))
labels = labels_df['ranking_label'].values.tolist()[:len(reblog_feats)]

train = {}

train['reblogs'], _, _, _ = model_selection.train_test_split(reblog_feats, labels, test_size=0.1, random_state=12345)
train['nonreblogs'], _, y_train, _ = model_selection.train_test_split(nonreblog_feats, labels, test_size=0.1, random_state=12345)
print(train['reblogs'].shape)
print(train['nonreblogs'].shape)

712670
712670
Index(['post_id', 'tumblog_id_follower', 'tumblog_id_followee', 'post_tags',
       'post_type', 'post_note_count', 'processed_blog_description_follower',
       'processed_blog_description_followee', 'age_terms_follower',
       'age_terms_followee', 'ethnicity/nationality_terms_follower',
       'ethnicity/nationality_terms_followee', 'fandoms_terms_follower',
       'fandoms_terms_followee', 'gender_terms_follower',
       'gender_terms_followee', 'gender/sexuality_terms_follower',
       'gender/sexuality_terms_followee', 'interests_terms_follower',
       'interests_terms_followee', 'location_terms_follower',
       'location_terms_followee', 'personality type_terms_follower',
       'personality type_terms_followee', 'pronouns_terms_follower',
       'pronouns_terms_followee', 'relationship status_terms_follower',
       'relationship status_terms_followee', 'roleplay_terms_follower',
       'roleplay_terms_followee', 'roleplay/fandoms_terms_follower',
       'rolep

In [11]:
import numpy as np
from tqdm import tqdm_notebook as tqdm
import itertools
from IPython.core.debugger import set_trace

overlaps = {c: {'reblogs': [], 'nonreblogs': []} for c in identity_categories}

presence_matches = {'reblogs': {c: list() for c in identity_categories},
                    'nonreblogs': {c: list() for c in identity_categories}} # cat: set((i, follower_terms, followee_terms), ...)

# Load training set predictions, build presence matches, etc
for category in tqdm(identity_categories):
    model_name = f'lr_baseline+exp1_{category.replace("/", "_").replace(" ", "_")}'
    pred_fpath = os.path.join(data_dirpath, 'output', 'predictions', f'{model_name}_train_preds.txt')
    
    preds = np.loadtxt(pred_fpath)

    for r in ['reblogs', 'nonreblogs']:
        for follower_id, followee_id, follower_terms, followee_terms, pred, actual in zip(
                    train[r][f'tumblog_id_follower'], 
                    train[r][f'tumblog_id_followee'], 
                    train[r][f'{category}_terms_follower'], 
                    train[r][f'{category}_terms_followee'],
                    preds,
                    y_train
                ):
            
            if pred == actual:
                pred_str = 'correct_prediction'
            else:
                pred_str = 'incorrect_prediction'
            
            follower_terms = eval(follower_terms)
            followee_terms = eval(followee_terms)
            if len(follower_terms) > 0 and len(followee_terms) > 0:
                follower_themes = []
                followee_themes = []
                
                for f in follower_terms:
                    follower_themes += themes[category][f.lower()]
                for f in followee_terms:
                    followee_themes += themes[category][f.lower()]
                
                # Specific values of themes (including multiples)
#                 presence_matches[r][category].append(
#                     (follower_id, followee_id, 
# #                     set([f.lower() for f in follower_terms]), 
# #                     set([f.lower() for f in followee_terms]),
#                     set(follower_themes), 
#                     set(followee_themes),
#                     pred_str
#                     ))
                
                # No multiples
                for follower_theme, followee_theme in itertools.product(follower_themes, followee_themes):
                    presence_matches[r][category].append(
                        (follower_id, followee_id, 
                        follower_theme, 
                        followee_theme,
                        pred_str
                        ))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr0/home/mamille2/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr0/home/mamille2/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr0/home/mamille2/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [34]:
# Measure overlap
for category in identity_categories:
    for r in ['reblogs', 'nonreblogs']:
        for instance in presence_matches[r][category]:
            overlaps[category][r].append(len(instance[-3].intersection(instance[-2]))/len(instance[-3].union(instance[-2])))
        
for category in overlaps:
    for r in ['reblogs', 'nonreblogs']:
        print(f'{category} {r}: {np.mean(overlaps[category][r])}')
    print()

AttributeError: 'str' object has no attribute 'intersection'

What is the accuracy for each category label pairing for either reblog or nonreblog?

In [12]:
from collections import Counter, defaultdict
from operator import itemgetter

# overlap_counter = {c: {'reblogs': Counter(), 'nonreblogs': Counter()} for c in identity_categories}
overlap_counter = {c: Counter() for c in identity_categories}
correct_preds = {c: defaultdict(list) for c in identity_categories}

# Measure overlap
for category in identity_categories:
    for r in ['reblogs', 'nonreblogs']:
        for instance in presence_matches[r][category]:
#             value_pairing = (tuple(instance[-3]), tuple(instance[-2]))
            value_pairing = (instance[-3], instance[-2])
            overlap_counter[category][value_pairing] += 1
            if instance[-1].startswith('correct'):
                correct_preds[category][value_pairing].append(1)
            else:
                correct_preds[category][value_pairing].append(0)
        
for category in overlap_counter:
    lines = []
    
    print(category)
    for (labels1, labels2), count in overlap_counter[category].most_common(20):
        lines.append([f'{labels1}, {labels2}', count, np.mean(correct_preds[category][(labels1, labels2)])])
#         print(f'{labels1}, {labels2}\t{count}\t{np.mean(correct_preds[category][(labels1, labels2)])}')
    for l in list(reversed(sorted(lines, key=itemgetter(-1)))):
        print(l)
        
    print()

age
['teens, 30+', 294, 0.9285714285714286]
['early-20s, early-20s', 36594, 0.70232825053287429]
['late-20s, early-20s', 1852, 0.68682505399568039]
['30+, 30+', 788, 0.67131979695431476]
['early-20s, teens', 22450, 0.6334521158129176]
['late-20s, 30+', 221, 0.6244343891402715]
['30+, late-20s', 250, 0.60799999999999998]
['late-20s, teens', 563, 0.60213143872113672]
['teens, late-20s', 333, 0.58858858858858853]
['early-20s, 30+', 3519, 0.58141517476555837]
['early-20s, late-20s', 5179, 0.54991311063911952]
['30+, teens', 1398, 0.54935622317596566]
['teens, teens', 2571, 0.53908984830805129]
['teens, early-20s', 4909, 0.53147280505194539]
['late-20s, late-20s', 206, 0.529126213592233]
['30+, early-20s', 1353, 0.46119733924611972]

ethnicity/nationality
['race/ethnicity, race/ethnicity', 29, 0.96551724137931039]
['race/ethnicity, nationality', 387, 0.86046511627906974]
['nationality, race/ethnicity', 893, 0.73348264277715569]
['nationality, nationality', 419, 0.6467780429594272]

fandoms


## Confusion matrix (N/A)

In [63]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, experiment_1_pred).ravel()
print(f"True positives: {tp}")
print(f"True negatives: {tn}")
print(f"False positives: {fp}")
print(f"False negatives: {fn}")
[[tn, fp],
[fn, tp]]

True positives: 43065
True negatives: 48379
False positives: 22776
False negatives: 28314


[[48379, 22776], [28314, 43065]]

## Specific errors
Look for specific times when have a feature but is still giving an error

In [None]:
feature_name = 'aligned_cat=pronouns'
feature_index = features_vectorizer_experiment_1.vocab_[feature_name]
positive_examples = 