# Read in tables

In [2]:
import csv
import os
import codecs

# NOTE(review): absolute, machine-specific path; consider making it configurable.
data_dirpath = '/usr2/mamille2/tumblr/data/sample1k'
feature_tables_dir = os.path.join(data_dirpath, 'feature_tables')

# The three feature tables, in the order they are stored in csv_readers.
filenames = ['reblog_features.csv', 'nonreblog_features.csv', 'ranking_labels.csv']
joined_filenames = [os.path.join(feature_tables_dir, table_name) for table_name in filenames]

# Alternative reader that decodes the files as UTF-16 (disabled):
# csv_readers = [csv.DictReader(codecs.open(filename, 'rU', 'utf-16')) for filename in joined_filenames]

# One lazy DictReader per table; NUL characters are removed from every line
# before it reaches the csv parser.
# NOTE(review): the opened files are never closed explicitly, and the readers
# are one-shot iterators, so this cell must be re-run before re-reading rows.
csv_readers = []
for table_path in joined_filenames:
    table_file = open(table_path, 'r')
    nul_free_lines = (line.replace('\0', '') for line in table_file)
    csv_readers.append(csv.DictReader(nul_free_lines))

In [3]:
import itertools

# Pair up the rows of the three feature tables into ranking instances.
# zip_longest is used instead of zip so that tables with different numbers of
# rows raise an error instead of being silently truncated to the shortest one
# (which would misalign features and labels).
# NOTE: csv_readers are one-shot iterators; re-running this cell without
# recreating them yields empty lists.
_MISSING_ROW = object()
instances = []        # (reblog_features, nonreblog_features) row dicts
instance_labels = []  # int value of the 'ranking_label' column, one per instance
for row in itertools.zip_longest(*csv_readers, fillvalue=_MISSING_ROW):
    if any(table_row is _MISSING_ROW for table_row in row):
        raise ValueError('feature tables have different numbers of rows '
                         '(mismatch after %d complete rows)' % len(instances))
    reblog_features, nonreblog_features, label_row = row
    instances.append((reblog_features, nonreblog_features))
    instance_labels.append(int(label_row['ranking_label']))

print(len(instances), len(instance_labels))

712670 712670


# Feature Extraction

### Create tag vocabulary

In [4]:
from collections import defaultdict
from IPython.core.debugger import set_trace

def _str2list(in_str):
    return [el[1:-1] for el in in_str[1:-1].split(', ')]

def update_tag_counts(tag_counts, counted_ids, candidate): # for hashtags
    """Credit each tag of the candidate's post to its followee, at most once.

    tag_counts:  mapping tag -> number of distinct followees counted for it
                 (updated in place).
    counted_ids: mapping tag -> set of followee ids already counted for that
                 tag (updated in place).
    candidate:   row providing 'post_tags' and 'tumblog_id_followee'.
    """
    # Uses the tokens provided in the feature tables, lowercased.
    lowered_tags = [raw_tag.lower() for raw_tag in _str2list(candidate['post_tags'])]
    poster_id = candidate['tumblog_id_followee']
    for tag in lowered_tags:
        already_counted = counted_ids[tag]
        if poster_id in already_counted:
            continue  # this followee was already counted for this tag
        tag_counts[tag] += 1
        already_counted.add(poster_id)
        
# For each tag, the set of followees already counted as using it.
counted_ids = defaultdict(set)
# For each tag, the number of distinct followees who used it.
tag_counts = defaultdict(int)
for candidate_pair in instances:
    # Each instance is (reblog candidate, non-reblog candidate), counted in that order.
    for candidate in candidate_pair:
        update_tag_counts(tag_counts, counted_ids, candidate)

# Keep only tags used by at least 2 distinct followees.
tag_counts_filtered = {tag: n_users for tag, n_users in tag_counts.items() if n_users > 1}
tag_vocab = tag_counts_filtered.keys()
print(len(tag_vocab))

14318


In [5]:
# Identity categories; each is used as the prefix of the
# '<category>_terms_followee' / '<category>_terms_follower' columns.
identity_categories = [
    'age',
    'ethnicity/nationality',
    'fandoms',
    'gender',
    'interests',
    'location',
    'personality type',
    'pronouns',
    'relationship status',
    'roleplay',
    'sexual orientation',
    'weight',
    'zodiac',
]

### Count category label instances

In [6]:
def _count_user_labels(label_counts, category, seen_ids, candidate, id_column, terms_column):
    """Add one user's labels for `category` to label_counts, once per user id.

    Does nothing when candidate[id_column] is already in seen_ids; otherwise
    every lowercased label in candidate[terms_column] is incremented in
    label_counts[category] and the id is added to seen_ids.
    """
    user_id = candidate[id_column]
    if user_id in seen_ids:
        return
    # NOTE(review): eval() executes whatever the CSV cell contains; acceptable
    # only for trusted data (ast.literal_eval would be the safe alternative).
    for value in eval(candidate[terms_column]):
        label_counts[category][value.lower()] += 1
    seen_ids.add(user_id)


# category -> label -> number of distinct users (by tumblog id) with that label.
category_label_counts = defaultdict(lambda: defaultdict(int))
for category in identity_categories:
    # One id set per category, shared by followee and follower ids, so an id is
    # counted at most once per category regardless of the role it appears in.
    counted_ids = set()
    category_followee = category + '_terms_followee'
    category_follower = category + '_terms_follower'
    for reblog_candidate, nonreblog_candidate in instances:
        _count_user_labels(category_label_counts, category, counted_ids,
                           reblog_candidate, 'tumblog_id_followee', category_followee)
        _count_user_labels(category_label_counts, category, counted_ids,
                           nonreblog_candidate, 'tumblog_id_followee', category_followee)
        # Only the reblog candidate's follower columns are read here.
        _count_user_labels(category_label_counts, category, counted_ids,
                           reblog_candidate, 'tumblog_id_follower', category_follower)


### Create category label vocabulary

In [7]:
# Per-category vocabulary of identity labels: a label is kept only when its
# count in category_label_counts is greater than 1 (min 2 users using label).
# To inspect the most frequent labels instead, sort
# category_label_counts[identity_category].items() by count.
category_vocabs = defaultdict(set)
for identity_category, label_counts in category_label_counts.items():
    category_vocabs[identity_category] = {label for label, n_users in label_counts.items() if n_users > 1}
    print(identity_category, len(category_vocabs[identity_category]))
    print(category_vocabs[identity_category])
    print('-----------------')
    print()

age 65
{'56', 'xxiv', 'nineteen', 'y/o', '28', '29', '58', 'age', '24', 'eighteen', '44', '25', '21', '46', 'xix', '18', '26', '17', '33', 'seventeen', '52', 'forty', 'fourteen', 'fifteen', '36', '42', '43', '16', '14', '23', '20', 'sixteen', '48', '11', '32', '15', '31', '55', '59', 'xxix', '13', '30', '22', '54', '12', '45', '35', '37', '47', '57', 'twelve', '19', '38', '49', 'fifty', '10', '27', '53', '51', '40', '39', '41', '50', '34', 'twenty'}
-----------------

ethnicity/nationality 81
{'black', 'thai', 'haitian', 'turkish', 'malay', 'papel', 'austrian', 'colombian', 'norwegian', 'japanese', 'european', 'turks', 'dakota', 'lithuanian', 'ottawa', 'thais', 'scandinavian', 'portuguese', 'romanian', 'south african', 'polish', 'saudi', 'italian', 'hungarian', 'filipina', 'canadian', 'scottish', 'puerto', 'moroccan', 'african', 'english', 'spanish', 'french', 'chilena', 'american', 'swedish', 'latino', 'swede', 'coeur', 'singaporean', 'czech', 'dominican', 'mexican', 'serbian', 'pakis

### Post baseline

In [8]:
def extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label):
    """Build the post-level difference features for one candidate pair.

    Each candidate contributes its in-vocabulary tags, its post type and its
    note count, weighted by +1 or -1. When label == 1 the reblog candidate is
    weighted +1 and the non-reblog candidate -1; for any other label the
    weights are swapped.

    Returns a defaultdict(float) mapping feature name -> value.
    """
    features = defaultdict(float)

    def _accumulate(candidate, incr):
        # Comparison space features.
        # NOTE(review): eval() executes the CSV cell contents, and it parses
        # 'post_tags' differently from _str2list, which built tag_vocab.
        for raw_tag in [tag.lower() for tag in eval(candidate['post_tags'])]:
            if raw_tag in tag_vocab:
                features['tag=%s' % raw_tag] += incr

        features['post_type=%s' % candidate['post_type']] += incr
        features['post_note_count'] += incr * float(candidate['post_note_count'])

    # The -1 weighted candidate is always processed first.
    if label == 1:
        minus_candidate, plus_candidate = nonreblog_candidate, reblog_candidate
    else:
        minus_candidate, plus_candidate = reblog_candidate, nonreblog_candidate
    _accumulate(minus_candidate, incr=-1)
    _accumulate(plus_candidate, incr=1)

    return features

### Experiment 1 - Identity framing, presence of variables

In [12]:
def extract_features_experiment_1(reblog_candidate, nonreblog_candidate, label):
    """Baseline features plus identity-category presence features.

    Adds, on top of extract_features_post_baseline:
      * 'follower_cat=<category>': +1 when the follower (read from the reblog
        candidate's row) has at least one term in the category;
      * 'followee_cat=<category>': +/-1 per candidate whose followee has at
        least one term in the category;
      * 'aligned_cat=<category>': +/-1 per candidate when follower and followee
        either both have or both lack terms in the category.

    When label == 1 the reblog candidate is weighted +1 and the non-reblog
    candidate -1; for any other label the weights are swapped.

    Returns the feature mapping produced by extract_features_post_baseline,
    extended in place.
    """
    # Baseline features
    features = extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label)

    # Follower features. The follower terms do not depend on the candidate
    # being scored, so presence is computed once here and reused below.
    # NOTE(review): eval() executes the CSV cell contents; trusted data only.
    follower_presence_by_category = {}
    for identity_category in identity_categories:
        follower_terms = eval(reblog_candidate[identity_category + '_terms_follower'])
        follower_presence = len(follower_terms) > 0
        follower_presence_by_category[identity_category] = follower_presence
        if follower_presence:
            features['follower_cat=%s' % identity_category] += 1

    # Comparison space features
    def _extract_features_experiment_1_candidate(candidate, incr):
        for identity_category in identity_categories:
            followee_terms = eval(candidate[identity_category + '_terms_followee'])
            followee_presence = len(followee_terms) > 0
            if followee_presence:
                features['followee_cat=%s' % identity_category] += incr

            # Alignment features: both present or both absent.
            if follower_presence_by_category[identity_category] == followee_presence:
                features['aligned_cat=%s' % identity_category] += incr

    if label == 1:
        _extract_features_experiment_1_candidate(nonreblog_candidate, incr=-1)
        _extract_features_experiment_1_candidate(reblog_candidate, incr=1)
    else:
        _extract_features_experiment_1_candidate(reblog_candidate, incr=-1)
        _extract_features_experiment_1_candidate(nonreblog_candidate, incr=1)

    return features

### Experiment 2 - Compatibility

In [None]:
def extract_features_experiment_2(reblog_candidate, nonreblog_candidate, label):
    """Experiment 1 features plus label-level compatibility features.

    Adds, on top of extract_features_experiment_1 (only for labels found in
    category_vocabs[<category>]):
      * 'cat=<category>,follower_lab=<label>': +1 per follower label (read
        from the reblog candidate's row);
      * 'cat=<category>,followee_lab=<label>': +/-1 per followee label of each
        candidate;
      * 'cat=<category>,follower_lab=<a>,followee_lab=<b>': +/-1 for every
        (follower label, followee label) pair of each candidate.

    When label == 1 the reblog candidate is weighted +1 and the non-reblog
    candidate -1; for any other label the weights are swapped.

    Returns the feature mapping produced by extract_features_experiment_1,
    extended in place.
    """
    features = extract_features_experiment_1(reblog_candidate, nonreblog_candidate, label)

    # Follower features. Follower labels do not depend on the candidate being
    # scored, so they are parsed once here and reused below.
    # NOTE(review): eval() executes the CSV cell contents; trusted data only.
    follower_labels_by_category = {}
    for identity_category in identity_categories:
        follower_labels = [x.lower() for x in eval(reblog_candidate[identity_category + '_terms_follower'])]
        follower_labels_by_category[identity_category] = follower_labels
        for identity_label in follower_labels:
            if identity_label in category_vocabs[identity_category]:
                feat_tag = ('cat=%s,follower_lab=%s' % (identity_category, identity_label))
                features[feat_tag] += 1

    # Comparison space features
    def _extract_features_experiment_2_candidate(candidate, incr):
        for identity_category in identity_categories:
            follower_labels = follower_labels_by_category[identity_category]
            # Read the followee labels from the candidate being scored. Reading
            # them from reblog_candidate for both candidates would make the +1
            # and -1 passes cancel out exactly.
            followee_labels = [x.lower() for x in eval(candidate[identity_category + '_terms_followee'])]
            for identity_label_followee in followee_labels:
                if identity_label_followee in category_vocabs[identity_category]:
                    feat_tag = ('cat=%s,followee_lab=%s' % (identity_category, identity_label_followee))
                    features[feat_tag] += incr

                    # Compatibility features
                    for identity_label_follower in follower_labels:
                        if identity_label_follower in category_vocabs[identity_category]:
                            feat_tag = ('cat=%s,follower_lab=%s,followee_lab=%s' % (identity_category,
                                                                                    identity_label_follower,
                                                                                    identity_label_followee))
                            features[feat_tag] += incr

    if label == 1:
        _extract_features_experiment_2_candidate(nonreblog_candidate, incr=-1)
        _extract_features_experiment_2_candidate(reblog_candidate, incr=1)
    else:
        _extract_features_experiment_2_candidate(reblog_candidate, incr=-1)
        _extract_features_experiment_2_candidate(nonreblog_candidate, incr=1)

    return features

# Run models

In [9]:
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import model_selection
from sklearn import neural_network
from sklearn import preprocessing
from sklearn import svm
import numpy as np

### Post baseline

In [11]:
# One baseline feature dict and one label per instance.
labeled_pairs = list(zip(instances, instance_labels))
X = [extract_features_post_baseline(reblog_candidate, nonreblog_candidate, label)
     for (reblog_candidate, nonreblog_candidate), label in labeled_pairs]
y = [label for _, label in labeled_pairs]

# Hold out 20% of the instances; the fixed random_state keeps the split
# identical across the experiment cells.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=12345)

# Vectorizer and scaler are fitted on the training split only.
post_features_vectorizer = feature_extraction.DictVectorizer()
post_features_scaler = preprocessing.StandardScaler(with_mean=False)
X_train = post_features_scaler.fit_transform(post_features_vectorizer.fit_transform(X_train))
X_test = post_features_scaler.transform(post_features_vectorizer.transform(X_test))

# model_base and parameters are reused by the later experiment cells.
model_base = svm.LinearSVC(dual=False, max_iter=10000, verbose=2)
# parameters = {'C':[.1, 1, 10], 'penalty':['l1', 'l2']}
parameters = {'C':[.01, .1, 1, 10, 100], 'penalty':['l2']}
baseline_model = model_selection.GridSearchCV(model_base, parameters, n_jobs=5, cv=10, verbose=2)
baseline_model.fit(X_train, y_train)
print(baseline_model.score(X_test, y_test))
baseline_pred = baseline_model.predict(X_test)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[LibLinear][CV] ............................... C=0.01, penalty=l2, total= 8.8min
[CV] C=0.1, penalty=l2 ...............................................
[LibLinear][CV] ............................... C=0.01, penalty=l2, total= 9

[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed: 32.8min


[LibLinear][CV] .................................. C=1, penalty=l2, total= 5.4min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................ C=0.1, penalty=l2, total=20.9min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] .................................. C=1, penalty=l2, total=10.8min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] .................................. C=1, penalty=l2, total=14.7min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] .................................. C=1, penalty=l2, total= 7.8min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................ C=0.1, penalty=l2, total=26.1min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................. C=10, penalty=l2, total= 9.1min


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 73.5min finished


[LibLinear]0.626362832728


### Experiment 1 - Identity framing, presence of variables

In [13]:
# One experiment 1 feature dict and one label per instance.
labeled_pairs = list(zip(instances, instance_labels))
X = [extract_features_experiment_1(reblog_candidate, nonreblog_candidate, label)
     for (reblog_candidate, nonreblog_candidate), label in labeled_pairs]
y = [label for _, label in labeled_pairs]

# Same test_size and random_state as the baseline cell.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=12345)

# Vectorizer and scaler are fitted on the training split only.
features_vectorizer_experiment_1 = feature_extraction.DictVectorizer()
features_scaler_experiment_1 = preprocessing.StandardScaler(with_mean=False)
X_train = features_scaler_experiment_1.fit_transform(features_vectorizer_experiment_1.fit_transform(X_train))
X_test = features_scaler_experiment_1.transform(features_vectorizer_experiment_1.transform(X_test))

# model_base and parameters come from the baseline cell, which must run first.
experiment_1_model = model_selection.GridSearchCV(model_base, parameters, n_jobs=5, cv=10, verbose=2)
experiment_1_model.fit(X_train, y_train)
print(experiment_1_model.score(X_test, y_test))
experiment_1_pred = experiment_1_model.predict(X_test)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[LibLinear][CV] ............................... C=0.01, penalty=l2, total= 8.1min
[CV] C=0.01, penalty=l2 ..............................................
[LibLinear][CV] ............................... C=0.01, penalty=l2, total=11.1min
[CV] C=0.01, penalty=l2 ..............................................
[LibLinear][CV] ............................... C=0.01, penalty=l2, total=11.6min
[CV] C=0.01, penalty=l2 ..............................................
[LibLinear][CV] ............................... C=0.01, penalty=l2, total=12.5min
[CV] C=0.01, penalty=l2 ...................

[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed: 65.9min


[LibLinear][CV] .................................. C=1, penalty=l2, total=15.7min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................. C=10, penalty=l2, total=12.0min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................. C=10, penalty=l2, total= 6.5min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................. C=10, penalty=l2, total= 7.3min
[CV] C=10, penalty=l2 ................................................
[LibLinear][CV] ................................. C=10, penalty=l2, total= 5.1min
[CV] C=100, penalty=l2 ...............................................
[LibLinear][CV] ................................. C=10, penalty=l2, total=11.7min
[CV] C=100, penalty=l2 ...............................................
[LibLinear][CV] ................................ C=100, penalty=l2, total= 6.7min


KeyboardInterrupt: 

### Experiment 2 - Compatibility

In [None]:
# One experiment 2 feature dict and one label per instance.
labeled_pairs = list(zip(instances, instance_labels))
X = [extract_features_experiment_2(reblog_candidate, nonreblog_candidate, label)
     for (reblog_candidate, nonreblog_candidate), label in labeled_pairs]
y = [label for _, label in labeled_pairs]

# Same test_size and random_state as the baseline cell.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=12345)

# Vectorizer and scaler are fitted on the training split only.
features_vectorizer_experiment_2 = feature_extraction.DictVectorizer()
features_scaler_experiment_2 = preprocessing.StandardScaler(with_mean=False)
X_train = features_scaler_experiment_2.fit_transform(features_vectorizer_experiment_2.fit_transform(X_train))
X_test = features_scaler_experiment_2.transform(features_vectorizer_experiment_2.transform(X_test))

# model_base and parameters come from the baseline cell, which must run first.
# NOTE(review): unlike the other two searches this one passes neither n_jobs
# nor verbose, so the 50 fits run sequentially and silently - confirm intended.
experiment_2_model = model_selection.GridSearchCV(model_base, parameters, cv=10)
experiment_2_model.fit(X_train, y_train)
print(experiment_2_model.score(X_test, y_test))
experiment_2_pred = experiment_2_model.predict(X_test)

# McNemar's Test (Significance)

In [None]:
# 2x2 table of per-instance correctness on the test split.
# Row index: 0 = baseline correct, 1 = baseline incorrect.
# Column index: 0 = experiment correct, 1 = experiment incorrect.
outcome_counts = [[0, 0],
                  [0, 0]]
for b_pred, ex_pred, true in zip(baseline_pred, experiment_1_pred, y_test):
    baseline_row = 0 if b_pred == true else 1
    experiment_col = 0 if ex_pred == true else 1
    outcome_counts[baseline_row][experiment_col] += 1

# a: both correct
# b: Baseline correct, experiment incorrect
# c: Baseline incorrect, experiment correct
# d: both incorrect
(a, b), (c, d) = outcome_counts

table = [[a, b],
         [c, d]]
print(table)

In [None]:
# McNemar's test on the 2x2 correctness table (`table`).
from statsmodels.stats.contingency_tables import mcnemar

# exact=False with correction=False, as in the original call.
result = mcnemar(table, exact=False, correction=False)

# Report the test statistic and p-value.
print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))

# Compare the p-value against the significance level.
alpha = 0.05
verdict = ('Same proportions of errors (fail to reject H0)'
           if result.pvalue > alpha
           else 'Different proportions of errors (reject H0)')
print(verdict)