# Check for multicollinearity among features

In [15]:
# Load experimental dataset
import pickle

path = '/projects/tumblr_community_identity/tmp/post_tags+comms_unigrams_lr_data.pkl'
with open(path, 'rb') as f:
    X_train, y_train, X_dev, y_dev, X_test, y_test = pickle.load(f)
X_train.shape

(66538, 6)

In [16]:
import pandas as pd
df = pd.DataFrame(X_train, columns=[f'factor{i}' for i in range(5)] + ['comm_match_follower-followee'])
df

Unnamed: 0,factor0,factor1,factor2,factor3,factor4,comm_match_follower-followee
0,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-0.000249
1,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-0.000249
2,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-0.000249
3,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-2.074561
4,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-0.000249
...,...,...,...,...,...,...
66533,0.066341,0.383799,-0.024714,0.045792,0.118698,2.074062
66534,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-2.074561
66535,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-0.000249
66536,-0.001475,-0.003261,0.000722,-0.002868,-0.004665,-0.000249


In [19]:
for col in df.columns:
    print(col)
    print(df[col].std())
    print(df[col].mean())
    print(df[col].min())
    print(df[col].max())
    print()

factor0
1.0000075145879785
1.039175959206089e-17
-69.34031918606424
205.17869276323248

factor1
1.000007514587718
9.39730090277568e-18
-61.507230554150524
28.912657795221122

factor2
1.0000075145878806
4.591862941129025e-18
-41.973429601321286
43.63687051616032

factor3
1.0000075145879364
7.42173196298761e-18
-28.0980183039563
32.23440654910987

factor4
1.0000075145878413
-5.099103614858394e-18
-22.329827289891597
35.50683132816204

comm_match_follower-followee
1.000007514587963
1.5590976497786924e-17
-2.0745608474127666
2.0740620499860447



In [12]:
df.corr().round(2)

Unnamed: 0,factor0,factor1,factor2,factor3,factor4,comm_match_follower-followee
factor0,1.0,-0.08,0.02,0.06,0.03,-0.0
factor1,-0.08,1.0,-0.02,-0.02,-0.09,0.01
factor2,0.02,-0.02,1.0,-0.01,-0.01,0.02
factor3,0.06,-0.02,-0.01,1.0,0.04,-0.01
factor4,0.03,-0.09,-0.01,0.04,1.0,-0.01
comm_match_follower-followee,-0.0,0.01,0.02,-0.01,-0.01,1.0


# PCA on post baseline hashtags
Now integrated into extract_features.py

In [1]:
# Load baseline hashtag features
import pandas as pd
data = pd.read_csv('/data/tumblr_community_identity/dataset114k/matched_reblogs_nonreblogs_dataset114k.csv')
[col for col in data.columns if 'tags' in col]

  interactivity=interactivity, compiler=compiler, result=result)


['post_tags_reblog_str', 'post_tags_nonreblog_str']

In [4]:
# Load hashtag vectorizer
import pickle

vec_fpath = '/projects/tumblr_community_identity/tmp/post_tag_names_vec.pkl'
with open(vec_fpath, 'rb') as f:
    vec = pickle.load(f)
print(len(vec.get_feature_names()))

6543


In [2]:
def string_list2str(string_list):
    """ Convert a list in string form, like '[one, two]', to a space-separated
        string of the items. For exampel '[one, two]' -> 'one two'
    """
    if isinstance(string_list, float):
        return ''
    return ' '.join(string_list[1:-1].split(', '))

In [6]:
from sklearn.model_selection import train_test_split
import scipy.sparse

data['post_tags_reblog'] = data['post_tags_reblog_str'].map(string_list2str)
data['post_tags_nonreblog'] = data['post_tags_nonreblog_str'].map(string_list2str)
data_train, data_test = train_test_split(data, test_size=.1, random_state=9)

X_train = scipy.sparse.vstack([vec.transform(data_train['post_tags_reblog']), vec.transform(data_train['post_tags_nonreblog'])])
X_train.shape

(199658, 6543)

In [8]:
# Learn PCA
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(10)
reduced = svd.fit_transform(X_train)

In [9]:
svd.components_.shape

(10, 6543)

In [12]:
import numpy as np

def feats_for_factors(feature_names, pca, n_factors=20, n_feats=40):
    top = np.flip(np.argsort(pca.components_)[:n_factors, -1*n_feats:], axis=1)
    vec = np.vectorize(lambda x: feature_names[x])
    return vec(top)

In [14]:
factor_loadings = feats_for_factors(vec.get_feature_names(), svd)
factor_loadings

array([['tumblr', 'com', 'https', 'media', 'height', 'width', 'jpg',
        'url', '78', 'data', '66', 'src', '500', '75', 'figure',
        'caption', '100', '400', 'false', 'exif', 'is_panorama',
        'original_size', 'alt_sizes', '250', 'blockquote', 'img', '640',
        'image', 'alt', '540', 'post', 'http', 'mexico', 'the', 'href',
        'you', 'class', 'mentions', 'tags', 'format'],
       ['and', 'the', 'to', 'of', 'this', 'it', 'is', 'my', 'you', 'so',
        'in', 'data', 'me', 'for', 'that', 'but', 'people', 'like', 'br',
        'by', 'love', 'was', 'on', 'with', 'just', 'src', 'all', 'not',
        'he', 'be', 'have', 'they', 'are', 'can', 'at', 'art', 'one',
        'about', 'im', 'figure'],
       ['data', 'src', 'figure', '78', '500', 'img', 'image', 'alt',
        'mexico', 'jpg', 'you', 'thank', '333', 'png', 'blockquote',
        'know', 'target', 'half', 'where', 'class', 'tumblr_blog',
        'post', 'culture', 'writer', 'colors', 'food', 'title', 'write',


# Check for pairs with specific terms (from domain-specific unigram features)

In [1]:
# Load pairs of blog descriptions
import pandas as pd

path = '/data/tumblr_community_identity/dataset114k/matched_reblogs_nonreblogs_dataset114k.csv'
pairs = pd.read_csv(path)
# pairs

# desc_cols = [col for col in pairs.columns if 'description' in col and not 'processed' in col]
desc_cols = [col for col in pairs.columns if 'description' in col and 'processed' in col]
# desc_cols

id_cols = [col for col in pairs.columns if 'tumblog_id' in col]
# id_cols

# Load communities
commpath = '/data/tumblr_community_identity/dataset114k/louvain_communities.txt'
with open(commpath) as f:
    comms = [[int(tumblog_id) for tumblog_id in comm.split()] for comm in f.read().splitlines()]
print(len(comms))
# comms[0]

  interactivity=interactivity, compiler=compiler, result=result)


82


In [5]:
# Investigate
comm = 3
terms = ['new', 'll']

commlines = pairs[pairs['tumblog_id_follower_reblog'].isin(comms[comm-1]) & pairs['tumblog_id_followee_reblog'].isin(comms[comm-1]) & \
                 pairs['tumblog_id_followee_nonreblog'].isin(comms[comm-1])]
commlines

import re
import pdb
pd.set_option('display.max_colwidth', None)

def terms_in_descs(descs):
    """ Search for the search terms being in the descriptions """
    follower, followee_reblog, followee_nonreblog = descs
    if not isinstance(follower, str):
        return False
    pat1 = '\\b{}\\b'.format(terms[0])
    pat2 = '\\b{}\\b'.format(terms[1])
    if re.search(pat1, follower.lower()):
        if re.search(pat2, followee_reblog) or re.search(pat2, followee_nonreblog):
            return True
    elif re.search(pat2, follower.lower()):
        if re.search(pat1, followee_reblog) or re.search(pat1, followee_nonreblog):
            return True
    return False

paired_descs = list(zip(*[commlines[colname] for colname in desc_cols]))
matches = commlines.loc[[terms_in_descs(descs) for descs in paired_descs], desc_cols]
matches.sample(min(30, len(matches)))

Unnamed: 0,processed_tumblr_blog_description_follower_reblog,processed_tumblr_blog_description_followee_reblog,processed_tumblr_blog_description_followee_nonreblog
45787,just trying to make new friends everyday,"guhh ... ? my name is ande [ he/him ] , 24 . isfj . i 'm just me and kind of exist and like everything .","i 'm a * cough-cough * year old shy , geeky , introverted `` bear '' . i 've been with my huzbear ( i think the term is cute ) for almost 18 years ... wow , time flies . i do n't have any singular 'fandom ' ; i like to think my interests are fairly broad . if you see what i post , you 'll get an idea of what i enjoy . it 's just a bunch of stuff i find interesting , fun , odd and/or sexy . there 's no rhyme or reason to it - much like my brain . i might even post something i know absolutely nothing about just because it looks neat . feel free to drop a note , a question , a comment , a photo , a `` hi '' or whatever . i 'll answer just about anything as honestly and openly as i can . you should be at least 18 years old to view some of this stuff . there is a lot of naughty stuff here . not safe for work . most are images i 've picked up elsewhere ."
104586,"men i dream about ! if i accidentally post an image that you own , please let me know and i 'll remove it .","hello . so ... i 'm refocusing this tumblr . material will be at most a hard r , but no longer ☓ for anything new . pretty gay , loads of wieners , soft and hard , but nothing overtly sexual from now onwards . still nsfw , so caution and your own discretion a must .",nsfw over 18 only . men only🍆💦please reblog and follow me
43000,"thanks for checking out my blog .. this is me seen through various different forms of expression . each posting has a piece of me incased in it ! message me , i 'll write back . : ) anon is ok too . get to know me ! - jai kik : jaixboixrl snapchat : jaixboixrl",cali boy with new york dreams ...,"hi there ! welcome to my page where i mainly reblog stuff and some times post rants about life . if you want to talk or get to know me , message me !"
65655,"getting started , new to this : thanks for your patience . re-blogging to start , 25 yrs of archives to come . adult blog ! under age leave now !",it 's so fapping good for you !,probably the hottest things you ’ ll watch today ffollow me for more ! 😜💦🇳🇱here
30946,21 years old~ gay🌈new yorker turned cali ( long beach ) ~ just a few boring selfies and things that peak my interests .,"local tired boy ( tm ) i draw and do other stuff . do n't really know where i 'm going , but it 's somewhere . wan na come with me ? it 'll be fun , probably .","aquele gordinho lindo , charmoso , tesudo , gostoso e maravilhoso que eu amo ❤🐻😍 sou 👉🏿 interior sp 🇧🇷"
50443,"bit of a chav , bit of an exhibitionist ! like to show off myself and my boyfriend . you 'll see some of us and some reblogging of things that turn me on . feel free to reblog my posts ( i get a thrill from it really )",my wank bank . my selfie aviators,"manchester/leeds , uk . 20 . butts and other things i like ( but mostly butts ) |🦄👨🏻‍🔬✌🏻| whorifices for porn | me | insta | - new name , same ho"
57764,"i am new to tumblr ! not really sure how to use it ... but anyways , im gerardo i am : 20 yrs old 5'10 thats it for now lol",ig : da_danieladame,"mexicano🇲🇽 . anime , butts and chingaderas , tagged/ me . feel free to ask , trying to make friends ! don ’ t follow me if you ’ re under 18 , or you ’ ll be blocked ."
2798,"the name is jacob/pup kirby , i 'm 22 , and live in san antonio . i am the proud sir/alpha to 2 amazing pups , kai and rowan . my posts are selfies , butts , cuties , bearded men , pokemon , anime , funny things . nsfw you youngins , so i must ask you politely to go away . chat me up . i love new friends . snapchat or kik ?","chicago based blog that is really just an expression of me . basically a window into me ... damn that was kinda deep . i have been looking at tumblr accounts and decided that it is time that i do one.if there is a picture of you that you want removed , please let me know . also , this is nsfw and 18+ only . i hope you enjoy ! ! !","( nsfw ) i am a bipoly pagan cub from the portland , oregon area . if you want to see what i 'm into , follow my other blog elven cub house ( @ cuboftheelvenvariety ) . for my artistic side check out @ elvengallery . feel free to ask me anything or just say hi . i love interacting with my followers . wish list , you buy it , i 'll model it out or use it for you ( or use it on someone else for you ) http : --amazon-com-registry-wishlist-112iqvlctx8ec"
28107,"i 'm jim parker , i 've been writing and shooting photos & videos for over a decade . i 'm now looking to put more content out , sharing the bits that have been held back as well as create new series .",adults only if you are under 18 go away !,"bearded/furry/sexy/hot handsome men of all shapes and sizes ... enjoy ! `` must be 18 yrs . or older to view ! '' most if not all pic 's taken from other blog 's or the net , if any pic 's need to be removed of yourself , i 'll do so ! ! to all who follows ... thanks enjoy !"
67573,jockstrap lover northern new jersey here any other guys in new jersey ? ? ? ?,"vers/btm guy here that likes long , sweaty , connected sessions of marathon sex . or a quick fuck or bj if you promise we 'll do the other stuff later . you 'll be back .",only guys who fuck real


# Examine features in training and test sets for different datasets

In [18]:
# Load features
import os
import pickle
from tqdm.notebook import tqdm

zero_test_features = {}
total_features = {}
runs = [
    'baseline_randomtest+exp1+exp2_all_lr'
    'dataset114k+exp1+exp2_all_lr',
]

for run in runs:
    print(run)
    run_fname = run + '_features.pkl'
    features_dirpath = '/projects/websci2020_tumblr_identity/output/features'
    features_fpath = os.path.join(features_dirpath, run_fname)
    with open(features_fpath, 'rb') as f:
        X_train, y_train, X_test, y_test = pickle.load(f)
    print(X_train.shape)
    print(X_test.shape)
    print()

    # Count features seen in training set that are never seen in the test set
    X_train_arr = X_train.A
    empty_features_train = []
    for col in tqdm(range(X_train.shape[1])):
        if not X_train_arr[:,col].any():
            empty_features_train.append(col)
    print(len(empty_features_train)) # why are there any?
    total_features[run] = len(X_train.shape[1])
    print(X_train.shape[1])
    print()

    X_test_arr = X_test.A
    empty_features_test = []
    for col in tqdm(range(X_test.shape[1])):
        if not X_test_arr[:,col].any():
            empty_features_test.append(col)
    print(len(empty_features_test))
    zero_test_features[run] = len(set(empty_features_test) - set(empty_features_train))
    print(zero_test_features[run])
    print()

dataset114k+exp1+exp2_all_lr
(101327, 27551)
(11259, 27551)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=27551.0), HTML(value='')))


218
27551



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=27551.0), HTML(value='')))


18163
17955
baseline_randomtest+exp1+exp2_all_lr
(641403, 17744)
(71267, 17744)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17744.0), HTML(value='')))


8
17744



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=17744.0), HTML(value='')))


5683
5675


In [20]:
total_features = {'baseline_randomtest+exp1+exp2_all_lr': 17744,
    'dataset114k+exp1+exp2_all_lr': 27551}

In [21]:
proportions_empty_features = {key: zero_test_features[key]/total_features[key] for key in zero_test_features}
proportions_empty_features

{'dataset114k+exp1+exp2_all_lr': 0.6517004827410984,
 'baseline_randomtest+exp1+exp2_all_lr': 0.3198264201983769}