In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from scipy.spatial.distance import cosine, euclidean
from tqdm import tqdm_notebook as tqdm

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Embedding, Flatten

import os,sys,inspect
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# from .. import gaussian_mixture_cotrain
from gaussian_mixture_cotrain import GaussianMixtureCotrain

from collections import Counter, defaultdict

from IPython.core.debugger import set_trace
from IPython.display import display

import matplotlib.pyplot as plt

import fasttext as ft
from pprint import pprint

Using TensorFlow backend.


In [2]:
# Restrict GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"

# CNN to predict category mentions

In [3]:
# Load descriptions
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_100posts.pkl')
print(descs.columns)
print(len(descs))

Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'segments', 'restr_segments_25', 'segments_25_nopunct', 'age', 'gender',
       'sexual orientation', 'pronouns', 'personality type',
       'ethnicity/nationality', 'relationship status', 'sexuality/gender'],
      dtype='object')
6902


In [4]:
# Load text posts
posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/textposts_recent100_100posts.pkl')
print(posts.columns)
len(posts)

Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'body_toks', 'body_str'],
      dtype='object')


690200

In [5]:
tids = sorted(descs['tumblog_id'].tolist())

## Prepare posts

In [6]:
# Text posts to word indices (Keras way)
texts = [' '.join(posts[posts['tumblog_id']==tid]['body_str']) for tid in tids] # concatenated posts
len(texts)

6902

In [8]:
MAX_VOCAB_SIZE = 100000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE,
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique words')

Found 528107 unique words


In [9]:
MAX_SEQUENCE_LENGTH = 20000
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
len(data)

6902

In [10]:
vocab = list(word_index.keys())[:MAX_VOCAB_SIZE] # lower indices are words kept
len(vocab)

100000

## Prepare description categories (labels)

In [11]:
cats = descs.columns.tolist()[-8:]
labels = list(zip(*[descs[cat] for cat in cats]))
labels = np.array(labels, dtype=int)
labels.shape

(6902, 8)

In [12]:
# Shuffle, split into train/test
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

TEST_SPLIT = 0.1
num_test_samples = int(TEST_SPLIT * data.shape[0])
x_train = data[:-num_test_samples]
print(x_train.shape)
y_train = labels[:-num_test_samples]
print(y_train.shape)
x_test = data[-num_test_samples:]
print(x_test.shape)
y_test = labels[-num_test_samples:]
print(y_test.shape)

(6212, 20000)
(6212, 8)
(690, 20000)
(690, 8)


## Build CNN model in Keras

In [13]:
# Load vocab embeddings
vocab_embed = np.load('/usr0/home/mamille2/tumblr/data/recent100_100posts_embeds.npy')

In [14]:
# Prepare model

EMBEDDING_DIM = 300
embedding_layer = Embedding(len(vocab),
                            EMBEDDING_DIM,
                            weights = [vocab_embed],
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable=False
                           )

model = Sequential()

model.add(embedding_layer)
model.add(Conv1D(1024, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Conv1D(128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(8, activation='sigmoid')) # final classification layer

model.compile(loss='binary_crossentropy', optimizer='adam')

## Train classifier

In [16]:
model.fit(x_train, y_train,
         batch_size=16, epochs=3, validation_data=(x_test, y_test))

Train on 6212 samples, validate on 690 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe4e7e5d978>

In [18]:
model.save('/usr0/home/mamille2/tumblr/data/100posts_cnn.h5')

## 1-time

In [13]:
# Load word embeddings (from Tumblr halfday)
wd_embed = ft.load_model('/usr0/home/mamille2/tumblr/data/halfday_ft.bin')

In [27]:
# Build lookup table
vocab_embed = np.empty((len(vocab),300))
for i, wd in enumerate(vocab):
    vocab_embed[i,:] = wd_embed[wd]
    
vocab_embed.shape

(100000, 300)

In [28]:
# Save vocab embeddings
np.save('/usr0/home/mamille2/tumblr/data/recent100_100posts_embeds.npy', vocab_embed)

In [32]:
cutoff = 20000
len([l for l in lens if l <= cutoff])/len(lens)

0.894233555491162

In [24]:
# Examine sequence lengths
lens = [len(s) for s in sequences]

print(np.mean(lens))
print(np.median(lens))
print(max(lens))

10382.1618371
7459.0
416462


In [6]:
# Text posts to word indices (Graham's way)
post_inds = []

w2i = defaultdict(lambda: len(w2i))
UNK = w2i["<unk>"] # 0 index

for tid in tids:
    toks = [t for p in posts[posts['tumblog_id']==tid]['body_toks'].tolist() for t in p]
    inds = [w2i[t] for t in toks]
    post_inds.append(inds) 
    
len(post_inds)

6902

In [7]:
# Vocab size
len(w2i)

616144

# Sample instances marked for certain identity categories

In [38]:
# Load labeled data
split = {}
s = 'train1000'
split[s] = pd.read_csv(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.csv', index_col=0)
print(split[s].columns)

Index(['tumblog_id', 'restr_segments_25', 'non-English', 'age', 'name',
       'personal description/commentary', 'location', 'interests',
       'adult content', 'sexual orientation', 'pronouns', 'gender', 'fandoms',
       'link to external content', 'occupation', 'astrological sign',
       'personality type', 'ethnicity/nationality', 'relationship status',
       'mental health', 'other/notes'],
      dtype='object')


In [39]:
pd.set_option('display.max_colwidth', -1)

In [40]:
# Sample from each column
# for c in list(split[s].columns)[2:-1]:
for c in ['non-English', 'age', 'name', 'location', 'interests', 'adult content',
         'sexual orientation', 'gender', 'pronouns', 'fandoms', 'link to external content',
         'occupation', 'personality type', 'astrological sign', 'ethnicity/nationality',
         'relationship status', 'mental health', 'personal description/commentary', 'other/notes']:
    print(c)
    display(split[s][split[s][c]==1].sample(n=5).loc[:, ['restr_segments_25', c]])
    print()

non-English


Unnamed: 0,restr_segments_25,non-English
5673831,"['sin tus caricias', 'nena', '¿que va a ser de mi?']",1.0
5589904,"['blog bandar ceme online', 'poker88', 'domino qiu qiu', 'capsa']",1.0
6036680,['czas nie leczy ran'],1.0
4963722,"[""et c'est du lolz en barre""]",1.0
2961991,['bisserl was von allem'],1.0



age


Unnamed: 0,restr_segments_25,age
4675110,"['stark depressiv', '19 jahre']",1.0
5679736,"['19', '👅naturally unbothered ✨', 'love your melanin 🍯']",1.0
4803395,"['18', 'cali', 'bi']",1.0
3689594,"['ani', '16']",1.0
2704999,"['19', 'infp', 'cap', 'sensitive black person']",1.0



name


Unnamed: 0,restr_segments_25,name
4621537,"['claudia', 'xxi']",1.0
1050064,"['call me grace ^^', 'libra', 'pansexual to the core', 'she/her', 'in university']",1.0
2358217,"['dustin', '20']",1.0
2098049,"['frankie wolff', '21', 'hella gay']",1.0
4741580,"['catalina', '15', 'chile', 'sagitario', '24-05-17']",1.0



location


Unnamed: 0,restr_segments_25,location
3911020,"['ash', '21', 'uk ♥ 18+ side blog ♥']",1.0
3639661,"['24 years old', 'lost angeles', 'fat princess']",1.0
1243880,"['hi i\'m shiba nagame"", \'living in japan', 'writing fanfics']",1.0
2363701,['birmingham'],1.0
1656661,"['aberto a sugestões', 'sou de portugal', 'aceito tudo menos homens']",1.0



interests


Unnamed: 0,restr_segments_25,interests
110729,"['fashion', 'luxury', 'homme', 'paris', 'morocco germany,bochum']",1.0
745368,"['22', 'jesus follower', 'nature', 'coffee', 'traveler']",1.0
1243880,"['hi i\'m shiba nagame"", \'living in japan', 'writing fanfics']",1.0
5088035,"['writer', 'high school sophomore', 'homeschooled']",1.0
4914497,"['merel', 'she/her', 'dutch', 'likes tacos and bad music']",1.0



adult content


Unnamed: 0,restr_segments_25,adult content
2337898,"['female', '25', 'in a good working rl', 'german', 'filthy 18+ only!!!']",1.0
4712484,"['consent is a must', 'enjoy']",1.0
859624,"['i love', ':-) only 18+!!!']",1.0
6276166,"['nsfw 18+ blog he/him', 'gay', '19 snapchat: donk811']",1.0
5405668,"['jack', '18', 'uk', 'nsfw']",1.0



sexual orientation


Unnamed: 0,restr_segments_25,sexual orientation
2153632,"['24, genderfluid, pan', 'bisexual, she', 'her, and they']",1.0
5465464,"['28', 'straight', '18+']",1.0
685360,"['she/her', '22', 'pagan', 'pansexual', 'in shipping hell']",1.0
2324390,"['meow\', ""i\'m also very ace,so']",1.0
3047905,"['alex or aldamert', 'he/him', 'pan as fuck']",1.0



gender


Unnamed: 0,restr_segments_25,gender
4465201,"['24', 'a place for myself', 'a curious girl exploring', '18+', 'submit if you want']",1.0
5449959,"['20 yo girl', 'italian', 'sad stuff lover']",1.0
6137916,"['20', 'lg(b)t', 'east coast 🤔']",1.0
359256,"['s~twenty~female', 'star wars']",1.0
2142277,"['closeted bisexual', 'cis female', 'presbyterian']",1.0



pronouns


Unnamed: 0,restr_segments_25,pronouns
1965062,"['charlotte', 'she/her', 'i like sims and cats']",1.0
5107489,"['archer', 'va -> nyc', '22', 'she/her']",1.0
4914497,"['merel', 'she/her', 'dutch', 'likes tacos and bad music']",1.0
3774318,"['slytherin', 'libra', 'intp', 'she/her']",1.0
4944383,"['ezra', 'he']",1.0



fandoms


Unnamed: 0,restr_segments_25,fandoms
5771853,"['not many understand this', '-slh']",1.0
5703809,"['hey! katie', 'she/hers', 'def not straight', 'infj', 'too many fandoms to count']",1.0
944531,"['actually quite mellow', 'gnu terry pratchett']",1.0
5696938,"['yaoi', 'manhwa', 'persona 5', 'danganronpa', 'memes', 'stupidity', 'occasional nsfw', 'spoilers, probably']",1.0
5012730,['multi-verse friendly'],1.0



link to external content


Unnamed: 0,restr_segments_25,link to external content
3472827,"['23', 'auckland', 'sc - thikcock99']",1.0
722189,"['mother earth goddess', 'philosopher portfolio', 'original account', 'soundcloud', 'instagram']",1.0
5388946,"['www', 'neifatti', 'it']",1.0
5259815,"['eden', 'wa', 'sc: eden5601', 'ig: edenhackett_', 'various mcelroy podcast', 'r&m', 'anti-onision', 'pro-dogs', 'mcu']",1.0
3653510,"['♌ leonina', 'instagram : myllabitte', 'twitter : @myllab19', 'snapchat : myllabitte']",1.0



occupation


Unnamed: 0,restr_segments_25,occupation
773799,"['i\'m a writer"", \'a very unsuccessful one']",1.0
4510358,"['liz menco', '30', 'museum archivist', 'spiritualist']",1.0
4165856,"['dob : 2', '1999 university student']",1.0
117754,"['photographer', 'videographer', 'coffee lover', 'outdoors enthusiast', 'adventure seeker', 'wild at heart']",1.0
1852102,"['sam', 'female', 'writer']",1.0



personality type


Unnamed: 0,restr_segments_25,personality type
1320569,"['🥀 they', 'them or he', 'him , infp , pisces, bi 🥀']",1.0
4796799,"['17', 'entp', 'chaotic neutral']",1.0
1949761,"['c a i t l y n', 'i n f j', 'a r t i s t', 'r e a d e r', 'e t c']",1.0
1919024,"['zana', '| infj-t', '| xvii', '| aquarius', '| ravenclaw']",1.0
2112798,"['infp - eighteen - england', 'gap year 07', '2017-08']",1.0



astrological sign


Unnamed: 0,restr_segments_25,astrological sign
4329683,"['crystal', '19', 'us', 'aries', 'tags']",1.0
976880,"['callum', '18', 'leo']",1.0
3596669,"['23', 'hufflepuff', 'pukwudgie', 'aquarius']",1.0
3529381,"['scorpio moon', 'cancer moon']",1.0
2822579,"['georgie', '19', 'aro-ace', 'she/her', 'cancer/tiger']",1.0



ethnicity/nationality


Unnamed: 0,restr_segments_25,ethnicity/nationality
5482224,"['italian girl of age 19', 'deeply feminist']",1.0
31575,"['twenty three', 'social justice advocate', 'lover of jesus & people']",1.0
5449959,"['20 yo girl', 'italian', 'sad stuff lover']",1.0
4833409,"['dom', 'russian', 'he/him', 'can draw']",1.0
4981270,"['william', '18', 'french & politics', '🏳️\\u200d🌈']",1.0



relationship status


Unnamed: 0,restr_segments_25,relationship status
5908287,"['eli (eee-lie)', 'he', 'him', 'mlm', 'i love my bf', 'my bf', 'posts about him']",1.0
2115340,['happily married couple'],1.0
6222904,"['engaged', 'bored', 'tired']",1.0
5914412,"[""i'm a single male""]",1.0
5353918,"['nsfw', 'taken']",1.0



mental health


Unnamed: 0,restr_segments_25,mental health
3784240,"['19', 'sad', 'lonely\\\\\\\\miserable\\\\\\\\']",1.0
5541049,"['depressed teen', '16']",1.0
5592086,['anxieties'],1.0
5897239,"['fitness', 'foodie', 'self-love', 'body-positive']",1.0
6273890,"['pro-recovery', 'stay safe']",1.0



personal description/commentary


Unnamed: 0,restr_segments_25,personal description/commentary
3745083,"['weird', 'non-binary', 'otaku', 'aires', 'pansexual', 'writer', 'gamer', 'avenged sevenfold fan\', ""they, their, they\'re']",1.0
1254008,['also write fanfiction too'],1.0
3478993,"['just a potato', 'crazy space potato']",1.0
4675110,"['stark depressiv', '19 jahre']",1.0
3402670,"['13', 'i have an adorable cat', 'artist']",1.0



other/notes


ValueError: a must be greater than 0

# Look for specific category values

In [2]:
# Load blog descriptions
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/blog_descriptions_recent100.pkl')
print(descs.columns)
len(descs)

Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description'],
      dtype='object')


5238440

In [3]:
pd.set_option('display.max_colwidth', -1)

In [5]:
# search_term = r'bun\W'
# search_term = r'bun pronoun'
# search_term = r'\brl\b'
search_term = r'\b[0-9]w[0-9]\b'
selected = descs[descs['parsed_blog_description'].map(lambda x: True if re.search(search_term,x) else False)]
print(len(selected))
selected.loc[:,['parsed_blog_description']]

1664


Unnamed: 0,parsed_blog_description
872,"He/him/they/them|INTP|19|5w4|Pisces|My main blog full of spells, aesthetics, and all sorts of things. All are welcome."
3527,Esther. ENTP / Libra / 8w7 / Slytherin
10312,ISFJ | Hufflepuff | 2w3 22 | Taurus | USA
10919,9w1 . INFP . 963 Ravenclaw/Wampus. Unconventional Capricorn.
18341,"INFJ~4w3~Trying to find my place in this crazy, insane, messed-up, beautiful world. 🌍"
22331,"hannah, taurus, infp, 4w5"
24324,// You are likely to be eaten by a Grue // Pop-surrealist // 4w5 // INT(F)J // ಠ_ಠ
24871,"INTP. 5w6, 9w8, 2w3. True Neutral. Melancholic/Phlegmatic. Medical Student."
30867,27(4/13)* 420Lover* StarWarsNerd* ChicagoBurbs* PetrolHead* HondaFanatic* INTP*6w5*Aries♈sun* Sagittarius♐moon*Virgo♍rising*Scorpio♏lillith
67126,nat // 20 // entp // atl // 6w7


# Word and character ngrams for identity category mention prediction

## Change CSV to pickle

In [None]:
# Load labeled data
split = {'dev200': None}
for s in split:
    split[s] = pd.read_csv(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.csv', index_col=0)
    print(split[s].columns)

In [None]:
def split_rm_punct(segments):
    """ Return segments split on punctuation, punctuation removed """
    
    new_segs = []
    
    for seg in segments:
        new_seg = ' '.join(re.split(r'\W', seg))
        new_seg = re.sub(r'\W', ' ', new_seg)
        new_seg = re.sub(r'\s+', ' ', new_seg).strip()
        new_segs.append(new_seg)
        
    return new_segs

In [None]:
# String representation to list
# split[s]['restr_segments_25'] = split[s]['restr_segments_25'].map(lambda x: x[2:-2].split("', '"))
split[s]['restr_segments_25'] = split[s]['restr_segments_25'].map(lambda x: ast.literal_eval(x))
split[s]['restr_segments_25']

In [None]:
split[s]['segments_25_nopunct'] = list(map(split_rm_punct, tqdm(split[s]['restr_segments_25'].tolist())))

In [None]:
# NaNs -> 0
for c in ['gender', 'sexual orientation', 'pronouns']:
    split[s][c] = split[s][c].fillna(0)

In [None]:
split[s]['sexuality/gender'] = [max(tup) for tup in zip(split[s]['sexual orientation'], split[s]['gender'], split[s]['pronouns'])]
len(split[s][split[s]['sexuality/gender'] == 1])

In [None]:
split[s].to_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')

## Load data

In [None]:
# Load labeled data
split = {'train1000': None, 'dev200': None}
for s in split:
    split[s] = pd.read_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')
    print(split[s].columns)

## Unigrams and bag of character ngrams

In [None]:
vec_dict = {'unigrams': TfidfVectorizer(), 'char 1-4grams': TfidfVectorizer(analyzer='char', ngram_range=(1,4))}
clf_dict = {'NB': MultinomialNB(), 'SVM': svm.SVC()}
outcome_classes = ['sexual orientation', 'pronouns', 'gender', 'sexuality/gender']
bow = {s: {} for s in split}
labels = {s: {} for s in split}
outlines = []

# Get features
# Fit
for vec_name in vec_dict:
    data = [' '.join(segs) for segs in split['train1000']['segments_25_nopunct'].tolist()]
    vec_dict[vec_name].fit(data)

for s in split:
    for vec_name, vec in vec_dict.items():
        data = [' '.join(segs) for segs in split[s]['segments_25_nopunct'].tolist()]
        bow[s][vec_name] = vec.transform(data)
    
    # Get labels
    for l in outcome_classes:
        labels[s][l] = split[s][l].values

# Training
for vec_name in vec_dict:
    for l in outcome_classes:
        for clf_name, clf in clf_dict.items():
            clf.fit(bow['train1000'][vec_name], labels['train1000'][l])
            
            # Testing
            for s in split:
                preds = clf.predict(bow[s][vec_name])
                pos = preds==1 # positive guesses
                true_pos = labels[s][l]==1 # true positives
                matches = sum([all(tup) for tup in zip(pos, true_pos)])
                prec = f'{matches}/{sum(pos)} ({matches/sum(pos):.1%})'
                rec = f'{matches}/{sum(true_pos)} ({matches/sum(true_pos):.1%})'
            
                outlines.append([vec_name, clf_name, s, l, prec, rec])
        
pd.DataFrame(outlines, columns=['features', 'classifier', 'dataset', 'predicted class', 'precision', 'recall'])

## 1-time

In [None]:
test = ['•draw for life•', '•a student•', '•18•']
split_rm_punct(test)

# Pattern matching for mentions of identity categories

In [2]:
# Load labeled data
split = {'train1000': None, 'dev200': None}
for s in split:
    split[s] = pd.read_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')
    print(split[s].columns)

Index(['tumblog_id', 'restr_segments_25', 'non-English', 'age', 'name',
       'personal description/commentary', 'location', 'interests',
       'adult content', 'sexual orientation', 'pronouns', 'gender', 'fandoms',
       'link to external content', 'occupation', 'astrological sign',
       'personality type', 'ethnicity/nationality', 'relationship status',
       'mental health', 'other/notes', 'segments_25_nopunct',
       'sexuality/gender'],
      dtype='object')
Index(['tumblog_id', 'restr_segments_25', 'non-English', 'age', 'name',
       'personal description/commentary', 'location', 'interests',
       'adult content', 'sexual orientation', 'pronouns', 'gender', 'fandoms',
       'link to external content', 'occupation', 'astrological sign',
       'personality type', 'ethnicity/nationality', 'relationship status',
       'mental health', 'other/notes', 'segments_25_nopunct',
       'sexuality/gender'],
      dtype='object')


In [3]:
# Load US states
fpath = '/usr0/home/mamille2/tumblr/data/states.csv'
states = [s.lower() for s in pd.read_csv(fpath)['State'].tolist()]

# Load nationalities
fpath = '/usr0/home/mamille2/tumblr/data/nationalities.txt'
with open(fpath) as f:
    nats = [nat.lower() for nat in f.read().splitlines() if (len(nat) > 3 and not nat in states)]
    
print(len(nats))

# Load ethnicities
fpath = '/usr0/home/mamille2/tumblr/data/ethnicities.txt'
outlist = states + ['coast']
with open(fpath) as f:
    eths = [e.split()[0].lower() for e in f.read().splitlines() if (len(e.split()[0]) > 4 and not e.split()[0].lower() in outlist)]
    
print(len(eths))

194
1035


In [4]:
# Regex patterns
terms = {
        'age': [r'(?:[^-+\w]|^)([1-6]{1}[0-9]{1})[^-+0-9]|^([1-6]{1}[0-9]{1})$',
               r'twelve',
               r'thirteen',
               r'fourteen',
               r'fifteen',
               r'sixteen',
               r'seventeen',
               r'eighteen',
               r'nineteen',
               r'twenty',
               r'thirty',
               r'forty',
               r'fifty',
               r'sixty'],
#         'location': [],
        'gender': [r'male\b', r'female', 
                    r'trans', r'ftm', r'mtf', r'cis',
                    r'girl\b', r'boy\b', r'\bman\b', r'guy\b', r'woman', r'gu+rl', r'gii+rl',
                    r'non-binary', r'nonbinary', r'nb', r'agender', r'neutrois',
                    r'\bmom\b', r'\bdad\b', r'wife', r'husband', r'\bbrother\b', r'\bson\b', r'\bsister\b',
                    r'bigender', r'lgbt'],
        'sexual orientation': 
                     [r'gay', r'straight', r'lesbian', r'\bhomo',
                       r'bisexual', r'\bbi\b', r'pansexual', r'\bpan\b',
                       r'lgbt', r'queer',
                       r'\bace\b', r'\basexual', r'aro-ace', r'aro/ace',
                     ],
         'pronouns': [
             r'(?:\W|\b)she(?:\W|\b)', r'(?:\W|\b)her(?:\W|\b)',
             r'(?:\W|\b)he(?:\W|\b)', r'(?:\W|\b)him(?:\W|\b)',
             r'(?:\W|\b)they(?:\W|\b)', r'(?:\W|\b)them(?:\W|\b)',
             r'pronouns'
                ],
        'personality type': [
            r'(?:i|e|a)(?:s|n)(?:t|f)(?:j|p)',
            r'introvert',
            r'extrovert', 
            r'ambivert',
            r'\b[0-9]w[0-9]\b',
            ],
        'ethnicity/nationality': [r'\b{}\b'.format(el) for el in eths + nats] + 
                [r'latino', r'latina', r'cubana', r'cubano', r'chilena', r'chileno', r'mexicano', r'mexicana',
                r'palestinian'],
        'relationship status': [
            r'taken', r'married', r'single', r'engaged', r'husband', r'spouse', r'wife', r'newlywed',
            r'in a rl', r'in rl', r'in a relationship',
        ]
}
terms['sexuality/gender'] = terms['gender'] + terms['sexual orientation'] + terms['pronouns']

excl_terms = {
    'age': ['nsfw 18', '18 nsfw', '18 only', 'only 18'],
}

In [5]:
# Combine terms in regex
terms_re = {}
for cat in terms:
    terms_re[cat] = r'|'.join(terms[cat])

In [12]:
def has_category(cat, segments):
    ans = False
    
    if not isinstance(segments, list):
        return ans
    
    ans = any(re.search(terms_re[cat], s) for s in segments)
#     for c in terms[cat]:
#         ans = any(re.search(c, s) for s in segments)
#         if ans:
#             break
            
    if cat in excl_terms:
        for c in excl_terms[cat]:
            if any(c in s for s in segments):
                ans = False
            
    return ans

## Results

In [13]:
positives = {}
negatives = {}
truecat = {}
pos_matches = {}
outlines = []

for cat in tqdm(terms):
    print(cat)
    positives[cat] = {}
    negatives[cat] = {}
    truecat[cat] = {}
    pos_matches[cat] = {}
    
    for sp in split:
    
        preds = []
        preds = split[sp]['segments_25_nopunct'].map(lambda x: has_category(cat, x))

        # Get precision and recall
        positives[cat][sp] = preds[preds==True]
        negatives[cat][sp] = preds[preds==False]
        pos_matches[cat][sp] = set(positives[cat][sp].index).intersection(split[sp][split[sp][cat]==1].index)
        truecat[cat][sp] = split[sp][split[sp][cat]==1]

        if len(pos_matches[cat][sp]) > 0:
            prec = len(pos_matches[cat][sp])/len(positives[cat][sp])
        else:    
            prec = 0
            
        prec_str = f'{len(pos_matches[cat][sp])}/{len(positives[cat][sp])} ({prec:.1%})'
            
        rec = len(pos_matches[cat][sp])/len(truecat[cat][sp])
        rec_str = f'{len(pos_matches[cat][sp])}/{len(truecat[cat][sp])} ({rec:.1%})'
        f1 = 2 * prec * rec / (prec + rec)

        outlines.append([sp, cat, prec_str, rec_str, f1])
    
pd.DataFrame(outlines, columns=['dataset', 'predicted class', 'precision', 'recall', 'f1'])

age
gender
sexual orientation
pronouns
personality type
ethnicity/nationality
relationship status
sexuality/gender



Unnamed: 0,dataset,predicted class,precision,recall,f1
0,train1000,age,266/313 (85.0%),266/294 (90.5%),0.876442
1,dev200,age,46/61 (75.4%),46/52 (88.5%),0.814159
2,train1000,gender,55/64 (85.9%),55/69 (79.7%),0.827068
3,dev200,gender,5/10 (50.0%),5/6 (83.3%),0.625
4,train1000,sexual orientation,53/57 (93.0%),53/57 (93.0%),0.929825
5,dev200,sexual orientation,10/10 (100.0%),10/10 (100.0%),1.0
6,train1000,pronouns,68/74 (91.9%),68/68 (100.0%),0.957746
7,dev200,pronouns,9/14 (64.3%),9/9 (100.0%),0.782609
8,train1000,personality type,18/23 (78.3%),18/19 (94.7%),0.857143
9,dev200,personality type,2/4 (50.0%),2/2 (100.0%),0.666667


In [8]:
pd.set_option('display.max_colwidth', -1)

In [20]:
# Examine misclassified
cat = 'ethnicity/nationality'
sp = 'train1000'

print('False positives:')
false_positives = set(positives[cat][sp].index).intersection(split[sp][split[sp][cat]!=1].index)
display(split[sp].loc[false_positives, ['segments_25_nopunct', cat]])

print('False negatives:')
false_negatives = set(negatives[cat][sp].index).intersection(split[sp][split[sp][cat]==1].index)
display(split[sp].loc[false_negatives, ['segments_25_nopunct', cat]])

False positives:


Unnamed: 0,segments_25_nopunct,ethnicity/nationality
2085891,"[madridista primero, luego mexicano]",
3928996,"[heo jongin sage xxiv, racer, kamikaze, black chevy corvette c5, ]",
3027319,"[english major, poet, hockey player, var fhs document, location var w_h window, screen, width x window, screen, height fhs, src s1, freehostedscripts, net ocounter, head, appendchild fhs document, src s1, freehostedscripts, net ocount, head]",
2337898,"[female, 25, in a good working rl, german, filthy 18 only]",
5701899,"[14 years old, french, lunax lulu, cosplayer, do some random art]",
2416138,"[owner male, 19 years old, panda white tiger, writer]",
3267373,"[but probably bisexual, chinese zodiac horse, who knows, not me]",
3236218,[an american musical],
5182327,"[27, lesbian, switch most submissive, nsfw r18 posts, speak english spanish]",
2620408,"[lvl 18, green black, fma, otaku, , japan, snapchat lilja_elric]",


False negatives:


Unnamed: 0,segments_25_nopunct,ethnicity/nationality
1219397,"[instagram pristinetrash, darian, , bi]",1.0
3320520,"[ayla, 16, the netherlands]",1.0
2881000,"[italy, female]",1.0
3919978,"[baby, brasil, mg]",1.0
1780813,[allah swt],1.0
2191982,"[andrea, 20, méxico]",1.0
188208,"[makeup, 17, us]",1.0
31575,"[twenty three, social justice advocate, lover of jesus people]",1.0
745368,"[22, jesus follower, nature, coffee, traveler]",1.0
3112475,"[bk, nyc bk]",1.0


In [10]:
row = 6137916
for term in split[sp].loc[row, 'segments_25_nopunct']:
    for t in eths:
        if re.search(t, term):
            print(term)
            print(t)

east coast
coast


In [None]:
for cat in ['sexual orientation', 'gender', 'pronouns', 'sexuality/gender']:
    print(cat)
    print(has_category(cat, split['dev200'].loc[4539145, 'segments_25_nopunct']))

## Apply to corpus of descriptions

In [15]:
# Load blog descriptions
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_recent100_restr25.pkl')
print(descs.columns)
len(descs)

Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'segments', 'restr_segments_25'],
      dtype='object')


1134175

In [22]:
# Annotate for identity categories
for cat in tqdm(terms):
    print(cat)
    descs[cat] = descs['segments_25_nopunct'].map(lambda x: has_category(cat, x))

age
gender
sexual orientation
pronouns
personality type
ethnicity/nationality
relationship status
sexuality/gender



## Modifications

In [None]:
sp = 'train500'
# sp = 'dev100'
incorrect = split[sp][split[sp]['restr_segments_25'].map(lambda x: 'aromantic' in ' '.join(x))]
# incorrect = split[sp][split[sp]['restr_segments_25'].map(lambda x: 'poly' in ' '.join(x))]
incorrect
# mask = split['train500']['restr_segments_25'].map(lambda x: 'poly' in ' '.join(x) if isinstance(x, list))
# split['train500'][mask]

In [None]:
# Corrections
sp = 'train500'
cat = 'pronouns'
val = 1
# sp = 'dev100'

# for i in incorrect.index:
for i in [3047905]:
    split[sp].loc[i, cat] = val
    
len(split[sp])

In [None]:
split['train500']['gender'] = split['train500']['gender'].fillna(0)

In [None]:
# Convert from string to list
split['dev100']['restr_segments_25'] = split['dev100']['restr_segments_25'].map(lambda x: x[2:-2].split("', '") if isinstance(x, str) else x)
split['dev100']['restr_segments_25']

In [None]:
# Remove mistake settings
split['train500'] = split['train500'][split['train500']['restr_segments_25'].map(lambda x: not isinstance(x, float))]
len(split['train500'])

In [None]:
# Remove mistake settings
split['dev100'] = split['dev100'][split['dev100']['restr_segments_25'].map(lambda x: not isinstance(x, float))]
len(split['dev100'])

In [None]:
split = {}
s = 'train1000'
split[s] = pd.read_csv(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.csv', index_col=0)
len(split[s])

In [None]:
# Convert from string to list
split[s]['restr_segments_25'] = split[s]['restr_segments_25'].map(lambda x: x[2:-2].split("', '") if isinstance(x, str) else x)
split[s]['restr_segments_25']

In [None]:
split[s].to_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')

In [None]:
split['dev100'].to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_dev100.pkl')
split['dev100'].to_csv('/usr0/home/mamille2/tumblr/data/list_descriptions_dev100.csv')
split['train500'].to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_train500.pkl')
split['train500'].to_csv('/usr0/home/mamille2/tumblr/data/list_descriptions_train500.csv')

## 1-time

In [16]:
def split_rm_punct(segments):
    """ Return segments split on punctuation, punctuation removed """
    
    new_segs = []
    
    for seg in segments:
        new_seg = ' '.join(re.split(r'\W', seg))
        new_seg = re.sub(r'\W', ' ', new_seg)
        new_seg = re.sub(r'\s+', ' ', new_seg).strip()
        new_segs.append(new_seg)
        
    return new_segs

In [18]:
descs['segments_25_nopunct'] = list(map(split_rm_punct, tqdm(descs['restr_segments_25'].tolist())))




In [24]:
descs.to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_recent100_restr25.pkl')

In [None]:
preds = []
preds = split['dev200']['restr_segments_25'].map(lambda x: has_category('sexuality/gender', x))
preds[preds==True]

In [None]:
split['dev200'].columns

In [None]:
split['dev200'].rename(columns={'personal description/ commentary': 'personal description/commentary',
                               'ethnicity/ nationality': 'ethnicity/nationality'}, inplace=True)

In [None]:
split['dev200'].to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_dev200.pkl')

In [None]:
has_category('sexuality/gender', split['dev200'].loc[4539145,'restr_segments_25'])

In [None]:
has_category('gender', ['male', '28'])

In [None]:
has_category('gender', ['girl', '28'])

In [None]:
has_category('sexual orientation', ['pan as fuck', '28'])

In [None]:
has_category('pronouns', ['she/her', 'them', 'he'])

In [None]:
has_category('pronouns', ['banshee'])

In [None]:
has_category('pronouns', ['he they'])

# Qualitatively examine description segments

In [None]:
# Load descriptions
list_desc_data = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions.pkl')
print(len(list_desc_data))
print(list_desc_data.columns)

In [None]:
pd.set_option('display.max_colwidth', 999)

In [None]:
samp = list_desc_data.sample(30)
samp

# Examine Brown clustering of description segments

In [None]:
with open('/usr0/home/mamille2/brown-cluster/desc_segments_20-c50-p1.out/paths') as f:
# with open('/usr0/home/mamille2/brown-cluster/desc_segments_20_freq-c50-p1.out/paths') as f:
    lines = f.read().splitlines()
    
len(lines)

In [None]:
outlines = []

for l in lines:
    l_split = l.split('\t')
    if len(l_split) == 3:
        outlines.append(l_split)
#         clu['all'][l_split[0]].append(l_split[1])
    
# print(len(clu['all']))
# clu['all'].keys()

clu = pd.DataFrame(outlines, columns=['cluster', 'word', 'freq'])
clu

In [None]:
clu['freq'] = clu['freq'].astype(int)

In [None]:
clu.sort_values(['cluster', 'freq'], inplace=True, ascending=False)
clu

In [None]:
pd.set_option('display.max_rows', 999)

In [None]:
clu

In [None]:
for val in clu['cluster'].unique():
    rows = clu[clu['cluster']==val]
    print(rows.head(20))
    print()

In [None]:
clu.to_csv('/usr0/home/mamille2/tumblr/results/desc_segments_brown_clusters.csv', index=False)

# Reduce dimensionality of description embeddings

In [None]:
# desc_embs = np.load('/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_avg.npy')
# desc_embs = np.load('/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy')
desc_embs = np.load('/usr0/home/mamille2/tumblr/data/desc_recent5_avg.npy')
desc_embs.shape

In [None]:
# Get labels (top prob clusters)--just load saved probabilities
# probs = np.load('/usr0/home/mamille2/tumblr/data/gmm_50_desc_avg_probs.npy')
# probs = np.load('/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc_avg_probs.npy')
probs = np.load('/usr0/home/mamille2/tumblr/data/recent5_gmm_50_desc_avg_probs.npy')
probs.shape

In [None]:
clusters_assgn = np.argsort(probs, axis=1)[:,-1] 
clusters_assgn.shape

## PCA

In [None]:
pca = PCA(n_components=2)
reduced = pca.fit_transform(desc_embs)
print(reduced.shape)
print(pca.explained_variance_ratio_)

## t-SNE

In [None]:
# Reduce dimensions to 50 first
pca = PCA(n_components=50)
pca_reduced = pca.fit_transform(desc_embs)
print(pca_reduced.shape)
print(pca.explained_variance_ratio_)

In [None]:
inds = np.random.choice(len(pca_reduced), int(1e4))
samp = pca_reduced[inds]

tsne = TSNE(n_components=2, verbose=2)
# reduced = tsne.fit_transform(desc_embs)
reduced = tsne.fit_transform(samp)
print(reduced.shape)

## Graph clusters of reduced dimensions

In [None]:
# If sampled, need to same cluster assignments
clusters_assgn = clusters_assgn[inds]
len(clusters_assgn)

In [None]:
%matplotlib inline

fig = plt.figure(figsize=(15,10))
scatter = plt.scatter(reduced[:,0], reduced[:,1], c=clusters_assgn, s=10)
plt.colorbar(scatter)
# plt.axis([-1,2.5,-4,1.5])
plt.axis([-3,10,-2,5])
# plt.title("PCA of cotrained description embeddings")
plt.title("PCA of description embeddings")
# plt.title("t-SNE of description embeddings (10k)")
# fig.savefig('/usr0/home/mamille2/tumblr/results/pca_cotrain.png', dpi=100)
fig.savefig('/usr0/home/mamille2/tumblr/results/pca_desc_recent5.png', dpi=100)
# fig.savefig('/usr0/home/mamille2/tumblr/results/tsne_cotrain.png', dpi=100)
# fig.savefig('/usr0/home/mamille2/tumblr/results/tsne_desc.png', dpi=100)
fig.show()

In [None]:
clu_ctr = Counter(clusters_assgn)
clu_ctr

# Run GMM clustering on blog descriptions

In [None]:
# Load data
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

In [None]:
X = desc_emb[:500000,:]
clf = GaussianMixture(n_components=50, verbose=2, warm_start=True)
clf.fit(X)

In [None]:
outpath = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'

with open(outpath, 'wb') as f:
    pickle.dump(clf, f)

## Try to continue training a model

In [None]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [None]:
X = desc_emb[:500000,:]
clf.fit(X)

# Examine trained GMM

In [None]:
# Load data
# desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
# desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_avg.npy'
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_sum.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

In [None]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
# path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'
# path = '/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc_sum.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Load descriptions
# path = '/usr0/home/mamille2/tumblr/data/en_blog_descriptions.pkl'
path = '/usr0/home/mamille2/tumblr/data/desc_recent5.pkl'
desc_df = pd.read_pickle(path)

# descs = desc_df['parsed_blog_description'].tolist()
desc_toks = desc_df['tokenized_blog_description'].tolist()

In [None]:
# clf.bic(desc_emb[:500000,:]) # -615M for 20 comps, -652M for 50 comps
clf.bic(desc_emb)

In [None]:
clf.lower_bound_

In [None]:
# Get highest weights
wted_comps = np.argsort(clf.weights_)[::-1]
wted_comps

## Examine datapoints with highest probabilities assigned for each cluster; examine cluster assignments

In [None]:
# probs = clf.predict_proba(desc_emb[:500000,:])
probs = clf.predict_proba(desc_emb)
probs.shape

In [None]:
top_probs = np.argsort(probs, axis=0)[::-1]
top_probs.shape

In [None]:
def top_descs(probs, descs, k, order, vocab_file=None):
    """ Prints top k descriptions for each component"""
    
    top_probs = np.argsort(probs, axis=0)[::-1]
    
    if vocab_file: # dict [n_words]: [vocab]
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    
    for i in order:
        print("Component {}".format(i))
        col = top_probs[:,i]
#     for i, c in enumerate(top_probs.T):
        
        for el in col[:k]: 
            if vocab_file:
                print('\t' + ' '.join(d if d in vocab[100000] else '<unk>' for d in descs[el])) # for tokenized
            else:
                print('\t' + ' '.join(d if d in vocab[100000] else '<unk>' for d in descs[el])) # for tokenized
#             print('\t' + descs[el])
            
        print()

In [None]:
# Top descriptions from halfday co-training, sum
top_descs(probs, desc_toks, 20, wted_comps, '/usr0/home/mamille2/tumblr/data/halfday_top5_vocab100000.pkl')

In [None]:
# Top descriptions from just descriptions (50 components)
top_descs(probs, descs, 20, wted_comps)

In [None]:
# Top descriptions from halfday co-training, averages
top_descs(probs, descs, 20, wted_comps)

## Find closest words in embedding space to cluster means
Doesn't really mean anything, as are averaging embeddings across all words in a post and 'dmitry' is closest to each cluster mean

In [None]:
path = '/usr0/home/mamille2/tumblr/data/desc_ftvecs100000.pkl'

with open(path, 'rb') as f:
    wd_embs = pickle.load(f)
    
len(wd_embs[100000])

In [None]:
closests = []
dist = euclidean

for m in tqdm(clf.means_):
    closest_dist = np.infty
    closest_wd = None
    
    for wd, emb in wd_embs[100000].items():
        if dist(m,emb) < closest_dist:
            closest_wd = wd
            
    closests.append(closest_wd)
    
closests

# Sample blog descriptions for analysis

In [None]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

In [None]:
pd.set_option('display.max_colwidth', 999)

In [None]:
s = data.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

## Blog descriptions from blogs that have text posts in halfday

In [None]:
text_posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/halfday_text.pkl')
print(len(text_posts))
text_posts.columns

In [None]:
# Blogs that also have text descriptions
tumblogs_allposts = text_posts['tumblog_id'].unique()
len(tumblogs_allposts)

In [None]:
count_series = text_posts.groupby(['tumblog_id']).size()

In [None]:
tumblogs_2posts = count_series[count_series >= 2].index

In [None]:
tumblogs_5posts = count_series[count_series >= 5].index

In [None]:
tumblogs_10posts = count_series[count_series >= 10].index

In [None]:
data_text = data[data['tumblog_id'].isin(tumblogs_allposts)]
len(data_text)

In [None]:
data_text2 = data[data['tumblog_id'].isin(tumblogs_2posts)]
len(data_text2)

In [None]:
data_text = data[data['tumblog_id'].isin(tumblogs_5posts)]
len(data_text)

In [None]:
data_text10 = data[data['tumblog_id'].isin(tumblogs_10posts)]
len(data_text10)

In [None]:
# Sample from those who have at least 10 text posts in halfday

s = data_text10.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

In [None]:
# Sample from those who have at least 2 text posts in halfday

s = data_text2.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

# LSA on blog descriptions

## Get blog descriptions

In [None]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

In [None]:
blog_descs = data['parsed_blog_description'].values
blog_descs.shape

## Get tfidf matrix

In [None]:
tfidf = TfidfVectorizer(max_features=100000)
tfidf_mat = tfidf.fit_transform(blog_descs)
tfidf_mat.shape

## Do SVD

In [None]:
svd = TruncatedSVD(n_components=300)
svd_mat = svd.fit_transform(tfidf_mat)
svd_mat.shape

In [None]:
svd.explained_variance_ratio_.sum() 
# 17% with 100 components over full vocab 
# 22% with 100 components over top 100k words
# 34% with 300 components over top 100k words

In [None]:
# Words x components matrix
svd.components_.shape

In [None]:
# word features
feats = tfidf.get_feature_names()
len(feats)

## Get ranked word features by component

In [None]:
top = np.argsort(svd.components_)[:100]
top.shape

In [None]:
top_sub = top[:, :100]
top_sub.shape

In [None]:
feats2names = np.vectorize(lambda x: feats[x])
top_feats = feats2names(top_sub)
top_feats

In [None]:
for i, factor in enumerate(top_feats):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

In [None]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topwords.npy', top_feats)

## Get ranked documents by component

In [None]:
top_docs_idx = np.argsort(svd_mat.T) # Select 10 highest components
top_docs_idx.shape

In [None]:
top_docs_idx = top_docs_idx[:100]
top_docs_idx.shape

In [None]:
top_sub = top_docs_idx[:, :100]
top_sub.shape

In [None]:
idx2docs = np.vectorize(lambda x: blog_descs[x])
top_docs = idx2docs(top_sub)
top_docs

In [None]:
for i, factor in enumerate(top_docs):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

In [41]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topdocs.npy', top_docs)

NameError: name 'top_docs' is not defined