In [8]:
import pandas as pd
import os
# Root directory for the pickled data and the saved feature matrices.
# Set the TUMBLR_DATA_DIR environment variable to use a different copy
# (e.g. '/usr2/mamille2/tumblr/data' on erebor) without editing this cell;
# when it is unset, the misty path below is used.
data_dirpath = os.environ.get('TUMBLR_DATA_DIR', '/usr0/home/mamille2/erebor/tumblr/data')

# 3M dataset, cost-sensitive classification with Logistic Regression class_weight

In [9]:
from pprint import pprint

# Read the 'reblog_restricted' pickle from data_dirpath, then report the number
# of rows and the alphabetically sorted column names.
# (Alternative input used earlier: 'posts_descs_3m_rebloggers.pkl')
data = pd.read_pickle(
    os.path.join(data_dirpath, 'posts_descs_3m_reblog_restricted.pkl')
)
print(len(data))
column_names = sorted(data.columns.tolist())
pprint(column_names)

20163
['accepts_answers',
 'activity_time_epoch_followee',
 'activity_time_epoch_follower',
 'activity_time_epoch_post',
 'age_followee',
 'age_follower',
 'age_terms_followee',
 'age_terms_follower',
 'blog_classifier_followee',
 'blog_classifier_follower',
 'blog_classifier_post',
 'body',
 'created_time_epoch_followee',
 'created_time_epoch_follower',
 'created_time_epoch_post',
 'ethnicity/nationality_followee',
 'ethnicity/nationality_follower',
 'ethnicity/nationality_hegemonic_present_followee',
 'ethnicity/nationality_hegemonic_present_follower',
 'ethnicity/nationality_opposite_present_followee',
 'ethnicity/nationality_opposite_present_follower',
 'ethnicity/nationality_terms_followee',
 'ethnicity/nationality_terms_follower',
 'fandoms_followee',
 'fandoms_follower',
 'fandoms_terms_followee',
 'fandoms_terms_follower',
 'follower',
 'gender/sexuality_followee',
 'gender/sexuality_follower',
 'gender/sexuality_hegemonic_present_followee',
 'gender/sexuality_hegemonic_present

In [18]:
# Split train/test
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=9)

# Count the positive (reblogged) rows in the training split and report their share.
reblogged_mask = data_train['reblogged'] == True
num_reblogs = int(reblogged_mask.sum())
print(f"Reblogged: {num_reblogs} / {len(data_train)} ({num_reblogs/len(data_train) * 100}%)")

Reblogged: 1824 / 16130 (11.308121512709238%)


In [35]:
# Decide features, vectorize
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer for the post text. Its vocabulary is fitted on the
# training split only and then applied unchanged to both splits.
text_vectorizer = CountVectorizer(min_df=1, max_features=50000, stop_words='english')
text_vectorizer.fit(data_train['post_text'])

# feats[split][feature_name] -> vectorized matrix for that split
feats = {
    split_name: {'post_text': text_vectorizer.transform(split_frame['post_text'])}
    for split_name, split_frame in (('train', data_train), ('test', data_test))
}

# Blog-description features (currently disabled):
# desc_vectorizer = CountVectorizer(min_df=1, max_features=5000)
# desc_vectorizer.fit(data_train['parsed_blog_description_followee']) # corpus is a list of strings (documents), probably can't fit to test
# desc_vectorizer.fit(data_train['parsed_blog_description_follower']) # corpus is a list of strings (documents), probably can't fit to test

# feats['train']['parsed_blog_description_followee'] = desc_vectorizer.transform(data_train['parsed_blog_description_followee'])
# feats['test']['parsed_blog_description_followee'] = desc_vectorizer.transform(data_test['parsed_blog_description_followee'])
# feats['train']['parsed_blog_description_follower'] = desc_vectorizer.transform(data_train['parsed_blog_description_follower'])
# feats['test']['parsed_blog_description_follower'] = desc_vectorizer.transform(data_test['parsed_blog_description_follower'])

# Number of feature groups in use
len(feats['train'])

1

In [36]:
from scipy.sparse import hstack

# Concatenate every feature group column-wise into one sparse design matrix per split.
X_train = hstack(list(feats['train'].values()))
X_test = hstack(list(feats['test'].values()))

# Outcome to predict: whether the post was reblogged.
y_train = data_train['reblogged']
y_test = data_test['reblogged']

for design_matrix in (X_train, X_test):
    print(design_matrix.shape)

(16130, 50000)
(4033, 50000)


In [38]:
# Run logistic regression with class weighting
from sklearn.linear_model import LogisticRegression

# Fit the class-weighted model on the training split.
# Other settings tried:
#   LogisticRegression()
#   LogisticRegression(class_weight={0: 0.01, 1: 0.99})
clf = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
clf



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [39]:
# Predict labels for the held-out split and show how many posts are predicted as reblogged.
preds = clf.predict(X_test)
preds.sum()

734

In [40]:
# Per-class precision / recall / F1 on the held-out split.
# clf = LogisticRegression(class_weight={0: 0.01, 1: 0.99})
import numpy as np  # needed for np.array below; not imported anywhere else in this notebook
from sklearn.metrics import precision_recall_fscore_support

# With average=None the result is (precision, recall, fscore, support),
# each holding one value per class.
scores = precision_recall_fscore_support(y_test, preds, average=None)

# Drop the trailing support counts so the rows are Precision / Recall / F1.
# NOTE(review): the column labels assume the classes come back ordered as
# (not reblogged, reblogged) — confirm against the label values in y_test.
scores_df = pd.DataFrame(np.array(scores[:-1]), columns=['Not reblogged', 'Reblogged'],
                         index=['Precision', 'Recall', 'F1'])
scores_df

Unnamed: 0,Not reblogged,Reblogged
Precision,0.900576,0.183924
Recall,0.832213,0.291577
F1,0.865046,0.225564


## Save and load features

In [19]:
# Save features
import scipy.sparse

# Write the train/test feature matrices as .npz files under data_dirpath.
# NOTE(review): these file names say "post_desc_text ... 60k"; confirm they match
# the features currently held in X_train / X_test before overwriting.
feature_outputs = (
    ('bow_3m_post_desc_text_train_60k.npz', X_train),
    ('bow_3m_post_desc_text_test_60k.npz', X_test),
)
for npz_name, feature_matrix in feature_outputs:
    scipy.sparse.save_npz(os.path.join(data_dirpath, npz_name), feature_matrix)

# Output names used for earlier feature sets:
# scipy.sparse.save_npz(os.path.join(data_dirpath, 'bow_3m_post_text_train_50k.npz'), X_train)
# scipy.sparse.save_npz(os.path.join(data_dirpath, 'bow_3m_post_text_test_50k.npz'), X_test)
# scipy.sparse.save_npz(os.path.join(data_dirpath, 'bow_3m_post_text_train.npz'), X_train)
# scipy.sparse.save_npz(os.path.join(data_dirpath, 'bow_3m_post_text_test.npz'), X_test)

In [9]:
# Save outcome measure
import pickle
# Pickle the 'reblogged' column of each split next to the feature files.
label_outputs = (
    ('bow_3m_reblogged_train.pkl', data_train),
    ('bow_3m_reblogged_test.pkl', data_test),
)
for pkl_name, split_frame in label_outputs:
    with open(os.path.join(data_dirpath, pkl_name), 'wb') as out_file:
        pickle.dump(split_frame['reblogged'], out_file)

# Not used: saving the labels through scipy.sparse instead
# scipy.sparse.save_npz(os.path.join(data_dirpath, 'bow_3m_reblogged_train.npz'), y_train)
# scipy.sparse.save_npz(os.path.join(data_dirpath, 'bow_3m_reblogged_test.npz'), y_test)

In [2]:
# Load data
import pickle
import scipy.sparse

# Read back the saved feature matrices for both splits.
X_train, X_test = (
    scipy.sparse.load_npz(os.path.join(data_dirpath, npz_name))
    for npz_name in ('bow_3m_post_text_train.npz', 'bow_3m_post_text_test.npz')
)

# Read back the pickled outcome columns.
# NOTE: pickle.load can execute code embedded in the file; only load files
# produced by this notebook.
with open(os.path.join(data_dirpath,'bow_3m_reblogged_train.pkl'), 'rb') as train_file:
    y_train = pickle.load(train_file)
with open(os.path.join(data_dirpath, 'bow_3m_reblogged_test.pkl'), 'rb') as test_file:
    y_test = pickle.load(test_file)

# Sanity check: features and labels should agree on the number of rows per split.
for loaded in (X_train, X_test, y_train, y_test):
    print(loaded.shape)

(2163083, 10000)
(540771, 10000)
(2163083,)
(540771,)


## One-time preprocessing (strip HTML from post bodies into `post_text`)

In [7]:
# Inspect the raw 'body' value of the row labelled 0 (HTML markup, as the output shows)
data.loc[0,'body']

'<p><a class="tumblr_blog" href="http://yumipuffyloki.tumblr.com/post/55496140766">yumipuffyloki</a>:</p><blockquote> <p><strong>Every person who reblogs this will have a Pokémon egg in their s</strong><strong>ubmissions</strong> and a few days later a Pokémon will hatch from the egg.</p> <p><img alt="image" src="https://68.media.tumblr.com/b609aa8c9dd7dd6c41bf88b926af2175/tumblr_inline_mpyursuY161qz4rgp.gif"/></p> <p><strong>The Pokémon will be submitted based on their blog. </strong>It may be shiny or even a legendary.\xa0(Have your submissions open and only reblog, likes do not count.)</p> </blockquote>'

In [14]:
# Strip HTML from post body
from tqdm import tqdm_notebook as tqdm
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text between tags."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
        super().__init__()
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Record one run of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of `html` with all tags removed.

    Parameters
    ----------
    html : str
        Markup to strip; an empty string yields an empty string.

    Returns
    -------
    str
        The concatenated text found between tags, with character
        references (e.g. ``&amp;``) decoded by the parser.
    """
    s = MLStripper()
    s.feed(html)
    # feed() alone can leave trailing text buffered (for example when the input
    # ends with an unterminated '&', as in 'R&D'); close() forces the parser to
    # flush it so no text is silently dropped.
    s.close()
    return s.get_data()

# Build the plain-text column: missing bodies are treated as empty strings,
# and tqdm shows progress while each body is stripped of its markup.
post_bodies = data['body'].fillna('')
data['post_text'] = [strip_tags(body_html) for body_html in tqdm(post_bodies)]

# Spot-check the first row
data.loc[0, 'post_text']

HBox(children=(IntProgress(value=0, max=2703854), HTML(value='')))

'yumipuffyloki: Every person who reblogs this will have a Pokémon egg in their submissions and a few days later a Pokémon will hatch from the egg.  The Pokémon will be submitted based on their blog. It may be shiny or even a legendary.\xa0(Have your submissions open and only reblog, likes do not count.) '

In [15]:
# Write the current `data` frame to 'posts_descs_3m_rebloggers.pkl' under data_dirpath.
# NOTE(review): this replaces any existing file of that name — confirm `data`
# holds the intended frame (with the 'post_text' column) before running.
data.to_pickle(os.path.join(data_dirpath, 'posts_descs_3m_rebloggers.pkl'))