# Predict community from style features (vs unigrams)

## Load, prepare data

In [2]:
import pandas as pd

# Load data
communities = [
    'voltron',
    'bts',
    'studyblr',
    'miraculous_ladybug',
    'riverdale',
    'south_park',
    'star_vs_the_forces_of_evil',
    'simblr',
]

data = {}
for community in communities:
    data[community] = pd.read_pickle(f'../../data/{community}_posts.pkl')

In [3]:
features = set()
for community in communities:
    row_features = data[community]['style_features'].map(lambda x: x.keys())
    community_features = set([f for feats in row_features for f in feats])
    features |= community_features
    
print(len(features))

362


In [4]:
# Expand dataframe with columns for features

import numpy as np
from tqdm import tqdm_notebook as tqdm

for c in communities:
    data[c]['community'] = [c] * len(data[c])
expanded_data = pd.concat(data.values())

for f in features:
    expanded_data[f] = expanded_data['style_features'].map(lambda x: x.get(f, 0))

In [5]:
print(expanded_data.shape)
print(expanded_data.columns)

(1968498, 390)
Index(['post_tags_string', 'post_id', 'activity_time_epoch', 'tumblelog_id',
       'is_private', 'post_title', 'post_short_url', 'post_slug', 'post_type',
       'post_caption',
       ...
       'repeated_ぇ', '(', 'repeated_ᴬ', 'repeated_̼', 'repeated_*',
       'repeated_|', '\', 'repeated_嘿', 'repeated_뽀', 'repeated_३'],
      dtype='object', length=390)


In [6]:
expanded_data.columns.tolist()

['post_tags_string',
 'post_id',
 'activity_time_epoch',
 'tumblelog_id',
 'is_private',
 'post_title',
 'post_short_url',
 'post_slug',
 'post_type',
 'post_caption',
 'post_format',
 'post_note_count',
 'post_tags',
 'post_content',
 'reblogged_from_post_id',
 'reblogged_from_metadata',
 'created_time_epoch',
 'updated_time_epoch',
 'is_submission',
 'mentions',
 'source_title',
 'source_url',
 'post_classifier',
 'blog_classifier',
 'activity_date',
 'post_body',
 'style_features',
 'community',
 '#',
 'repeated_嘟',
 'repeated_呜',
 'repeated_哈',
 'repeated_–',
 'repeated_ふ',
 'repeated_덤',
 'repeated_へ',
 '"',
 'repeated_ㅤ',
 'repeated_ʰ',
 'repeated_ᶫ',
 'repeated_̩',
 'repeated_ي',
 'repeated_5',
 'repeated_е',
 'repeated_ᵢ',
 'repeated_낄',
 'repeated_ウ',
 'repeated_≈',
 'repeated_y',
 'repeated_\u2003',
 'repeated_ㅇ',
 'repeated_∞',
 'repeated_¶',
 'repeated_ㅋ',
 'repeated_å',
 'repeated_\u200b',
 'repeated_、',
 'repeated_˱',
 'repeated_～',
 'repeated_쿄',
 '{',
 'repeated_в',
 'r

## Split data, format for sklearn

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train, test = train_test_split(expanded_data, test_size=0.1, random_state=9)

# Select columns, normalize
feature_cols = train.columns[expanded_data.columns.tolist().index('community') + 1:]
X_train = train.loc[:, feature_cols]
X_test = test.loc[:, feature_cols]
y_train = train['community']
y_test = test['community']

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

  return self.partial_fit(X, y)
  app.launch_new_instance()


(1771648, 362)
(1771648,)
(196850, 362)
(196850,)


## Classify communities

In [8]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='sag', multi_class='multinomial', verbose=2)
clf.fit(X_train, y_train)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


max_iter reached after 1770 seconds


[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed: 29.5min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=3, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=2, warm_start=False)

In [9]:
clf.score(X_test, y_test) # mean accuracy

0.32939293878587755

In [11]:
1/len(communities)

0.125

In [None]:
# majority baseline