# Process data

## Concatenate posts

In [53]:
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm

community = 'voltron'
data_dirpath = f'../data/{community}_posts'

# Read every tab-separated part file for the community and pool the rows.
# os.listdir() returns names in arbitrary order; sorting makes the row order
# (and therefore which copy of a duplicated post_id survives the later
# drop_duplicates) reproducible across runs and machines.
posts = []
for fname in tqdm(sorted(os.listdir(data_dirpath))):
    fpath = os.path.join(data_dirpath, fname)
    part = pd.read_csv(fpath, sep='\t')
    # Rows are pooled as plain lists; column names are taken from `part` later.
    posts.extend(part.values.tolist())

HBox(children=(IntProgress(value=0, max=1540), HTML(value='')))

In [54]:
# Build one frame from the pooled rows; column names come from the last part file read.
data = pd.DataFrame(data=posts, columns=part.columns)
print(data.columns, data.shape, sep='\n')

Index(['post_tags_string', 'post_id', 'activity_time_epoch', 'tumblelog_id',
       'is_private', 'post_title', 'post_short_url', 'post_slug', 'post_type',
       'post_caption', 'post_format', 'post_note_count', 'post_tags',
       'post_content', 'reblogged_from_post_id', 'reblogged_from_metadata',
       'created_time_epoch', 'updated_time_epoch', 'is_submission', 'mentions',
       'source_title', 'source_url', 'post_classifier', 'blog_classifier',
       'activity_date'],
      dtype='object')
(4255856, 25)


## Remove duplicate posts, posts with missing IDs, and posts without string content

In [55]:
# Collapse rows that share a post_id down to a single row each.
data = data.drop_duplicates(subset=['post_id'])
data.shape

(1971849, 25)

In [40]:
# Discard rows that have no post_id at all.
# NOTE(review): this cell's execution count (40) is out of sequence with its
# neighbours (55, 56) — confirm it is still part of the intended run order.
data = data.dropna(subset=['post_id'])
data.shape

(1971848, 25)

In [56]:
# Retain only rows whose post_content is a non-empty string (NaN and other non-str values are dropped).
data = data.loc[data['post_content'].map(lambda content: isinstance(content, str) and content != '')]
data.shape

(1880600, 25)

## Preprocess text

In [57]:
import re
import spacy

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text runs."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is needed.
        # convert_charrefs=True is the library default; it is spelled out because
        # get_data() relies on entities (e.g. &amp;) arriving already decoded.
        super().__init__(convert_charrefs=True)
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected text run, joined by single spaces."""
        return ' '.join(self.fed)


def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: String that may contain HTML tags and character references.

    Returns:
        The text runs found between tags, joined by single spaces, with
        character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # feed() holds back trailing text that might be the start of an unfinished
    # character reference (e.g. 'rock&roll'); close() forces it to be flushed
    # so the end of the input is not silently lost.
    s.close()
    return s.get_data()

# Only nlp.tokenizer is used by process_text below, so the tagger, parser and
# NER components are disabled when loading the model.
nlp = spacy.load(
    'en',
    disable=['tagger', 'parser', 'ner'],
)

def process_text(text, lowercase=True):
    """Extract the body of a post from its raw content string and tokenize it.

    The body is whatever sits between ``body=`` and the last ``, note_count``
    in ``text``. HTML markup is stripped from it and the remainder is split
    with the spaCy tokenizer.

    Args:
        text: Raw ``post_content`` value. Anything that is not a ``str``
            yields an empty result.
        lowercase: When true (the default, matching the original behavior) the
            body is lower-cased before tokenization. Pass ``False`` to keep the
            original casing, e.g. for capitalization-based features.

    Returns:
        The tokens joined by single spaces, or ``''`` when ``text`` is not a
        string or contains no body.
    """
    if not isinstance(text, str):
        return ''

    # re.DOTALL lets the body span line breaks; without it a post whose body
    # contains a newline fails to match and is silently discarded.
    m = re.search(r'body=(.*), note_count', text, flags=re.DOTALL)
    if m is None:
        return ''

    # Strip HTML
    body = strip_tags(m.group(1))
    if lowercase:
        body = body.lower()

    # Tokenize
    return ' '.join(tok.text for tok in nlp.tokenizer(body))

In [58]:
# Tokenized, lower-cased body text for every post ('' when no body could be extracted).
data['post_body'] = [process_text(content) for content in tqdm(data['post_content'])]

HBox(children=(IntProgress(value=0, max=1880600), HTML(value='')))

In [59]:
# Keep only the posts for which a non-empty body was extracted.
data = data.loc[data['post_body'].map(lambda body: body != '')]
data.shape

(569677, 26)

## Save preprocessed posts

In [60]:
# Write the filtered posts to CSV; the DataFrame index is not written.
csv_path = f'../data/{community}_posts.csv'
data.to_csv(csv_path, index=False)

# Extract stylistic features

In [65]:
from string import punctuation
from nltk.corpus import words

# Reference vocabulary for the out-of-vocabulary counts: the distinct entries of NLTK's `words` corpus.
en_words = {entry for entry in words.words()}
def extract_style_features(text, vocabulary=None):
    """Compute surface-level stylistic features for a whitespace-tokenized text.

    Args:
        text: The text to describe. Words are obtained with ``text.split()``,
            so every whitespace-separated token (including stand-alone
            punctuation tokens) counts as a word.
        vocabulary: Optional container of known words used for the
            out-of-vocabulary counts. Defaults to the module-level
            ``en_words`` set. Membership is case-sensitive.

    Returns:
        A dict of features. ``n_words`` and ``n_characters`` are always
        present; when the text has no words these are the only keys.
        Otherwise it also holds one count per ``string.punctuation``
        character (keyed by the character itself), ``avg_punctuation``,
        ``n_capitals``, ``avg_capitalized_words``, ``avg_capitalized_letters``,
        ``n_oov``, ``avg_oov``, a ``repeated_<char>`` count for every character
        that occurs three times in a row, and ``total_char_repeats``.

    Note:
        The capitalization features are only meaningful when ``text`` still
        has its original casing; on lower-cased input they are all zero.
    """
    toks = text.split()
    n_words = len(toks)
    n_characters = len(text)
    features = {'n_words': n_words, 'n_characters': n_characters}

    # Nothing to average over: return just the two size features.
    if n_words == 0:
        return features

    # Number, which punctuation
    total_punctuation = 0
    for p in punctuation:
        p_count = text.count(p)
        features[p] = p_count
        total_punctuation += p_count
    features['avg_punctuation'] = total_punctuation / n_words

    # Capitalization
    total_capitals = sum(1 for c in text if c.isupper())
    word_initial_capitals = sum(1 for w in toks if w[0].isupper())
    features['n_capitals'] = total_capitals
    features['avg_capitalized_words'] = word_initial_capitals / n_words
    features['avg_capitalized_letters'] = total_capitals / n_characters

    # Out-of-vocabulary words (the global vocabulary is only looked up when
    # the caller did not supply one)
    vocab = en_words if vocabulary is None else vocabulary
    n_oov = sum(1 for w in toks if w not in vocab)
    features['n_oov'] = n_oov
    features['avg_oov'] = n_oov / n_words

    # Repeated characters: non-overlapping runs of the same character three
    # times in a row. Sorting keeps the key insertion order deterministic.
    total_char_repeats = 0
    for char in sorted(set(text)):
        repeated_count = text.count(char * 3)
        if repeated_count > 0:
            features[f'repeated_{char}'] = repeated_count
            total_char_repeats += repeated_count

    features['total_char_repeats'] = total_char_repeats

    return features

In [17]:
# Do not truncate long cell values when frames are displayed.
# NOTE(review): -1 is the legacy "no limit" sentinel; newer pandas releases
# expect None instead — confirm against the installed pandas version.
pd.options.display.max_colwidth = -1

In [20]:
# Peek at the raw content of the post whose index label is 1.
test = data['post_content'].loc[1]
test

'{mentions={}, format=html, title=, body=<p>Lance: you just have to be the bigger person and let it go.</p><p>Allura: Okay. *uses her shape shifting power to grow taller* I don?t see how this is supposed to make me want to let it go.</p>, note_count=0, slug=lance-you-just-have-to-be-the-bigger-person-and, tags={(voltron),(allurance),(lallura)}}'

In [66]:
# One feature dict per post, computed from the tokenized body.
# NOTE(review): post_body was lower-cased by process_text, so the
# capitalization features computed from it will all be zero — confirm
# whether that is intended.
data['style_features'] = [extract_style_features(body) for body in tqdm(data['post_body'])]

HBox(children=(IntProgress(value=0, max=569677), HTML(value='')))

In [67]:
# Pickle the frame so the per-post feature dicts keep their Python type.
# Unlike to_csv, to_pickle accepts no `index` argument.
pickle_path = f'../data/{community}_posts.pkl'
data.to_pickle(pickle_path)

TypeError: to_pickle() got an unexpected keyword argument 'index'