In [1]:
import re
def remove_html(text):
    """Strip HTML/XML-style tags from ``text``.

    Every span running from a ``<`` to the next ``>`` is deleted, including
    tags that are broken across several lines. Character entities such as
    ``&amp;`` are left untouched.

    Args:
        text: String that may contain markup.

    Returns:
        ``text`` with every ``<...>`` span removed.
    """
    # ``[^>]*`` also matches newlines (``.*?`` does not without re.DOTALL),
    # so a tag whose attributes wrap onto the next line is removed as well.
    return re.sub(r'<[^>]*>', '', text)

import nltk.tokenize as tk
def sentence_tokenize(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: The string to split.

    Returns:
        A ``(count, sentences)`` tuple: the number of sentences found and
        the sentences themselves, as returned by ``sent_tokenize``.
    """
    found = tk.sent_tokenize(text)
    total = len(found)
    return total, found

def word_tokenize(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to split.

    Returns:
        A ``(count, words)`` tuple: the number of tokens found and the
        tokens themselves, as returned by ``word_tokenize``.
    """
    tokens = tk.word_tokenize(text)
    total = len(tokens)
    return total, tokens

In [2]:
# Call on concatenation of body and title
from collections import namedtuple
# Record describing one tokenized piece of text: the sentence and word counts
# together with the corresponding sentence and word lists.
tokenized_row = namedtuple(
    'tokenized_row',
    ['sent_count', 'sentences', 'word_count', 'words'],
)

def convert_row(text):
    """Strip markup from ``text`` and tokenize it into sentences and words.

    Args:
        text: Raw string, possibly containing HTML tags.

    Returns:
        A ``tokenized_row`` holding the sentence count, the sentences, the
        word count and the words of the tag-free text.
    """
    plain = remove_html(text)
    n_sentences, sentence_list = sentence_tokenize(plain)
    n_words, word_list = word_tokenize(plain)
    return tokenized_row(
        sent_count=n_sentences,
        sentences=sentence_list,
        word_count=n_words,
        words=word_list,
    )
def build_dict(dataframe):
    """Tokenize the ``Title`` and ``Body`` columns of every row of ``dataframe``.

    Args:
        dataframe: pandas DataFrame providing ``Title`` and ``Body`` columns.

    Returns:
        A ``(token_dict, title_words, body_words)`` tuple where

        * ``token_dict`` maps each index label to a ``(title, body)`` pair of
          ``tokenized_row`` records (a repeated label keeps the last row seen),
        * ``title_words`` is the flat list of word tokens of all titles, in
          row order,
        * ``body_words`` is the flat list of word tokens of all bodies, in
          row order.
    """
    token_dict = {}
    title_words = []
    body_words = []
    # Pull each column out once and walk the rows in lockstep instead of
    # re-extracting ``.values`` and indexing by position on every iteration.
    rows = zip(dataframe.index.values,
               dataframe['Title'].values,
               dataframe['Body'].values)
    for index, raw_title, raw_body in rows:
        title = convert_row(raw_title)
        body = convert_row(raw_body)
        # extend() appends in place; rebuilding the list with ``+`` for every
        # row copies all accumulated words each time (quadratic overall).
        title_words.extend(title.words)
        body_words.extend(body.words)
        token_dict[index] = (title, body)
    return token_dict, title_words, body_words
    

In [3]:
import pickle
import pandas
import os
from sklearn.feature_extraction.text import CountVectorizer
# Pickled (train, test) pairs to preprocess. For each file, the tokenized rows
# and two fitted CountVectorizers (body / title) are written into a directory
# named after the file.
filenames = ['combined_train_test.p', 'r_train_so_test.p', 'so_train_r_test.p',
             'so_alone.p', 'reddit_alone.p']
for filename in filenames:
    # splitext removes only the trailing extension; split('.p')[0] would cut
    # the name at the first '.p' found anywhere in it.
    directory_name = os.path.splitext(filename)[0]
    if not os.path.isdir(directory_name):
        os.mkdir(directory_name)
    # NOTE(security): pickle.load can execute arbitrary code -- only load
    # files from a trusted source.
    with open(filename, 'rb') as pfile:
        train, test = pickle.load(pfile)
    body_vectorizer = CountVectorizer(stop_words='english', max_features=2**12)
    title_vectorizer = CountVectorizer(stop_words='english', max_features=2**12)
    train_token_dict, train_title_words, train_body_words = build_dict(train)
    test_token_dict, test_title_words, test_body_words = build_dict(test)
    # NOTE(review): the vocabularies are fit on train AND test tokens together,
    # with every token passed as its own document -- confirm this is intended.
    body_vectorizer.fit(train_body_words + test_body_words)
    title_vectorizer.fit(train_title_words + test_title_words)
    with open(os.path.join(directory_name, "tokenized_dict.p"), 'wb') as pfile:
        pickle.dump((train_token_dict, test_token_dict), pfile)
    with open(os.path.join(directory_name, "body_vectorizer.p"), 'wb') as pfile:
        pickle.dump(body_vectorizer, pfile)
    with open(os.path.join(directory_name, "title_vectorizer.p"), 'wb') as pfile:
        pickle.dump(title_vectorizer, pfile)

In [4]:
# Tokenize the combined dataset only and (re)write its tokenized_dict.p;
# no vectorizer is fit or saved in this cell.
for filename in ['combined_train_test.p']:
    # splitext removes only the trailing extension; split('.p')[0] would cut
    # the name at the first '.p' found anywhere in it.
    directory_name = os.path.splitext(filename)[0]
    if not os.path.isdir(directory_name):
        os.mkdir(directory_name)
    # NOTE(security): pickle.load can execute arbitrary code -- only load
    # files from a trusted source.
    with open(filename, 'rb') as pfile:
        train, test = pickle.load(pfile)
    train_token_dict, train_title_words, train_body_words = build_dict(train)
    test_token_dict, test_title_words, test_body_words = build_dict(test)
    with open(os.path.join(directory_name, "tokenized_dict.p"), 'wb') as pfile:
        pickle.dump((train_token_dict, test_token_dict), pfile)