## Load Libraries and Helper Functions

In [41]:
# The future
#from __future__ import print_function, division, absolute_import

# Data wrangling libraries
import pandas as pd
import numpy as np
import re
from io import StringIO

import matplotlib.pyplot as plt
import glob as glob
import pickle as pickle

# Numpy shorthand stuff
from numpy import array

# NLTK shorthand stuff
import nltk
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split


%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     /home/chadharness/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [238]:
# Borrowed some functions from the w266 utils.py file
# Miscellaneous helpers
def flatten(list_of_lists):
    """Flatten a list-of-lists into a single list."""
    return list(itertools.chain.from_iterable(list_of_lists))


# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
        #word = re.sub(r"(DG)+", "DG", word)
    return word


def canonicalize_word(word, wordset=None, digits=True):
    #word = re.sub(r":","",word)
    #word = re.sub(r"https?","",word)
    #word = re.sub(r"\/","",word)
    #word = re.sub(r"@","",word)
    #word = re.sub(r"/\U0001.?'","",word)
    #replace hyperlinks with one instance of "postedhyperlinkvalue"
    word = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?\S*", "postedhyperlinkvalue", word)
    word = re.sub(r"(postedhyperlinkvalue)+", "postedhyperlinkvalue", word)
    #only lower case words (2 letters or longer) that are not all upper case
    if not word.isupper() or len(word) == 1:
        word = word.lower()
    #replace things like haha with ha
    word = re.sub(r"([a-z]{2,})\1{2,}", r"\1", word)
    #replace any three consecutive, identical letters with two instances of that letter
    word = re.sub(r"([a-z])\1{2,}", r"\1\1", word)
    #replace any two consecutive, identical consonants at the beginning of a string with one of that consonant
    word = re.sub(r"(^[^aeiou])\1{1,}", r"\1", word)
    
    #replace digits with a stand-in token
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return constants.UNK_TOKEN

    
def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]


# Made some helper functions of our own
from nltk.stem import PorterStemmer   
def stem_sentence(token_sent, stemmer=PorterStemmer()):
    stem_token_sent = []
    for word in token_sent:
        stem_token_sent.append(stemmer.stem(word))
    return stem_token_sent


def sent_plus_word_tokenize(series):
    sentences = []
    words = []
    
    for comment in series:
        sentences.append(sent_tokenize(comment))
    
    flat_sentences = [item for sublist in sentences for item in sublist]
    
    for comment_sentence in flat_sentences:
        words.append(word_tokenize(comment_sentence))
    
    return sentences, words


def make_data(data, target='', commentfield='', tokenizer=TweetTokenizer(), canonize=True, stem=True):      
    # Separate comments
    comments = data.loc[:, commentfield]
    #comments = data.loc[:, 'comment_body']
    #labels = data.loc[:, target]
    
    # Convert to list
    comment_list = comments.values.tolist()
    
    # Tokenize comments
    tokenizer = tokenizer
    # A list of lists of tokenized sentences: word == string/token; sentence == list of string/tokens
    tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in comment_list]
    #tokenized_sentences_x = [tokenizer.tokenize(sentence) for sentence in comment_list]
    #tokenized_sentences = []
    #sentence = []
    #last_tok = ''
    #for comment in tokenized_sentences_x:
    #    for tok in comment:
    #        if last_tok in ('http', 'https',':','http:','https:','@'):
    #            tok = last_tok + tok
    #        if tok in ('http', 'https',':', '@'):
    #            last_tok = tok
    #        else:
    #            last_tok = ''
    #            sentence.append(tok)
    #    tokenized_sentences.append(sentence)
    
    if stem:
        # Stem words
        comments_stem = []
        for sentence in tokenized_sentences:
            x_tokens_stem = stem_sentence(token_sent=sentence, stemmer=PorterStemmer())
            comments_stem.append(x_tokens_stem)
        tokenized_sentences = comments_stem
    
    if canonize:
        # Canonize words
        comments_canon = []
        for sentence in tokenized_sentences:
            x_tokens_canon = canonicalize_words(sentence)
            comments_canon.append(x_tokens_canon)
        # A list of lists of scrubbed tokens; token == word, list == sentence
        tokenized_sentences = comments_canon
    
    x_tokens = tokenized_sentences  
    #return comments, x_tokens, labels
    return comments, x_tokens


def rawlist_to_xtokens(rawlist=['default arg'], vocab_list=[]):
    xtokens = []
    for rawstring in rawlist:
        xtoken = list(filter(lambda x: x in vocab_list, rawstring.split()))
        xtokens.append(xtoken)   
    return xtokens


def xtoken_to_raw(xtoken=['default','arg']):  
    raw_text_string = ' '.join(xtoken)
    return raw_text_string


def raw_to_xtoken(raw_string='default arg'):
    xtoken = raw_string.split()
    return xtoken


def model_diagnostics(model, data, labels, target_names, random=False, test_size=0.10, random_state=42):
    
    # Split into test and train
    # Designate random test_size% of data (rounded up to next obs) as test data
    if random:
        train_data, test_data, train_labels, test_labels = train_test_split(data, labels, 
                                                                            test_size=test_size, 
                                                                            random_state=random_state)
    # Designate last test_size% of data (rounded up to next obs) as test data 
    else:
        idx = round(test_size*len(data))
        test_data = data[-idx:]
        test_labels = labels[-idx:]
    
    
    pred_labels = model.predict_classes(test_data)
    
    print("Test data length is: ", len(test_data))
    print("Test label length is: ", len(test_labels))
    print("Pred label length is: ", len(pred_labels))
    
    confusionMatrix = metrics.confusion_matrix(test_labels, pred_labels)
    classificationReport = classification_report(test_labels, pred_labels, target_names=target_names)
    
    return confusionMatrix, classificationReport


# Function to aggregate all of the comments for a given subreddit(s)

# Data location example: reddit data for March 2018 downloaded to ~/parlancr/data/reddit/2018_03/
# File names: reddit_2018_03000000000000.csv - reddit_2018_03000000000047.csv

def export_subreddits(subs):
    
    selected_subreddits = pd.DataFrame()
    file_stem = './data/reddit/*/reddit_*.csv'
    
    for f in sorted(glob.glob(file_stem)):
        
        print('Loading comments from: ', f)
        partition_comments = pd.read_csv(f)
        selected_subreddits = selected_subreddits.append(partition_comments[partition_comments['subreddit'].isin(subs)], ignore_index = True)
        
    return selected_subreddits


def build_model_input(pandas_df, commentfield, post_length, sent_length):
    
    sentences, tokens = sent_plus_word_tokenize(pandas_df[commentfield].dropna().values)
    
    sents = [sents for line in sentences for sents in line if len(line) <= post_length]
    
    sents_pd = pd.DataFrame({commentfield:sents})
    
    comments, x_tokens = make_data(sents_pd, commentfield=commentfield, canonize=True, stem=False, tokenizer=TweetTokenizer())
    
    tokens = [sent for sent in x_tokens if len(sent) <= sent_length]
    
    raw_list = list(map(xtoken_to_raw, tokens))
    
    pd_final = pd.DataFrame({commentfield:raw_list})
    
    train, validate, test = np.split(pd_final.sample(frac=1), [int(.6*len(pd_final)), int(.8*len(pd_final))])
    
    return train, validate, test, pd_final
    
    
    

## Load Data

In [4]:
!pwd

/home/chadharness/w210/Parlancr/data_pipeline


In [3]:
# Load pickled data frames

#reddit_all = pd.read_pickle('./data/reddit/subsets/reddit_data_all-teenagers-The_Donald.pkl')

#twitter_trump = pd.read_pickle('./data/twitter/trump_tweets/trump_tweet_data.pkl')

reddit_all = pd.read_pickle('../../data/reddit_data_all-teenagers-The_Donald.pkl')

#reddit_pol = pd.read_pickle('../../data/reddit_data_all-political.pkl')

In [9]:
reddit_all.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
0,"Hey man, no need to be rude",,,,thiccbagel,,,1508889606,t5_2rjli,t3_78jxo4,t1_doue4fo,14.0,1510153000.0,0,0,doue4rn,teenagers,,,
1,My eye,,,,SoFunnyLookin,USA,,1508889607,t5_38unr,t3_78jt7s,t1_doudf9c,5.0,1510153000.0,0,0,doue4tr,The_Donald,,,MURICA
2,[deleted],,,,[deleted],,,1508889609,t5_38unr,t3_78hszx,t3_78hszx,1.0,1510153000.0,0,0,doue4vo,The_Donald,,,
3,[removed],,,,[deleted],,,1508889611,t5_38unr,t3_78j6c0,t1_doudnhp,1.0,1510153000.0,0,0,doue4x4,The_Donald,,,
4,Why did they drop bomb? Is there even more in...,,,,superguyguy,,,1508889612,t5_38unr,t3_78jijg,t3_78jijg,2.0,1510153000.0,0,0,doue4xz,The_Donald,,,


In [10]:
reddit_all.count()

body                      8227161
score_hidden                    0
archived                        0
name                            0
author                    8227305
author_flair_text         4555520
downs                           0
created_utc               8227305
subreddit_id              8227305
link_id                   8227305
parent_id                 8227305
score                     8227305
retrieved_on              8227305
controversiality          8227305
gilded                    8227305
id                        8227305
subreddit                 8227305
ups                             0
distinguished               23321
author_flair_css_class    2833388
dtype: int64

In [11]:
# Remove problematic comments
reddit_all = reddit_all[reddit_all.body.notnull()]
reddit_all = reddit_all[reddit_all.body != '']
reddit_all = reddit_all[reddit_all.body != "no value"]
reddit_all = reddit_all[reddit_all.body !='[removed]']
reddit_all = reddit_all[reddit_all.body != '[deleted]']
reddit_all = reddit_all[~reddit_all.body.str.contains("\^This\^message\^was\^created\^by\^a\^bot")]
reddit_all = reddit_all[~reddit_all.body.str.contains("\*\*Please review the rules for")]
reddit_all = reddit_all[reddit_all.notnull()]

reddit_all.count()

body                      7610939
score_hidden                    0
archived                        0
name                            0
author                    7610939
author_flair_text         4555341
downs                           0
created_utc               7610939
subreddit_id              7610939
link_id                   7610939
parent_id                 7610939
score                     7610939
retrieved_on              7610939
controversiality          7610939
gilded                    7610939
id                        7610939
subreddit                 7610939
ups                             0
distinguished               23321
author_flair_css_class    2833336
dtype: int64

In [5]:
# Load pickled data frames

#reddit_all = pd.read_pickle('./data/reddit/subsets/reddit_data_all-teenagers-The_Donald.pkl')

#twitter_trump = pd.read_pickle('./data/twitter/trump_tweets/trump_tweet_data.pkl')

#reddit_all = pd.read_pickle('../../data/reddit_data_all-teenagers-The_Donald.pkl')

reddit_pol = pd.read_pickle('../../data/reddit_data_all-political.pkl')

In [12]:
reddit_pol.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
0,Also the only president to be attacked by a ro...,,,,xo13579,,,1508889603,t5_2qn70,t3_78ijb3,t3_78ijb3,1.0,1510153000.0,0,0,doue4og,democrats,,,
1,[deleted],,,,[deleted],,,1508889604,t5_2qh3l,t3_78hsqz,t1_doud1gs,1.0,1510153000.0,0,0,doue4pk,news,,,
2,don’t know much about that but rochelle might ...,,,,Jailhaus,,,1508889604,t5_2qh3l,t3_78jpym,t3_78jpym,0.0,1510153000.0,0,0,doue4q1,news,,,
3,"It's still a good thing. For one, it lets you ...",,,,Han55512,,,1508889605,t5_2qh13,t3_78ih3b,t1_dou3lrt,1.0,1510153000.0,0,0,doue4qk,worldnews,,,
4,[deleted],,,,[deleted],,,1508889605,t5_2qh13,t3_78ih3b,t1_doucs5h,1.0,1510153000.0,0,0,doue4qu,worldnews,,,


In [13]:
reddit_pol.count()

body                      10095309
score_hidden                     0
archived                         0
name                             0
author                    10095377
author_flair_text            87581
downs                            0
created_utc               10095377
subreddit_id              10095377
link_id                   10095377
parent_id                 10095377
score                     10095377
retrieved_on              10095377
controversiality          10095377
gilded                    10095377
id                        10095377
subreddit                 10095377
ups                              0
distinguished                52521
author_flair_css_class       14705
dtype: int64

In [6]:
# Remove problematic comments
reddit_pol = reddit_pol[reddit_pol.body.notnull()]
reddit_pol = reddit_pol[reddit_pol.body != '']
reddit_pol = reddit_pol[reddit_pol.body != "no value"]
reddit_pol = reddit_pol[reddit_pol.body !='[removed]']
reddit_pol = reddit_pol[reddit_pol.body != '[deleted]']
reddit_pol = reddit_pol[~reddit_pol.body.str.contains("\^This\^message\^was\^created\^by\^a\^bot")]
reddit_pol = reddit_pol[~reddit_pol.body.str.contains("\*\*Please review the rules for")]
reddit_pol = reddit_pol[reddit_pol.notnull()]

reddit_pol.count()

body                      8790896
score_hidden                    0
archived                        0
name                            0
author                    8790896
author_flair_text           87581
downs                           0
created_utc               8790896
subreddit_id              8790896
link_id                   8790896
parent_id                 8790896
score                     8790896
retrieved_on              8790896
controversiality          8790896
gilded                    8790896
id                        8790896
subreddit                 8790896
ups                             0
distinguished               52521
author_flair_css_class      14705
dtype: int64

##### Subreddits in `reddit_data_all-political.pkl`
* `r/Liberal`      _(n=13,449)_
* `r/Conservative` _(n=275,683)_
* `r/Republican`   _(n=14,633)_
* `r/democrats`    _(n=35,056)_
* `r/news`         _(n=4,332,619)_
* `r/worldnews`    _(n=4,119,456)_

In [21]:
reddit_libs = reddit_pol[reddit_pol.subreddit == 'Liberal']

In [12]:
reddit_libs.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
234,You just ignored what I said and then repeated...,,,,nationalistsareRINOs,,,1508889913,t5_2qxt5,t3_76nxe8,t1_doix4hy,1.0,1510154000.0,0,0,douee2k,Liberal,,,
5515,The 'murican problem is the anti-democratic Ko...,,,,EyeOfTheBeast,,,1508897487,t5_2qxt5,t3_77wpbq,t3_77wpbq,1.0,1510157000.0,0,0,doul2sv,Liberal,,,
8320,"Stay home, The Donald doesn’t need you help",,,,ynotfker,,,1508901854,t5_2qxt5,t3_7877lj,t3_7877lj,1.0,1510159000.0,0,0,douolfx,Liberal,,,
8898,Curious to know why people down voted this. Th...,,,,SwampShillin,,,1508902772,t5_2qxt5,t3_78iiqx,t3_78iiqx,3.0,1510159000.0,0,0,doup9cm,Liberal,,,
9087,Clearly he does. But that's what we get for el...,,,,nightness,,,1508903135,t5_2qxt5,t3_7877lj,t1_douolfx,3.0,1510159000.0,0,0,doupioi,Liberal,,,


In [22]:
reddit_libs.count()

body                      13449
score_hidden                  0
archived                      0
name                          0
author                    13449
author_flair_text             9
downs                         0
created_utc               13449
subreddit_id              13449
link_id                   13449
parent_id                 13449
score                     13449
retrieved_on              13449
controversiality          13449
gilded                    13449
id                        13449
subreddit                 13449
ups                           0
distinguished                 4
author_flair_css_class        0
dtype: int64

In [279]:
reddit_cons = reddit_pol[reddit_pol.subreddit == 'Conservative']

In [280]:
reddit_cons.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
50,Catholic student group at the oldest Catholic ...,,,,YankeeBlues21,,,1508889664,t5_2qh6p,t3_78jxnk,t3_78jxnk,9.0,1510153000.0,0,0,doue6jk,Conservative,,,
51,"I'm no troll and until Trump was nominated, I...",,,,VSxTravesty,,,1508889664,t5_2qh6p,t3_78hxuc,t1_doudbxz,54.0,1510153000.0,0,0,doue6kc,Conservative,,,
63,Jesus Christ dude. Todays news cycle is one th...,,,,dlc_protocol,,,1508889686,t5_2qh6p,t3_78jrry,t3_78jrry,10.0,1510153000.0,0,0,doue775,Conservative,,,
98,"Sadly, I hope this doesn't progress. None of ...",,,,xtehh,,,1508889727,t5_2qh6p,t3_78ify8,t3_78ify8,1.0,1510153000.0,0,0,doue8gw,Conservative,,,
124,"As an outsider, I would love for you to explai...",,,,exocortex,,,1508889752,t5_2qh6p,t3_78i6b1,t1_dou3boi,6.0,1510153000.0,0,0,doue978,Conservative,,,


In [281]:
reddit_cons.count()

body                      275683
score_hidden                   0
archived                       0
name                           0
author                    275683
author_flair_text          83971
downs                          0
created_utc               275683
subreddit_id              275683
link_id                   275683
parent_id                 275683
score                     275683
retrieved_on              275683
controversiality          275683
gilded                    275683
id                        275683
subreddit                 275683
ups                            0
distinguished                606
author_flair_css_class     13794
dtype: int64

In [181]:
reddit_repubs = reddit_pol[reddit_pol.subreddit == 'Republican']

In [182]:
reddit_repubs.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
980,Where does this fabled free trade exist?,,,,ozric101,,,1508890947,t5_2qndt,t3_78ihf8,t3_78ihf8,4.0,1510154000.0,0,0,doufbeh,Republican,,,
1161,/r/Republican is a partisan subreddit. This i...,,,,AutoModerator,,,1508891177,t5_2qndt,t3_78k3q0,t3_78k3q0,1.0,1510154000.0,0,0,doufirq,Republican,,moderator,
1327,"Flake was abandoned by the new GOP, he's an ol...",,,,thehonbtw,Libertarian,,1508891426,t5_2qndt,t3_78i6ey,t1_dou6t03,35.0,1510154000.0,0,0,doufqqw,Republican,,,
1730,"As a Libertarian Republican, not a proggie, I ...",,,,thehonbtw,Libertarian,,1508892018,t5_2qndt,t3_77zffm,t1_dopu70v,1.0,1510155000.0,0,0,doug9lo,Republican,,,
2232,"&gt; Correct, a true Conservative who is on th...",,,,haldir2012,,,1508892774,t5_2qndt,t3_78i6ey,t1_dou5gxa,22.0,1510155000.0,0,0,dougxjr,Republican,,,


In [183]:
reddit_repubs.count()

body                      14633
score_hidden                  0
archived                      0
name                          0
author                    14633
author_flair_text          2620
downs                         0
created_utc               14633
subreddit_id              14633
link_id                   14633
parent_id                 14633
score                     14633
retrieved_on              14633
controversiality          14633
gilded                    14633
id                        14633
subreddit                 14633
ups                           0
distinguished              2340
author_flair_css_class        1
dtype: int64

In [10]:
reddit_dems = reddit_pol[reddit_pol.subreddit == 'democrats']

In [11]:
reddit_dems.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
0,Also the only president to be attacked by a ro...,,,,xo13579,,,1508889603,t5_2qn70,t3_78ijb3,t3_78ijb3,1.0,1510153000.0,0,0,doue4og,democrats,,,
14,I don't understand how it could possibly be in...,,,,last_minutiae,,,1508889618,t5_2qn70,t3_78f7b3,t1_dotxbdp,1.0,1510153000.0,0,0,doue550,democrats,,,
502,It is. Don’t you see other posts?,,,,VegaThePunisher,,,1508890299,t5_2qn70,t3_789oai,t1_dou9cts,1.0,1510154000.0,0,0,doueqec,democrats,,,
821,"this is weird. your son died... here, have som...",,,,jarek99,,,1508890715,t5_2qn70,t3_78f7b3,t3_78f7b3,1.0,1510154000.0,0,0,douf3wz,democrats,,,
2238,You’re right it says please. Also the great th...,,,,FinallyGotMyGrade10,,,1508892782,t5_2qn70,t3_77t0k4,t1_doqxkht,1.0,1510155000.0,0,0,dougxsn,democrats,,,


In [13]:
reddit_dems.count()

body                      35056
score_hidden                  0
archived                      0
name                          0
author                    35056
author_flair_text           726
downs                         0
created_utc               35056
subreddit_id              35056
link_id                   35056
parent_id                 35056
score                     35056
retrieved_on              35056
controversiality          35056
gilded                    35056
id                        35056
subreddit                 35056
ups                           0
distinguished               792
author_flair_css_class      726
dtype: int64

In [14]:
reddit_news = reddit_pol[reddit_pol.subreddit == 'news']

In [15]:
reddit_news.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
2,don’t know much about that but rochelle might ...,,,,Jailhaus,,,1508889604,t5_2qh3l,t3_78jpym,t3_78jpym,0.0,1510153000.0,0,0,doue4q1,news,,,
8,*Demons run when a good man goes to war*\n\n*N...,,,,fullforce098,,,1508889607,t5_2qh3l,t3_78gumb,t1_dou4k7m,9.0,1510153000.0,0,0,doue4tv,news,,,
9,"kids that studies way too much, Asian kids.",,,,Edogawa1983,,,1508889609,t5_2qh3l,t3_78hsqz,t1_dou58l8,1.0,1510153000.0,0,0,doue4vf,news,,,
10,I'm sorry if I've not expressed my recognition...,,,,Smudge_SMJ_,,,1508889612,t5_2qh3l,t3_78gumb,t1_doub9gg,2.0,1510153000.0,0,0,doue4z2,news,,,
11,The REAL world. Chaos and anarchy...a cross bw...,,,,phbalanced1,,,1508889615,t5_2qh3l,t3_78gckt,t1_dotlip9,1.0,1510153000.0,0,0,doue52k,news,,,


In [16]:
reddit_news.count()

body                      4332619
score_hidden                    0
archived                        0
name                            0
author                    4332619
author_flair_text              25
downs                           0
created_utc               4332619
subreddit_id              4332619
link_id                   4332619
parent_id                 4332619
score                     4332619
retrieved_on              4332619
controversiality          4332619
gilded                    4332619
id                        4332619
subreddit                 4332619
ups                             0
distinguished                  98
author_flair_css_class         25
dtype: int64

In [17]:
reddit_wnews = reddit_pol[reddit_pol.subreddit == 'worldnews']

In [18]:
reddit_wnews.head()

Unnamed: 0,body,score_hidden,archived,name,author,author_flair_text,downs,created_utc,subreddit_id,link_id,parent_id,score,retrieved_on,controversiality,gilded,id,subreddit,ups,distinguished,author_flair_css_class
3,"It's still a good thing. For one, it lets you ...",,,,Han55512,,,1508889605,t5_2qh13,t3_78ih3b,t1_dou3lrt,1.0,1510153000.0,0,0,doue4qk,worldnews,,,
5,That's not quite how adblockers work. A brows...,,,,rrawk,,,1508889605,t5_2qh13,t3_78ih3b,t1_doudv5w,-2.0,1510153000.0,1,0,doue4r3,worldnews,,,
6,"It's like some bad movie you see on SyFy.\n\n""...",,,,Fiat-Libertas,,,1508889605,t5_2qh13,t3_78ih3b,t1_doudtf8,0.0,1510153000.0,0,0,doue4r7,worldnews,,,
17,"This makes no sense, anti Monsanto stuff is ma...",,,,factbasedorGTFO,,,1508889624,t5_2qh13,t3_78fiks,t1_dou083h,0.0,1510153000.0,1,0,doue5cb,worldnews,,,
19,Plus look how many millions of gallons they bu...,,,,SpermJacker,,,1508889625,t5_2qh13,t3_78hosa,t1_dou90w9,1.0,1510153000.0,0,0,doue5d1,worldnews,,,


In [19]:
reddit_wnews.count()

body                      4119456
score_hidden                    0
archived                        0
name                            0
author                    4119456
author_flair_text             230
downs                           0
created_utc               4119456
subreddit_id              4119456
link_id                   4119456
parent_id                 4119456
score                     4119456
retrieved_on              4119456
controversiality          4119456
gilded                    4119456
id                        4119456
subreddit                 4119456
ups                             0
distinguished               48681
author_flair_css_class        159
dtype: int64

In [None]:
# Load trump tweet data from csv file and create data frame
# Store the data frame as a pickle / .pkl file for later EDA

trump_tweet_data = pd.read_csv('./data/twitter/trump_tweets/tweets.csv')

trump_tweet_data.to_pickle('./data/twitter/trump_tweets/trump_tweet_data.pkl')

In [None]:
# Create data frame with all of the selected subreddit's comments
# Store the filtered data frame as a pickle / .pkl file for later EDA

reddit_data = export_subreddits(subs = ['Republican','democrats','Conservative','Liberal','worldnews','news'])

print(reddit_data['subreddit'].value_counts())

In [None]:
reddit_data = export_subreddits(subs = ['teenagers','The_Donald'])

print(reddit_data['subreddit'].value_counts())

In [None]:
# Filtering r/teenagers data to include only flared author's comments and loading The_Donald

reddit_teenagers_all = reddit_all[reddit_all['subreddit'].isin(['teenagers'])]

reddit_teenagers_filtered = reddit_teenagers_all[reddit_teenagers_all['author_flair_text'].str.match('1\d', na=False)]

reddit_theDonald = reddit_all[reddit_all['subreddit'].isin(['The_Donald'])]

In [None]:
# Filtering r/teenagers data to include only flared author's comments and loading The_Donald

reddit_teenagers_all = reddit_all[reddit_all['subreddit'].isin(['teenagers'])]

reddit_teenagers_filtered = reddit_teenagers_all[reddit_teenagers_all['author_flair_text'].str.match('1\d', na=False)]

reddit_theDonald = reddit_all[reddit_all['subreddit'].isin(['The_Donald'])]

In [None]:
# Preliminary EDA


print('r/teenagers (with flair filter): ' + str(reddit_teenagers_filtered.shape[0]) + ' comments')
print('r/The_Donald: ' + str(reddit_theDonald.shape[0]) + ' comments')
print('Trump Tweets: ' + str(twitter_trump.shape[0]) + ' tweets')

# Histograms of comment word length by source

plt.figure()
reddit_teenagers_filtered['body'].str.count(' ').plot.hist(color='k', 
                                                           alpha=0.5, 
                                                           bins=[0, 5, 10, 20, 25, 50, 75, 100], 
                                                           xlim = (0,100), 
                                                           xticks = [0, 5, 10, 20, 25, 50, 75, 100],
                                                           title = 'r/teenagers comment word count frequency'
                                                          )

plt.figure()
reddit_theDonald['body'].str.count(' ').plot.hist(color='k', 
                                                           alpha=0.5, 
                                                           bins=[0, 5, 10, 20, 25, 50, 75, 100], 
                                                           xlim = (0,100), 
                                                           xticks = [0, 5, 10, 20, 25, 50, 75, 100],
                                                           title = 'r/The_Donald comment word count frequency'
                                                          )

plt.figure()
twitter_trump['text'].str.count(' ').plot.hist(color='k', 
                                                           alpha=0.5, 
                                                           bins=[0, 5, 10, 20, 25, 50, 75, 100], 
                                                           xlim = (0,100), 
                                                           xticks = [0, 5, 10, 20, 25, 50, 75, 100],
                                                           title = 'Trump tweet word count frequency'
                                                          )


plt.show()

# Count of comments between 5 - 50 words from each source

print('r/teenagers (with flair filter): ' + str(reddit_teenagers_filtered[reddit_teenagers_filtered['body'].str.count(' ').between(5,50)].shape[0]) + ' comments between 5 - 50 words')
print('r/The_Donald: ' + str(reddit_theDonald[reddit_theDonald['body'].str.count(' ').between(5,50)].shape[0]) + ' comments between 5 - 50 words')
print('Trump Tweets: ' + str(twitter_trump[twitter_trump['text'].str.count(' ').between(5,50)].shape[0]) + ' tweets between 5 - 50 words')


In [4]:
reddit_all.shape

(8227305, 20)

In [6]:
reddit_pol.shape

(10095377, 20)

### Clean Political Reddit Comments

#### Liberals

In [43]:
libs_sentences, libs_tokens = sent_plus_word_tokenize(reddit_libs['body'].dropna().values)

In [47]:
len(libs_sentences)

13449

In [78]:
libs_sentences[1000]

["I think there's some truth to that.",
 "Look what's happening in Alabama.",
 'Moore has the GOP split and his opponent is within striking distance of winning.',
 "The fact that there's such a close election in Alabama is telling in itself."]

In [124]:
#word_list = [word for line in sentence for word in line.split()]
libs_sents = [sents for line in libs_sentences for sents in line if len(line) <= 10]

In [125]:
len(libs_sents)

32688

In [126]:
libs_sents[1004]

'ReXXon Mobil Drillerson \n\nFTW'

In [127]:
libs_sents_pd = pd.DataFrame({'text':libs_sents})
#trump = pd.DataFrame(trump_raw_list)

In [128]:
libs_sents_pd.head()

Unnamed: 0,text
0,You just ignored what I said and then repeated...
1,The 'murican problem is the anti-democratic Ko...
2,"Stay home, The Donald doesn’t need you help"
3,Curious to know why people down voted this.
4,The author is simply stating facts that we nee...


In [129]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
libs_comments, libs_x_tokens = make_data(libs_sents_pd, commentfield='text', canonize=True, stem=False, tokenizer=TweetTokenizer())

In [130]:
libs_comments.head()

0    You just ignored what I said and then repeated...
1    The 'murican problem is the anti-democratic Ko...
2          Stay home, The Donald doesn’t need you help
3          Curious to know why people down voted this.
4    The author is simply stating facts that we nee...
Name: text, dtype: object

In [131]:
len(libs_x_tokens)

32688

In [142]:
libs_x_tokens[36]

['if',
 'you',
 "haven't",
 'seen',
 'this',
 'guy',
 ',',
 '[',
 "he's",
 'brilliant',
 ']',
 '(',
 'postedhyperlinkvalue',
 ')',
 '.']

In [133]:
libs_tokens = [sent for sent in libs_x_tokens if len(sent) <= 20]

In [134]:
len(libs_tokens)

24896

In [159]:
libs_tokens[25]

['if',
 'you',
 "haven't",
 'seen',
 'this',
 'guy',
 ',',
 '[',
 "he's",
 'brilliant',
 ']',
 '(',
 'postedhyperlinkvalue',
 ')',
 '.']

In [136]:
libs_raw_list = list(map(xtoken_to_raw, libs_tokens))

In [160]:
libs_raw_list[25]

"if you haven't seen this guy , [ he's brilliant ] ( postedhyperlinkvalue ) ."

In [161]:
libs = pd.DataFrame({'text':libs_raw_list})
#trump = pd.DataFrame(trump_raw_list)

In [162]:
libs.head()

Unnamed: 0,text
0,you just ignored what i said and then repeated...
1,the ' murican problem is the anti-democratic k...
2,"stay home , the donald doesn ’ t need you help"
3,curious to know why people down voted this .
4,the author is simply stating facts that we nee...


In [163]:
libs.count()

text    24896
dtype: int64

In [164]:
libs.text[25]

"if you haven't seen this guy , [ he's brilliant ] ( postedhyperlinkvalue ) ."

In [173]:
libs_train, libs_validate, libs_test, libs_pd = build_model_input(pandas_df=reddit_libs, commentfield='body', post_length=10, sent_length=20)

In [168]:
libs_pd.head()

Unnamed: 0,body
0,you just ignored what i said and then repeated...
1,the ' murican problem is the anti-democratic k...
2,"stay home , the donald doesn ’ t need you help"
3,curious to know why people down voted this .
4,the author is simply stating facts that we nee...


In [169]:
libs_pd.count()

body    24896
dtype: int64

In [177]:
libs_pd.body[25]

"if you haven't seen this guy , [ he's brilliant ] ( postedhyperlinkvalue ) ."

In [174]:
libs_train.count()

body    14937
dtype: int64

In [175]:
libs_test.count()

body    4980
dtype: int64

In [176]:
libs_validate.count()

body    4979
dtype: int64

In [178]:
libs_train.to_csv('../../data/reddit_liberals.train.0', sep='\t', index=False, header=False)

In [179]:
libs_test.to_csv('../../data/reddit_liberals.test.0', sep='\t', index=False, header=False)

In [180]:
libs_validate.to_csv('../../data/reddit_liberals.dev.0', sep='\t', index=False, header=False)

#### Democrats

In [249]:
dems_train, dems_validate, dems_test, dems_pd = build_model_input(pandas_df=reddit_dems, commentfield='body', post_length=10, sent_length=20)

In [250]:
dems_pd.head()

Unnamed: 0,body
0,also the only president to be attacked by a ro...
1,i don't understand how it could possibly be in...
2,it is .
3,don ’ t you see other posts ?
4,this is weird .


In [251]:
dems_pd.count()

body    65836
dtype: int64

In [274]:
dems_pd.body[2000]

"just because hate groups are being banned doesn't mean your brony subreddit is going to be banned ."

In [275]:
dems_train.count()

body    39501
dtype: int64

In [276]:
dems_test.count()

body    13168
dtype: int64

In [277]:
dems_validate.count()

body    13167
dtype: int64

In [178]:
dems_train.to_csv('../../data/reddit_democrats.train.0', sep='\t', index=False, header=False)

In [179]:
dems_test.to_csv('../../data/reddit_democrats.test.0', sep='\t', index=False, header=False)

In [180]:
dems_validate.to_csv('../../data/reddit_democrats.dev.0', sep='\t', index=False, header=False)

#### Conservatives

In [282]:
cons_train, cons_validate, cons_test, cons_pd = build_model_input(pandas_df=reddit_cons, commentfield='body', post_length=10, sent_length=20)

In [283]:
cons_pd.head()

Unnamed: 0,body
0,#nottheonion
1,"i'm no troll and until trump was nominated , i..."
2,president obama was the first to make the prob...
3,jesus christ dude .
4,todays news cycle is one that just wont stop g...


In [284]:
cons_pd.count()

body    494021
dtype: int64

In [285]:
cons_pd.body[25]

'can you give me any pre-trump sources ?'

In [286]:
cons_train.count()

body    296412
dtype: int64

In [287]:
cons_test.count()

body    98805
dtype: int64

In [288]:
cons_validate.count()

body    98804
dtype: int64

In [178]:
cons_train.to_csv('../../data/reddit_conservatives.train.0', sep='\t', index=False, header=False)

In [179]:
cons_test.to_csv('../../data/reddit_conservatives.test.0', sep='\t', index=False, header=False)

In [180]:
cons_validate.to_csv('../../data/reddit_conservatives.dev.0', sep='\t', index=False, header=False)

#### Republicans

In [239]:
repubs_train, repubs_validate, repubs_test, repubs_pd = build_model_input(pandas_df=reddit_repubs, commentfield='body', post_length=10, sent_length=20)

In [240]:
repubs_pd.head()

Unnamed: 0,body
0,where does this fabled free trade exist ?
1,/ r / republican is a partisan subreddit .
2,this is a place for republicans to discuss iss...
3,"] ( postedhyperlinkvalue ) * i am a bot , and ..."
4,*


In [241]:
repubs_pd.count()

body    31683
dtype: int64

In [242]:
repubs_pd.body[43]

'i could see that being a DGDGDGDG play to get more house and senate seats .'

In [243]:
repubs_train.count()

body    19009
dtype: int64

In [244]:
repubs_test.count()

body    6337
dtype: int64

In [245]:
repubs_validate.count()

body    6337
dtype: int64

In [246]:
repubs_train.to_csv('../../data/reddit_republicans.train.0', sep='\t', index=False, header=False)

In [247]:
repubs_test.to_csv('../../data/reddit_republicans.test.0', sep='\t', index=False, header=False)

In [248]:
repubs_validate.to_csv('../../data/reddit_republicans.dev.0', sep='\t', index=False, header=False)

### Construct v0 and v1 to Train Model

In [306]:
#Concatenate dataframes
#Train
train0 = pd.concat([repubs_train, cons_train])
train1 = pd.concat([dems_train, libs_train])
#Test
test0 = pd.concat([repubs_test, cons_test])
test1 = pd.concat([dems_test, libs_test])
#Dev
dev0 = pd.concat([repubs_validate, cons_validate])
dev1 = pd.concat([dems_validate, libs_validate])

In [307]:
train0.count()

body    315421
dtype: int64

In [308]:
train1.count()

body    54438
dtype: int64

In [309]:
train0, discard = np.split(train0.sample(frac=1), [int(.17259*len(train0))])

In [310]:
train0.count()

body    54438
dtype: int64

In [311]:
test0.count()

body    105142
dtype: int64

In [312]:
test1.count()

body    18148
dtype: int64

In [313]:
test0, discard = np.split(test0.sample(frac=1), [int(.17260*len(test0))])

In [314]:
test0.count()

body    18147
dtype: int64

In [315]:
dev0.count()

body    105141
dtype: int64

In [316]:
dev1.count()

body    18146
dtype: int64

In [317]:
dev0, discard = np.split(dev0.sample(frac=1), [int(.17259*len(dev0))])

In [318]:
dev0.count()

body    18146
dtype: int64

In [319]:
train0.to_csv('../../data/reddit_repubscons.train.0', sep='\t', index=False, header=False)

In [320]:
test0.to_csv('../../data/reddit_repubscons.test.0', sep='\t', index=False, header=False)

In [321]:
dev0.to_csv('../../data/reddit_repubscons.dev.0', sep='\t', index=False, header=False)

In [322]:
train1.to_csv('../../data/reddit_repubscons.train.1', sep='\t', index=False, header=False)

In [323]:
test1.to_csv('../../data/reddit_repubscons.test.1', sep='\t', index=False, header=False)

In [324]:
dev1.to_csv('../../data/reddit_repubscons.dev.1', sep='\t', index=False, header=False)

### Clean Trump Reddit Comments

In [10]:
theD = pd.read_csv('../../data/reddit_theDonald_sentences.csv')
theD.shape

(11821939, 1)

In [33]:
smpl, discard = np.split(theD.sample(frac=1), [int(.15365*len(theD))])

In [23]:
smpl.head()

Unnamed: 0,text
9302680,i just found this one at time : * '' there are...
9061838,its all right there .
2147763,lol
6452995,we 're all very good boys waiting patiently fo...
9546641,the jewish faith is very different that biblic...


In [34]:
smpl.count()

text    1816439
dtype: int64

In [35]:
theD = smpl

In [36]:
theD.head()

Unnamed: 0,text
3918856,"& gt ; the company that is across the street ,..."
1465983,seriously if i 'm wrong tell me .
2137069,anyway classic projection from the lefty loon ...
1952404,they ’ re still owned and controlled by big me...
1841977,have n't checked in a couple weeks .


In [37]:
# Remove problematic comments
theD = theD[theD.text.notnull()]
theD = theD[theD.text != '']
theD = theD[theD.text != "no value"]
theD = theD[theD.text !='[ removed ]']
theD = theD[theD.text != '[ deleted ]']
theD = theD[~theD.text.str.contains(" \ ^ This \ ^ message \ ^ was \ ^ created \ ^ by \ ^ a \ ^ bot")]
theD = theD[~theD.text.str.contains(" \ * \ * Please review the rules for")]
theD = theD[theD.notnull()]

theD.count()

text    1738575
dtype: int64

In [38]:
theD.count()

text    1738575
dtype: int64

In [39]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
theD_comments, theD_x_tokens = make_data(theD, commentfield='text', canonize=True, stem=False, tokenizer=RegexpTokenizer(r'\w+'))

In [45]:
theD_comments.head()

3918856    & gt ; the company that is across the street ,...
1465983                    seriously if i 'm wrong tell me .
2137069    anyway classic projection from the lefty loon ...
1952404    they ’ re still owned and controlled by big me...
1841977                 have n't checked in a couple weeks .
Name: text, dtype: object

In [64]:
len(theD_x_tokens)

1738575

In [65]:
theD_x_tokens[0]

['gt',
 'the',
 'company',
 'that',
 'is',
 'across',
 'the',
 'street',
 'well',
 'they',
 'ai',
 'n',
 't',
 'censoring',
 'shit',
 'and',
 'how',
 'will',
 'the',
 'democrat',
 'troll',
 'who',
 'never',
 'leaves',
 'his',
 'room',
 'ever',
 'find',
 'out',
 'they',
 'are',
 'censoring']

In [48]:
theD_tokens = [sent for sent in theD_x_tokens if len(sent) <= 20]

In [49]:
len(theD_tokens)

1491356

In [50]:
theD_tokens[33]

['am', 'i', 'miss', 'remembering', 'this']

In [51]:
theD_raw_list = list(map(xtoken_to_raw, theD_tokens))

In [52]:
theD_raw_list[33]

'am i miss remembering this'

In [53]:
theD = pd.DataFrame({'text':theD_raw_list})
#trump = pd.DataFrame(trump_raw_list)

In [54]:
theD.head()

Unnamed: 0,text
0,seriously if i m wrong tell me
1,anyway classic projection from the lefty loon ...
2,they re still owned and controlled by big media
3,have n t checked in a couple weeks
4,we do n t know about the black kids because he...


In [55]:
# Remove problematic comments
theD = theD[theD.text.notnull()]
theD = theD[theD.text != '']
theD = theD[theD.text != "no value"]
theD = theD[theD.text !='[ removed ]']
theD = theD[theD.text != '[ deleted ]']
theD = theD[~theD.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
theD = theD[~theD.text.str.contains("\*\*Please review the rules for")]
theD = theD[theD.notnull()]

theD.count()

text    1460892
dtype: int64

In [56]:
theD.count()

text    1460892
dtype: int64

In [57]:
train, validate, test = np.split(theD.sample(frac=1), [int(.6*len(theD)), int(.8*len(theD))])

In [58]:
train.head()

Unnamed: 0,text
663248,lol i love it
663346,no
955287,the mainstream media is losing this battle bigly
841005,not surprising his approval rating is this hig...
262918,i m satisfied with that


In [66]:
train.count()

text    876535
dtype: int64

In [60]:
test.head()

Unnamed: 0,text
1360032,lying down it suffered
215090,but if the kids what to change their gender th...
650660,alums get to emailing
1092112,foreign would be a much bigger scandal
321826,all of us at t_d could have confirmed that


In [61]:
test.count()

text    292179
dtype: int64

In [62]:
validate.head()

Unnamed: 0,text
1354771,all stand for anthem
168721,being nearby may have been better worded
124241,it s going to be buried and forgotten unfortun...
684552,flashback to last year https youtu be 4vioqzfo6bo
832969,you do n t need instructions lol


In [63]:
validate.count()

text    292178
dtype: int64

In [67]:
train.to_csv('../../data/reddit_thedonald.train.0', sep='\t', index=False, header=False)

In [68]:
test.to_csv('../../data/reddit_thedonald.test.0', sep='\t', index=False, header=False)

In [69]:
validate.to_csv('../../data/reddit_thedonald.dev.0', sep='\t', index=False, header=False)

In [9]:
theD.head()

Unnamed: 0,text
0,my eye
3,why did they drop bomb ?
4,is there even more incriminating evidence in t...
5,"if we had a gun problem in this country , ther..."
6,i love him .


In [None]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments, 
# list of list of tokens, and 
# panda series of target labels
theD_comments, theD_x_tokens = make_data(theD, commentfield='text', canonize=True, stem=False)

### Clean Trump Twitter Comments

In [4]:
trump = pd.read_csv('../../data/twitter_trump_sentences.csv')
trump.shape

(60262, 1)

In [5]:
trump.head()

Unnamed: 0,text
0,today it was my great honor to welcome preside...
1,.
2,@ asahutchinson the great governor of arkansas...
3,he has done an incredible job with a focus on ...
4,asa loves our military and our veterans .


In [6]:
# Remove problematic comments
trump = trump[trump.text.notnull()]
trump = trump[trump.text != '']
trump = trump[trump.text != "no value"]
trump = trump[trump.text !='[ removed ]']
trump = trump[trump.text != '[ deleted ]']
trump = trump[~trump.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
trump = trump[~trump.text.str.contains("\*\*Please review the rules for")]
trump = trump[trump.notnull()]

trump.count()

text    60262
dtype: int64

In [7]:
trump.count()

text    60262
dtype: int64

In [8]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
trump_comments, trump_x_tokens = make_data(trump, commentfield='text', canonize=True, stem=False, tokenizer=RegexpTokenizer(r'\w+'))

In [9]:
trump_comments[0]

'today it was my great honor to welcome president moon jae-in of the republic of korea to the @ whitehouse ! 🇺🇸🇰🇷 https : //t.co/yvoxnia1dm'

In [251]:
len(trump_x_tokens)

60262

In [252]:
trump_x_tokens[0]

['today',
 'it',
 'was',
 'my',
 'great',
 'honor',
 'to',
 'welcome',
 'president',
 'moon',
 'jae',
 'in',
 'of',
 'the',
 'republic',
 'of',
 'korea',
 'to',
 'the',
 'whitehouse',
 'https',
 't',
 'co',
 'yvoxnia1dm']

In [373]:
trump_tokens = [sent for sent in trump_x_tokens if len(sent) <= 20]

In [374]:
len(trump_tokens)

53264

In [375]:
trump_tokens[33]

['great',
 'to',
 'have',
 'our',
 'incredible',
 'first',
 'lady',
 'back',
 'home',
 'in',
 'the',
 'white',
 'house']

In [376]:
trump_raw_list = list(map(xtoken_to_raw, trump_tokens))

In [377]:
trump_raw_list[33]

'great to have our incredible first lady back home in the white house'

In [378]:
trump = pd.DataFrame({'text':trump_raw_list})
#trump = pd.DataFrame(trump_raw_list)

In [379]:
trump.head()

Unnamed: 0,text
0,
1,asahutchinson the great governor of arkansas i...
2,he has done an incredible job with a focus on ...
3,asa loves our military and our veterans
4,i fully endorse asa for governor


In [380]:
# Remove problematic comments
trump = trump[trump.text.notnull()]
trump = trump[trump.text != '']
trump = trump[trump.text != "no value"]
trump = trump[trump.text !='[ removed ]']
trump = trump[trump.text != '[ deleted ]']
trump = trump[~trump.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
trump = trump[~trump.text.str.contains("\*\*Please review the rules for")]
trump = trump[trump.notnull()]

trump.count()

text    51870
dtype: int64

In [381]:
trump.count()

text    51870
dtype: int64

In [382]:
train, validate, test = np.split(trump.sample(frac=1), [int(.6*len(trump)), int(.8*len(trump))])

In [383]:
train.head()

Unnamed: 0,text
7230,bigleaguetruth
42988,make the boston killer talk before our doctors...
43756,celebapprentice
24140,he will never again hold court
5904,design or negotiations yet


In [384]:
train.count()

text    31122
dtype: int64

In [385]:
test.head()

Unnamed: 0,text
31793,capitalism is where it s at
45340,thanks
18053,go jeb
6142,thank you to our amazing wounded warriors for ...
26811,azigmann realdonaldtrump donald trump for pres...


In [386]:
test.count()

text    10374
dtype: int64

In [387]:
train.to_csv('../../data/twitter_trump.train.0', sep='\t', index=False, header=False)

In [388]:
test.to_csv('../../data/twitter_trump.test.0', sep='\t', index=False, header=False)

In [389]:
validate.to_csv('../../data/twitter_trump.dev.0', sep='\t', index=False, header=False)

### Clean Teen Reddit Comments

In [70]:
teen = pd.read_csv('../../data/reddit_teenagers_filtered_sentences.csv')
teen.shape

(1816680, 1)

In [71]:
teen.head()

Unnamed: 0,text
0,mood
1,ayyyyyyy big mouth
2,arrested development
3,i have a feeling you watched jacksfilms new video
4,what the fuck is this comment section


In [72]:
# Remove problematic comments
teen = teen[teen.text.notnull()]
teen = teen[teen.text != '']
teen = teen[teen.text != "no value"]
teen = teen[teen.text !='[ removed ]']
teen = teen[teen.text != '[ deleted ]']
teen = teen[~teen.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
teen = teen[~teen.text.str.contains("\*\*Please review the rules for")]
teen = teen[teen.notnull()]

teen.count()

text    1816592
dtype: int64

In [73]:
teen.count()

text    1816592
dtype: int64

In [74]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
teen_comments, teen_x_tokens = make_data(teen, commentfield='text', canonize=True, stem=False, tokenizer=RegexpTokenizer(r'\w+'))

In [75]:
teen_comments[3000]

'50 % 58 min to full'

In [76]:
len(teen_x_tokens)

1816592

In [77]:
teen_x_tokens[3000]

['DGDG', 'DGDG', 'min', 'to', 'full']

In [78]:
teen_tokens = [sent for sent in teen_x_tokens if len(sent) <= 20]

In [79]:
len(teen_tokens)

1680022

In [80]:
teen_raw_list = list(map(xtoken_to_raw, teen_tokens))

In [81]:
teen_raw_list[3000]

'i cant belive you posted in your own meme thread'

In [82]:
teen = pd.DataFrame({'text':teen_raw_list})
#teen = pd.DataFrame(teen_raw_list)

In [83]:
# Remove problematic comments
teen = teen[teen.text.notnull()]
teen = teen[teen.text != '']
teen = teen[teen.text != "no value"]
teen = teen[teen.text !='[ removed ]']
teen = teen[teen.text != '[ deleted ]']
teen = teen[~teen.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
teen = teen[~teen.text.str.contains("\*\*Please review the rules for")]
teen = teen[teen.notnull()]

teen.count()

text    1644726
dtype: int64

In [84]:
teen.head()

Unnamed: 0,text
0,mood
1,ayy big mouth
2,arrested development
3,i have a feeling you watched jacksfilms new video
4,what the fuck is this comment section


In [85]:
teen.count()

text    1644726
dtype: int64

In [86]:
train, validate, test = np.split(teen.sample(frac=1), [int(.6*len(teen)), int(.8*len(teen))])

In [409]:
#[int(.6*len(teen)), int(.8*len(teen))]

[986835, 1315780]

In [459]:
#train, discard = np.split(train.sample(frac=1), [int(.03155*len(train))])

In [87]:
train.head()

Unnamed: 0,text
868442,also do you go to school
1640360,sigh this will go on forever uses uno reverse ...
49700,beautiful
239967,im so ugly
1206766,no because selfies are a picture of ones self ...


In [88]:
train.count()

text    986835
dtype: int64

In [462]:
#test, discard = np.split(test.sample(frac=1), [int(.0315*len(test))])

In [89]:
test.head()

Unnamed: 0,text
391528,i love going to hang out with my mom s side of...
1267443,i mean they were warned they aren t allowed it...
104720,actually i think more people use it to find pe...
672819,why
130458,like DG hours


In [90]:
test.count()

text    328946
dtype: int64

In [465]:
#validate, discard = np.split(validate.sample(frac=1), [int(.0315*len(validate))])

In [91]:
validate.head()

Unnamed: 0,text
1227255,also i was in second grade lol
382911,meep
774594,my hands are sweaty also is the opening monolo...
136486,like a payment that comes out monthly
651542,sometimes people get recognised and then all t...


In [92]:
validate.count()

text    328945
dtype: int64

In [93]:
train.to_csv('../../data/reddit_thedonald.train.1', sep='\t', index=False, header=False)

In [94]:
test.to_csv('../../data/reddit_thedonald.test.1', sep='\t', index=False, header=False)

In [95]:
validate.to_csv('../../data/reddit_thedonald.dev.1', sep='\t', index=False, header=False)

### Reload the "target" data and down-sample

In [2]:
reddit_thedonald_dev0 = pd.read_csv('../../data/reddit_thedonald.dev.0')
reddit_thedonald_dev0.shape

(292177, 1)

In [3]:
dev, discard = np.split(reddit_thedonald_dev0.sample(frac=1), [int(.2*len(reddit_thedonald_dev0))])

In [4]:
dev.count()

all stand for anthem    58435
dtype: int64

In [5]:
dev.head()

Unnamed: 0,all stand for anthem
46909,gt i thought they were just childish people wh...
134354,throw in some basic foundational mba entrepene...
67472,i read that as took his ball gag and went home...
185224,it s time we call them out on their bullshit b...
220457,spez apparently i am too old to keep up with t...


In [6]:
dev.to_csv('../../data/reddit_thedonald.dev.0', sep='\t', index=False, header=False)

In [7]:
reddit_thedonald_test0 = pd.read_csv('../../data/reddit_thedonald.test.0')
reddit_thedonald_test0.shape

(292178, 1)

In [8]:
test, discard = np.split(reddit_thedonald_test0.sample(frac=1), [int(.2*len(reddit_thedonald_test0))])

In [9]:
test.count()

lying down it suffered    58435
dtype: int64

In [10]:
test.head()

Unnamed: 0,lying down it suffered
166925,or DG DG
16590,obama eats glue out of the democrats ass okay ...
221756,the stress added on to her already declining h...
34662,its self preservation after all
32630,these limousine liberals are so out of touch


In [12]:
test.to_csv('../../data/reddit_thedonald.test.0', sep='\t', index=False, header=False)

In [13]:
reddit_thedonald_train0 = pd.read_csv('../../data/reddit_thedonald.train.0')
reddit_thedonald_train0.shape

(876534, 1)

In [14]:
train, discard = np.split(reddit_thedonald_train0.sample(frac=1), [int(.2*len(reddit_thedonald_train0))])

In [15]:
train.count()

lol i love it    175306
dtype: int64

In [16]:
train.head()

Unnamed: 0,lol i love it
364633,legalize drugs factory jobs
627925,i hope they protest everyday by destroying pri...
31227,i just barfed
570592,we l drive them back from the gates of cadia
283178,so many double standards


In [17]:
train.to_csv('../../data/reddit_thedonald.train.0', sep='\t', index=False, header=False)

### Reload the "opposition" data and down-sample

In [45]:
reddit_thedonald_dev1 = pd.read_csv('../../data/reddit_thedonald.dev.1')
reddit_thedonald_dev1.shape

(328944, 1)

In [46]:
dev, discard = np.split(reddit_thedonald_dev1.sample(frac=1), [int(.27*len(reddit_thedonald_dev1))])

In [47]:
dev.count()

also i was in second grade lol    88814
dtype: int64

In [48]:
dev.head()

Unnamed: 0,also i was in second grade lol
229790,why was this user banned why aren t my posts s...
289543,bruce it is my street name
169692,sex is better
269847,it l probably be garbage but i would like to d...
163918,please please think about what you are doing


In [49]:
dev.to_csv('../../data/reddit_thedonald.dev.1', sep='\t', index=False, header=False)

In [40]:
reddit_thedonald_test1 = pd.read_csv('../../data/reddit_thedonald.test.1')
reddit_thedonald_test1.shape

(328945, 1)

In [41]:
test, discard = np.split(reddit_thedonald_test1.sample(frac=1), [int(.27*len(reddit_thedonald_test1))])

In [42]:
test.count()

i love going to hang out with my mom s side of the family    88815
dtype: int64

In [43]:
test.head()

Unnamed: 0,i love going to hang out with my mom s side of the family
151696,i want to hug my crush
10164,it s ok now but it wasn t working
221441,oof
75835,all of it
294144,depends on whos wearing them


In [44]:
test.to_csv('../../data/reddit_thedonald.test.1', sep='\t', index=False, header=False)

In [19]:
reddit_thedonald_train1 = pd.read_csv('../../data/reddit_thedonald.train.1')
reddit_thedonald_train1.shape

(986834, 1)

In [32]:
train, discard = np.split(reddit_thedonald_train1.sample(frac=1), [int(.27*len(reddit_thedonald_train1))])

In [33]:
train.count()

also do you go to school    266445
dtype: int64

In [34]:
train.head()

Unnamed: 0,also do you go to school
174122,cuz we love you
564008,damn you re DGDG minutes away from me lmao
14090,i have a shitty one piece bacon costume so pro...
26041,why the fuck
16998,yeah if i fell asleep on the floor


In [35]:
train.to_csv('../../data/reddit_thedonald.train.1', sep='\t', index=False, header=False)