In [1]:
# The future
#from __future__ import print_function, division, absolute_import

# Data wrangling libraries
import pandas as pd
import numpy as np
import re
from io import StringIO

# Numpy shorthand stuff
from numpy import array

# NLTK shorthand stuff
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import RegexpTokenizer

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split


%matplotlib inline

In [2]:
# Borrowed some functions from the w266 utils.py file
# Miscellaneous helpers
def flatten(list_of_lists):
    """Flatten a list-of-lists into a single list."""
    return list(itertools.chain.from_iterable(list_of_lists))


# Word processing functions
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word


def canonicalize_word(word, wordset=None, digits=True):
    #word = re.sub(r":","",word)
    #word = re.sub(r"https?","",word)
    #word = re.sub(r"\/","",word)
    #word = re.sub(r"@","",word)
    #word = re.sub(r"/\U0001.?'","",word)
    #replace hyperlinks with one instance of "postedhyperlinkvalue"
    word = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?\S*", "postedhyperlinkvalue", word)
    word = re.sub(r"(postedhyperlinkvalue)+", "postedhyperlinkvalue", word)
    #only lower case words (2 letters or longer) that are not all upper case
    if not word.isupper() or len(word) == 1:
        word = word.lower()
    #replace things like haha with ha
    word = re.sub(r"([a-z]{2,})\1{2,}", r"\1", word)
    #replace any three consecutive, identical letters with two instances of that letter
    word = re.sub(r"([a-z])\1{2,}", r"\1\1", word)
    #replace any two consecutive, identical consonants at the beginning of a string with one of that consonant
    word = re.sub(r"(^[^aeiou])\1{1,}", r"\1", word)
    
    #replace digits with a stand-in token
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return constants.UNK_TOKEN

    
def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]


# Made some helper functions of our own
from nltk.stem import PorterStemmer   
def stem_sentence(token_sent, stemmer=PorterStemmer()):
    stem_token_sent = []
    for word in token_sent:
        stem_token_sent.append(stemmer.stem(word))
    return stem_token_sent   


def make_data(data, target='', commentfield='', tokenizer=TweetTokenizer(), canonize=True, stem=True):      
    # Separate comments
    comments = data.loc[:, commentfield]
    #comments = data.loc[:, 'comment_body']
    #labels = data.loc[:, target]
    
    # Convert to list
    comment_list = comments.values.tolist()
    
    # Tokenize comments
    tokenizer = tokenizer
    # A list of lists of tokenized sentences: word == string/token; sentence == list of string/tokens
    tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in comment_list]
    #tokenized_sentences_x = [tokenizer.tokenize(sentence) for sentence in comment_list]
    #tokenized_sentences = []
    #sentence = []
    #last_tok = ''
    #for comment in tokenized_sentences_x:
    #    for tok in comment:
    #        if last_tok in ('http', 'https',':','http:','https:','@'):
    #            tok = last_tok + tok
    #        if tok in ('http', 'https',':', '@'):
    #            last_tok = tok
    #        else:
    #            last_tok = ''
    #            sentence.append(tok)
    #    tokenized_sentences.append(sentence)
    
    if stem:
        # Stem words
        comments_stem = []
        for sentence in tokenized_sentences:
            x_tokens_stem = stem_sentence(token_sent=sentence, stemmer=PorterStemmer())
            comments_stem.append(x_tokens_stem)
        tokenized_sentences = comments_stem
    
    if canonize:
        # Canonize words
        comments_canon = []
        for sentence in tokenized_sentences:
            x_tokens_canon = canonicalize_words(sentence)
            comments_canon.append(x_tokens_canon)
        # A list of lists of scrubbed tokens; token == word, list == sentence
        tokenized_sentences = comments_canon
    
    x_tokens = tokenized_sentences  
    #return comments, x_tokens, labels
    return comments, x_tokens


def rawlist_to_xtokens(rawlist=['default arg'], vocab_list=[]):
    xtokens = []
    for rawstring in rawlist:
        xtoken = list(filter(lambda x: x in vocab_list, rawstring.split()))
        xtokens.append(xtoken)   
    return xtokens


def xtoken_to_raw(xtoken=['default','arg']):  
    raw_text_string = ' '.join(xtoken)
    return raw_text_string


def raw_to_xtoken(raw_string='default arg'):
    xtoken = raw_string.split()
    return xtoken


def model_diagnostics(model, data, labels, target_names, random=False, test_size=0.10, random_state=42):
    
    # Split into test and train
    # Designate random test_size% of data (rounded up to next obs) as test data
    if random:
        train_data, test_data, train_labels, test_labels = train_test_split(data, labels, 
                                                                            test_size=test_size, 
                                                                            random_state=random_state)
    # Designate last test_size% of data (rounded up to next obs) as test data 
    else:
        idx = round(test_size*len(data))
        test_data = data[-idx:]
        test_labels = labels[-idx:]
    
    
    pred_labels = model.predict_classes(test_data)
    
    print("Test data length is: ", len(test_data))
    print("Test label length is: ", len(test_labels))
    print("Pred label length is: ", len(pred_labels))
    
    confusionMatrix = metrics.confusion_matrix(test_labels, pred_labels)
    classificationReport = classification_report(test_labels, pred_labels, target_names=target_names)
    
    return confusionMatrix, classificationReport

In [3]:
!pwd

/home/chadharness/w210/Parlancr/models


### Clean Trump Reddit Comments

In [10]:
theD = pd.read_csv('../../data/reddit_theDonald_sentences.csv')
theD.shape

(11821939, 1)

In [33]:
smpl, discard = np.split(theD.sample(frac=1), [int(.15365*len(theD))])

In [23]:
smpl.head()

Unnamed: 0,text
9302680,i just found this one at time : * '' there are...
9061838,its all right there .
2147763,lol
6452995,we 're all very good boys waiting patiently fo...
9546641,the jewish faith is very different that biblic...


In [34]:
smpl.count()

text    1816439
dtype: int64

In [35]:
theD = smpl

In [36]:
theD.head()

Unnamed: 0,text
3918856,"& gt ; the company that is across the street ,..."
1465983,seriously if i 'm wrong tell me .
2137069,anyway classic projection from the lefty loon ...
1952404,they ’ re still owned and controlled by big me...
1841977,have n't checked in a couple weeks .


In [37]:
# Remove problematic comments
theD = theD[theD.text.notnull()]
theD = theD[theD.text != '']
theD = theD[theD.text != "no value"]
theD = theD[theD.text !='[ removed ]']
theD = theD[theD.text != '[ deleted ]']
theD = theD[~theD.text.str.contains(" \ ^ This \ ^ message \ ^ was \ ^ created \ ^ by \ ^ a \ ^ bot")]
theD = theD[~theD.text.str.contains(" \ * \ * Please review the rules for")]
theD = theD[theD.notnull()]

theD.count()

text    1738575
dtype: int64

In [38]:
theD.count()

text    1738575
dtype: int64

In [39]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
theD_comments, theD_x_tokens = make_data(theD, commentfield='text', canonize=True, stem=False, tokenizer=RegexpTokenizer(r'\w+'))

In [45]:
theD_comments.head()

3918856    & gt ; the company that is across the street ,...
1465983                    seriously if i 'm wrong tell me .
2137069    anyway classic projection from the lefty loon ...
1952404    they ’ re still owned and controlled by big me...
1841977                 have n't checked in a couple weeks .
Name: text, dtype: object

In [64]:
len(theD_x_tokens)

1738575

In [65]:
theD_x_tokens[0]

['gt',
 'the',
 'company',
 'that',
 'is',
 'across',
 'the',
 'street',
 'well',
 'they',
 'ai',
 'n',
 't',
 'censoring',
 'shit',
 'and',
 'how',
 'will',
 'the',
 'democrat',
 'troll',
 'who',
 'never',
 'leaves',
 'his',
 'room',
 'ever',
 'find',
 'out',
 'they',
 'are',
 'censoring']

In [48]:
theD_tokens = [sent for sent in theD_x_tokens if len(sent) <= 20]

In [49]:
len(theD_tokens)

1491356

In [50]:
theD_tokens[33]

['am', 'i', 'miss', 'remembering', 'this']

In [51]:
theD_raw_list = list(map(xtoken_to_raw, theD_tokens))

In [52]:
theD_raw_list[33]

'am i miss remembering this'

In [53]:
theD = pd.DataFrame({'text':theD_raw_list})
#trump = pd.DataFrame(trump_raw_list)

In [54]:
theD.head()

Unnamed: 0,text
0,seriously if i m wrong tell me
1,anyway classic projection from the lefty loon ...
2,they re still owned and controlled by big media
3,have n t checked in a couple weeks
4,we do n t know about the black kids because he...


In [55]:
# Remove problematic comments
theD = theD[theD.text.notnull()]
theD = theD[theD.text != '']
theD = theD[theD.text != "no value"]
theD = theD[theD.text !='[ removed ]']
theD = theD[theD.text != '[ deleted ]']
theD = theD[~theD.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
theD = theD[~theD.text.str.contains("\*\*Please review the rules for")]
theD = theD[theD.notnull()]

theD.count()

text    1460892
dtype: int64

In [56]:
theD.count()

text    1460892
dtype: int64

In [57]:
train, validate, test = np.split(theD.sample(frac=1), [int(.6*len(theD)), int(.8*len(theD))])

In [58]:
train.head()

Unnamed: 0,text
663248,lol i love it
663346,no
955287,the mainstream media is losing this battle bigly
841005,not surprising his approval rating is this hig...
262918,i m satisfied with that


In [66]:
train.count()

text    876535
dtype: int64

In [60]:
test.head()

Unnamed: 0,text
1360032,lying down it suffered
215090,but if the kids what to change their gender th...
650660,alums get to emailing
1092112,foreign would be a much bigger scandal
321826,all of us at t_d could have confirmed that


In [61]:
test.count()

text    292179
dtype: int64

In [62]:
validate.head()

Unnamed: 0,text
1354771,all stand for anthem
168721,being nearby may have been better worded
124241,it s going to be buried and forgotten unfortun...
684552,flashback to last year https youtu be 4vioqzfo6bo
832969,you do n t need instructions lol


In [63]:
validate.count()

text    292178
dtype: int64

In [67]:
train.to_csv('../../data/reddit_thedonald.train.0', sep='\t', index=False, header=False)

In [68]:
test.to_csv('../../data/reddit_thedonald.test.0', sep='\t', index=False, header=False)

In [69]:
validate.to_csv('../../data/reddit_thedonald.dev.0', sep='\t', index=False, header=False)

In [9]:
theD.head()

Unnamed: 0,text
0,my eye
3,why did they drop bomb ?
4,is there even more incriminating evidence in t...
5,"if we had a gun problem in this country , ther..."
6,i love him .


In [None]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments, 
# list of list of tokens, and 
# panda series of target labels
theD_comments, theD_x_tokens = make_data(theD, commentfield='text', canonize=True, stem=False)

### Clean Trump Twitter Comments

In [4]:
trump = pd.read_csv('../../data/twitter_trump_sentences.csv')
trump.shape

(60262, 1)

In [5]:
trump.head()

Unnamed: 0,text
0,today it was my great honor to welcome preside...
1,.
2,@ asahutchinson the great governor of arkansas...
3,he has done an incredible job with a focus on ...
4,asa loves our military and our veterans .


In [6]:
# Remove problematic comments
trump = trump[trump.text.notnull()]
trump = trump[trump.text != '']
trump = trump[trump.text != "no value"]
trump = trump[trump.text !='[ removed ]']
trump = trump[trump.text != '[ deleted ]']
trump = trump[~trump.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
trump = trump[~trump.text.str.contains("\*\*Please review the rules for")]
trump = trump[trump.notnull()]

trump.count()

text    60262
dtype: int64

In [7]:
trump.count()

text    60262
dtype: int64

In [8]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
trump_comments, trump_x_tokens = make_data(trump, commentfield='text', canonize=True, stem=False, tokenizer=RegexpTokenizer(r'\w+'))

In [9]:
trump_comments[0]

'today it was my great honor to welcome president moon jae-in of the republic of korea to the @ whitehouse ! 🇺🇸🇰🇷 https : //t.co/yvoxnia1dm'

In [251]:
len(trump_x_tokens)

60262

In [252]:
trump_x_tokens[0]

['today',
 'it',
 'was',
 'my',
 'great',
 'honor',
 'to',
 'welcome',
 'president',
 'moon',
 'jae',
 'in',
 'of',
 'the',
 'republic',
 'of',
 'korea',
 'to',
 'the',
 'whitehouse',
 'https',
 't',
 'co',
 'yvoxnia1dm']

In [373]:
trump_tokens = [sent for sent in trump_x_tokens if len(sent) <= 20]

In [374]:
len(trump_tokens)

53264

In [375]:
trump_tokens[33]

['great',
 'to',
 'have',
 'our',
 'incredible',
 'first',
 'lady',
 'back',
 'home',
 'in',
 'the',
 'white',
 'house']

In [376]:
trump_raw_list = list(map(xtoken_to_raw, trump_tokens))

In [377]:
trump_raw_list[33]

'great to have our incredible first lady back home in the white house'

In [378]:
trump = pd.DataFrame({'text':trump_raw_list})
#trump = pd.DataFrame(trump_raw_list)

In [379]:
trump.head()

Unnamed: 0,text
0,
1,asahutchinson the great governor of arkansas i...
2,he has done an incredible job with a focus on ...
3,asa loves our military and our veterans
4,i fully endorse asa for governor


In [380]:
# Remove problematic comments
trump = trump[trump.text.notnull()]
trump = trump[trump.text != '']
trump = trump[trump.text != "no value"]
trump = trump[trump.text !='[ removed ]']
trump = trump[trump.text != '[ deleted ]']
trump = trump[~trump.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
trump = trump[~trump.text.str.contains("\*\*Please review the rules for")]
trump = trump[trump.notnull()]

trump.count()

text    51870
dtype: int64

In [381]:
trump.count()

text    51870
dtype: int64

In [382]:
train, validate, test = np.split(trump.sample(frac=1), [int(.6*len(trump)), int(.8*len(trump))])

In [383]:
train.head()

Unnamed: 0,text
7230,bigleaguetruth
42988,make the boston killer talk before our doctors...
43756,celebapprentice
24140,he will never again hold court
5904,design or negotiations yet


In [384]:
train.count()

text    31122
dtype: int64

In [385]:
test.head()

Unnamed: 0,text
31793,capitalism is where it s at
45340,thanks
18053,go jeb
6142,thank you to our amazing wounded warriors for ...
26811,azigmann realdonaldtrump donald trump for pres...


In [386]:
test.count()

text    10374
dtype: int64

In [387]:
train.to_csv('../../data/twitter_trump.train.0', sep='\t', index=False, header=False)

In [388]:
test.to_csv('../../data/twitter_trump.test.0', sep='\t', index=False, header=False)

In [389]:
validate.to_csv('../../data/twitter_trump.dev.0', sep='\t', index=False, header=False)

### Clean Teen Reddit Comments

In [70]:
teen = pd.read_csv('../../data/reddit_teenagers_filtered_sentences.csv')
teen.shape

(1816680, 1)

In [71]:
teen.head()

Unnamed: 0,text
0,mood
1,ayyyyyyy big mouth
2,arrested development
3,i have a feeling you watched jacksfilms new video
4,what the fuck is this comment section


In [72]:
# Remove problematic comments
teen = teen[teen.text.notnull()]
teen = teen[teen.text != '']
teen = teen[teen.text != "no value"]
teen = teen[teen.text !='[ removed ]']
teen = teen[teen.text != '[ deleted ]']
teen = teen[~teen.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
teen = teen[~teen.text.str.contains("\*\*Please review the rules for")]
teen = teen[teen.notnull()]

teen.count()

text    1816592
dtype: int64

In [73]:
teen.count()

text    1816592
dtype: int64

In [74]:
# Perform all desired pre-processing and split data into 
# panda series of raw comments and list of list of tokens.
teen_comments, teen_x_tokens = make_data(teen, commentfield='text', canonize=True, stem=False, tokenizer=RegexpTokenizer(r'\w+'))

In [75]:
teen_comments[3000]

'50 % 58 min to full'

In [76]:
len(teen_x_tokens)

1816592

In [77]:
teen_x_tokens[3000]

['DGDG', 'DGDG', 'min', 'to', 'full']

In [78]:
teen_tokens = [sent for sent in teen_x_tokens if len(sent) <= 20]

In [79]:
len(teen_tokens)

1680022

In [80]:
teen_raw_list = list(map(xtoken_to_raw, teen_tokens))

In [81]:
teen_raw_list[3000]

'i cant belive you posted in your own meme thread'

In [82]:
teen = pd.DataFrame({'text':teen_raw_list})
#teen = pd.DataFrame(teen_raw_list)

In [83]:
# Remove problematic comments
teen = teen[teen.text.notnull()]
teen = teen[teen.text != '']
teen = teen[teen.text != "no value"]
teen = teen[teen.text !='[ removed ]']
teen = teen[teen.text != '[ deleted ]']
teen = teen[~teen.text.str.contains("\^This \^message \^was \^created \^by \^a \^bot")]
teen = teen[~teen.text.str.contains("\*\*Please review the rules for")]
teen = teen[teen.notnull()]

teen.count()

text    1644726
dtype: int64

In [84]:
teen.head()

Unnamed: 0,text
0,mood
1,ayy big mouth
2,arrested development
3,i have a feeling you watched jacksfilms new video
4,what the fuck is this comment section


In [85]:
teen.count()

text    1644726
dtype: int64

In [86]:
train, validate, test = np.split(teen.sample(frac=1), [int(.6*len(teen)), int(.8*len(teen))])

In [409]:
#[int(.6*len(teen)), int(.8*len(teen))]

[986835, 1315780]

In [459]:
#train, discard = np.split(train.sample(frac=1), [int(.03155*len(train))])

In [87]:
train.head()

Unnamed: 0,text
868442,also do you go to school
1640360,sigh this will go on forever uses uno reverse ...
49700,beautiful
239967,im so ugly
1206766,no because selfies are a picture of ones self ...


In [88]:
train.count()

text    986835
dtype: int64

In [462]:
#test, discard = np.split(test.sample(frac=1), [int(.0315*len(test))])

In [89]:
test.head()

Unnamed: 0,text
391528,i love going to hang out with my mom s side of...
1267443,i mean they were warned they aren t allowed it...
104720,actually i think more people use it to find pe...
672819,why
130458,like DG hours


In [90]:
test.count()

text    328946
dtype: int64

In [465]:
#validate, discard = np.split(validate.sample(frac=1), [int(.0315*len(validate))])

In [91]:
validate.head()

Unnamed: 0,text
1227255,also i was in second grade lol
382911,meep
774594,my hands are sweaty also is the opening monolo...
136486,like a payment that comes out monthly
651542,sometimes people get recognised and then all t...


In [92]:
validate.count()

text    328945
dtype: int64

In [93]:
train.to_csv('../../data/reddit_thedonald.train.1', sep='\t', index=False, header=False)

In [94]:
test.to_csv('../../data/reddit_thedonald.test.1', sep='\t', index=False, header=False)

In [95]:
validate.to_csv('../../data/reddit_thedonald.dev.1', sep='\t', index=False, header=False)

### Reload the "target" data and down-sample

In [2]:
reddit_thedonald_dev0 = pd.read_csv('../../data/reddit_thedonald.dev.0')
reddit_thedonald_dev0.shape

(292177, 1)

In [3]:
dev, discard = np.split(reddit_thedonald_dev0.sample(frac=1), [int(.2*len(reddit_thedonald_dev0))])

In [4]:
dev.count()

all stand for anthem    58435
dtype: int64

In [5]:
dev.head()

Unnamed: 0,all stand for anthem
46909,gt i thought they were just childish people wh...
134354,throw in some basic foundational mba entrepene...
67472,i read that as took his ball gag and went home...
185224,it s time we call them out on their bullshit b...
220457,spez apparently i am too old to keep up with t...


In [6]:
dev.to_csv('../../data/reddit_thedonald.dev.0', sep='\t', index=False, header=False)

In [7]:
reddit_thedonald_test0 = pd.read_csv('../../data/reddit_thedonald.test.0')
reddit_thedonald_test0.shape

(292178, 1)

In [8]:
test, discard = np.split(reddit_thedonald_test0.sample(frac=1), [int(.2*len(reddit_thedonald_test0))])

In [9]:
test.count()

lying down it suffered    58435
dtype: int64

In [10]:
test.head()

Unnamed: 0,lying down it suffered
166925,or DG DG
16590,obama eats glue out of the democrats ass okay ...
221756,the stress added on to her already declining h...
34662,its self preservation after all
32630,these limousine liberals are so out of touch


In [12]:
test.to_csv('../../data/reddit_thedonald.test.0', sep='\t', index=False, header=False)

In [13]:
reddit_thedonald_train0 = pd.read_csv('../../data/reddit_thedonald.train.0')
reddit_thedonald_train0.shape

(876534, 1)

In [14]:
train, discard = np.split(reddit_thedonald_train0.sample(frac=1), [int(.2*len(reddit_thedonald_train0))])

In [15]:
train.count()

lol i love it    175306
dtype: int64

In [16]:
train.head()

Unnamed: 0,lol i love it
364633,legalize drugs factory jobs
627925,i hope they protest everyday by destroying pri...
31227,i just barfed
570592,we l drive them back from the gates of cadia
283178,so many double standards


In [17]:
train.to_csv('../../data/reddit_thedonald.train.0', sep='\t', index=False, header=False)

### Reload the "opposition" data and down-sample

In [45]:
reddit_thedonald_dev1 = pd.read_csv('../../data/reddit_thedonald.dev.1')
reddit_thedonald_dev1.shape

(328944, 1)

In [46]:
dev, discard = np.split(reddit_thedonald_dev1.sample(frac=1), [int(.27*len(reddit_thedonald_dev1))])

In [47]:
dev.count()

also i was in second grade lol    88814
dtype: int64

In [48]:
dev.head()

Unnamed: 0,also i was in second grade lol
229790,why was this user banned why aren t my posts s...
289543,bruce it is my street name
169692,sex is better
269847,it l probably be garbage but i would like to d...
163918,please please think about what you are doing


In [49]:
dev.to_csv('../../data/reddit_thedonald.dev.1', sep='\t', index=False, header=False)

In [40]:
reddit_thedonald_test1 = pd.read_csv('../../data/reddit_thedonald.test.1')
reddit_thedonald_test1.shape

(328945, 1)

In [41]:
test, discard = np.split(reddit_thedonald_test1.sample(frac=1), [int(.27*len(reddit_thedonald_test1))])

In [42]:
test.count()

i love going to hang out with my mom s side of the family    88815
dtype: int64

In [43]:
test.head()

Unnamed: 0,i love going to hang out with my mom s side of the family
151696,i want to hug my crush
10164,it s ok now but it wasn t working
221441,oof
75835,all of it
294144,depends on whos wearing them


In [44]:
test.to_csv('../../data/reddit_thedonald.test.1', sep='\t', index=False, header=False)

In [19]:
reddit_thedonald_train1 = pd.read_csv('../../data/reddit_thedonald.train.1')
reddit_thedonald_train1.shape

(986834, 1)

In [32]:
train, discard = np.split(reddit_thedonald_train1.sample(frac=1), [int(.27*len(reddit_thedonald_train1))])

In [33]:
train.count()

also do you go to school    266445
dtype: int64

In [34]:
train.head()

Unnamed: 0,also do you go to school
174122,cuz we love you
564008,damn you re DGDG minutes away from me lmao
14090,i have a shitty one piece bacon costume so pro...
26041,why the fuck
16998,yeah if i fell asleep on the floor


In [35]:
train.to_csv('../../data/reddit_thedonald.train.1', sep='\t', index=False, header=False)