In [41]:
import pandas as pd
import numpy as np
import spacy
import re
import string
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer

In [None]:
file = '../Datasets/test-balanced.csv'
df = pd.read_csv(file, delimiter='\t', header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,Actually most of her supporters and sane peopl...,Quinnjester,politics,3,3,0,2016-09,1473569605,Hillary's Surrogotes Told to Blame Media for '...
1,0,They can't survive without an echo chamber whi...,TheGettysburgAddress,The_Donald,13,-1,-1,2016-11,1478788413,Thank God Liberals like to live in concentrate...
2,0,you're pretty cute yourself 1729 total,Sempiternally_free,2007scape,8,-1,-1,2016-11,1478042903,Saw this cutie training his Attack today...
3,0,If you kill me you'll crash the meme market,Catacomb82,AskReddit,2,-1,-1,2016-10,1477412597,If you were locked in a room with 49 other peo...
4,0,I bet he wrote that last message as he was sob...,Dorian-throwaway,niceguys,5,-1,-1,2016-11,1477962278,You're not even that pretty!


In [43]:
df.columns = ['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment']
df = df[df['subreddit'].isin(['news', 'politics', 'worldnews'])]

In [44]:
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,Actually most of her supporters and sane peopl...,Quinnjester,politics,3,3,0,2016-09,1473569605,Hillary's Surrogotes Told to Blame Media for '...
6,0,"""Four Score and Seven Gropes Ago...""",Kanzisbuddy,politics,-1,-1,-1,2016-10,1477159141,Gettysburg Address: The First 100 Days Of A Tr...
9,0,"Yes, because making sure the party in power do...",rydan,politics,-1,-1,0,2016-09,1472954129,"He's already encouraged his supporters to ""obs..."
33,0,"Yes you WILL, democrats cave and compromise ev...",o0flatCircle0o,politics,1,-1,-1,2016-11,1479345713,New Top Judiciary Dem Warns Trump: We Won't Fo...
54,0,You would think that as much as everyone blame...,Old_Army90,politics,3,-1,-1,2016-11,1480531475,This is pure victim-blaming. After the Republi...


In [45]:
len(df)

20681

In [46]:
df.isna().sum()

label             0
comment           0
author            0
subreddit         0
score             0
ups               0
downs             0
date              0
created_utc       0
parent_comment    0
dtype: int64

In [47]:
#Selecting only the comment and label column
comments = df.loc[:,["label", "comment", "parent_comment"]]
comments.head()

Unnamed: 0,label,comment,parent_comment
0,0,Actually most of her supporters and sane peopl...,Hillary's Surrogotes Told to Blame Media for '...
6,0,"""Four Score and Seven Gropes Ago...""",Gettysburg Address: The First 100 Days Of A Tr...
9,0,"Yes, because making sure the party in power do...","He's already encouraged his supporters to ""obs..."
33,0,"Yes you WILL, democrats cave and compromise ev...",New Top Judiciary Dem Warns Trump: We Won't Fo...
54,0,You would think that as much as everyone blame...,This is pure victim-blaming. After the Republi...


In [48]:
#Word count column
comments["word_count"] = comments["comment"].apply(lambda x: len(x.split()))
comments.sample(5)

Unnamed: 0,label,comment,parent_comment,word_count
98842,1,Yeah she must be so corrupt,I guess there's nothing stopping her from putt...,6
128892,1,Surprise.,Conservatives to push forward on manifesto and...,1
150202,1,Yes I'm sure everyone feels much better now ab...,More people die every day due to drunk driving...,24
31030,0,"Hey, you can't prove that ;)",You're fucking a white male,6
81799,1,"Maybe he thought that, since Hillary has evil ...","Maybe this is what he thinks as ""light hearted""?",24


In [49]:
#Capital letter counts
def count_cap(x):
  result = 0
  for c in x:
    if c.isupper():
      result += 1
  return result

comments["capital_count"] = comments["comment"].apply(lambda x: count_cap(x))

comments.sample(5)

Unnamed: 0,label,comment,parent_comment,word_count,capital_count
161824,0,So you could say that Curiosity led you to Mar...,Came here for this. And Curiosity.,11,4
130221,0,source?,15 y.o. boy was jailed for life for writing st...,1,0
50604,0,Until they win a medal and get all those sweet...,One distinction: NFL players are professionals...,12,1
242720,1,"Yes, but that required effort, you don't want ...",Well it becomes a waste of time to pursue a bi...,21,1
170937,0,The government stops farmers from using their ...,Farmers use a lot of utilities that result dir...,13,1


In [50]:
#Punctuation count
comments["punc_count"] = comments["comment"].apply(lambda x: sum(1 for i in x if i in string.punctuation))
comments.sample(5)

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
60847,1,Why is she being punished for RUSSIANS interfe...,yeah . . . i watched when she was asked about ...,8,9,1
144705,1,"If only more people had more guns, this wouldn...",Oregon college shooting is 994th mass gun atta...,10,1,2
173307,1,fuck that columbus day is way more relevant an...,Election Day should be a national holiday so t...,10,0,1
211797,0,"Hmm, I just don't know.","""You know that thing we always tell people we ...",5,2,3
237086,0,"You son of a bitch, you're right even though I...",You don't wait until it's raining to build a r...,19,2,4


In [51]:
#Lower case
comments["comment"] = comments["comment"].apply(lambda x: x.lower())
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: x.lower())
comments.head()

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
0,0,actually most of her supporters and sane peopl...,hillary's surrogotes told to blame media for '...,17,4,5
6,0,"""four score and seven gropes ago...""",gettysburg address: the first 100 days of a tr...,6,5,5
9,0,"yes, because making sure the party in power do...","he's already encouraged his supporters to ""obs...",38,1,6
33,0,"yes you will, democrats cave and compromise ev...",new top judiciary dem warns trump: we won't fo...,13,5,2
54,0,you would think that as much as everyone blame...,this is pure victim-blaming. after the republi...,24,1,2


In [52]:
#List of contractions manually modified for convenience
contractions = {
"i ain't": "i am not",
"you ain't": "you are not",
"he ain't": "he is not",
"she ain't": "she is not",
"they ain't": "they are not",
"it ain't": "it is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": " what is",
"what've": "what have",
"when's": " when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}


In [53]:
#Expand words
def decontract(words):
  final = []
  for word in words:
    if word in contractions:
      final.append(contractions[word])
    else:
      final.append(word)
  return " ".join(final)

comments["comment"] = comments["comment"].apply(lambda x: decontract(x.split()))
comments["comment"] = comments["comment"].apply(lambda x: re.sub(r"\'s", " is", x))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: decontract(x.split()))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: re.sub(r"\'s", " is", x))
comments.head()

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
0,0,actually most of her supporters and sane peopl...,hillary is surrogotes told to blame media for ...,17,4,5
6,0,"""four score and seven gropes ago...""",gettysburg address: the first 100 days of a tr...,6,5,5
9,0,"yes, because making sure the party in power do...","he is already encouraged his supporters to ""ob...",38,1,6
33,0,"yes you will, democrats cave and compromise ev...",new top judiciary dem warns trump: we will not...,13,5,2
54,0,you would think that as much as everyone blame...,this is pure victim-blaming. after the republi...,24,1,2


In [54]:
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [55]:
#Remove stop words
def remove_stopwords(words):
  final = []
  for word in words:
    if word not in stopwords:
      final.append(word)
  return " ".join(final)

comments["comment"] = comments["comment"].apply(lambda x: remove_stopwords(x.split()))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: remove_stopwords(x.split()))
comments.head()

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
0,0,actually supporters sane people saw media doin...,hillary surrogotes told blame media 'deplorabl...,17,4,5
6,0,"""four score seven gropes ago...""",gettysburg address: 100 days trump administrat...,6,5,5
9,0,"yes, making sure party power steal election (s...","encouraged supporters ""observe"" polling places...",38,1,6
33,0,"yes will, democrats cave compromise party no.",new judiciary dem warns trump: forget gop trea...,13,5,2
54,0,"think blames gerrymandering sub, know gerryman...",pure victim-blaming. republicans huge gains st...,24,1,2


In [56]:
#Removing special characters and punctuation

comments["comment"] = comments["comment"].apply(lambda x: re.sub(r'[^\w ]+', "", x))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: re.sub(r'[^\w ]+', "", x))
comments

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
0,0,actually supporters sane people saw media doin...,hillary surrogotes told blame media deplorable...,17,4,5
6,0,four score seven gropes ago,gettysburg address 100 days trump administrati...,6,5,5
9,0,yes making sure party power steal election som...,encouraged supporters observe polling places e...,38,1,6
33,0,yes will democrats cave compromise party no,new judiciary dem warns trump forget gop treat...,13,5,2
54,0,think blames gerrymandering sub know gerrymand...,pure victimblaming republicans huge gains stat...,24,1,2
...,...,...,...,...,...,...
251590,1,voted mccain,democrats fuckin pussys right voted obama,5,3,1
251593,1,muslim,hmm differences president president preceding him,3,1,2
251594,1,want black people republican website,resist welcome new gopcom,10,2,2
251596,1,let murders rapist free gotta people smokin we...,shoplifter gets life sentence texas,31,0,1


In [57]:
#Frequency of words
text = " ".join(comments["comment"])
parent_text = " ".join(comments["parent_comment"])
text = text.split()
parent_text = parent_text.split()
word_freq = pd.Series(text).value_counts()
parent_freq = pd.Series(parent_text).value_counts()

In [58]:
#Get top20 words
top_20_words = word_freq[:20]
top_20_words_parent = parent_freq[:20]
top_20_words

people    1235
yeah      1185
like      1082
right      686
sure       601
yes        600
trump      567
know       551
good       540
think      511
it         462
time       391
going      342
need       340
way        326
oh         326
want       319
money      306
thing      298
better     295
Name: count, dtype: int64

In [59]:
#get rare words
#rare_words = word_freq.tail(17311) #words that are used once
rare_words = word_freq[word_freq == 1]
rare_words_parent = parent_freq[parent_freq == 1]
rare_words

scourge      1
earners      1
proposes     1
motorist     1
oyppt        1
            ..
ringing      1
twothirds    1
quis         1
custodiet    1
imperfect    1
Name: count, Length: 8939, dtype: int64

In [60]:
#Remove common words and rare words
comments["comment"] = comments["comment"].apply(lambda x: " ".join([t for t in x.split() if t not in rare_words]))
comments["comment"] = comments["comment"].apply(lambda x: " ".join([t for t in x.split() if t not in top_20_words]))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: " ".join([t for t in x.split() if t not in rare_words_parent]))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: " ".join([t for t in x.split() if t not in top_20_words_parent]))
comments.head()

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
0,0,actually supporters sane saw media ap tweet,told blame media deplorables coverage,17,4,5
6,0,score seven ago,gettysburg address 100 days administration ves...,6,5,5
9,0,making party power steal election something si...,encouraged supporters observe polling places e...,38,1,6
33,0,will democrats cave compromise party no,new judiciary dem warns forget gop treated gar...,13,5,2
54,0,gerrymandering sub gerrymandering actually app...,pure republicans huge gains state legislatures...,24,1,2


In [61]:
#Lemmatize
nlp = spacy.load("en_core_web_sm")
def make_to_base(x):
  x = str(x)
  x_list = []
  doc = nlp(x)
  for token in doc:
    lemma = token.lemma_
    if lemma == "-PRON-" or lemma == "be":
      lemma = token.text
    x_list.append(lemma)
  return " ".join(x_list)

comments["comment"] = comments["comment"].apply(lambda x: make_to_base(x))
comments["parent_comment"] = comments["parent_comment"].apply(lambda x: make_to_base(x))
comments.head()

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count
0,0,actually supporter sane see medium ap tweet,tell blame medium deplorable coverage,17,4,5
6,0,score seven ago,gettysburg address 100 day administration vess...,6,5,5
9,0,make party power steal election something side...,encourage supporter observe polling place elec...,38,1,6
33,0,will democrats cave compromise party no,new judiciary dem warn forget gop treat garland,13,5,2
54,0,gerrymander sub gerrymandering actually apply ...,pure republicans huge gain state legislature 2...,24,1,2


In [62]:
def get_polarity(x):
  return TextBlob(x, analyzer=PatternAnalyzer()).sentiment.polarity

def get_subjectivity(x):
  return TextBlob(x, analyzer=PatternAnalyzer()).sentiment.subjectivity

comments["comment_polarity"] = comments["comment"].apply(lambda x: get_polarity(x))
comments["comment_subjectivity"] = comments["comment"].apply(lambda x: get_subjectivity(x))
comments["parent_polarity"] = comments["parent_comment"].apply(lambda x: get_polarity(x))
comments["parent_subjectivity"] = comments["parent_comment"].apply(lambda x: get_subjectivity(x))
comments.sample(5)

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count,comment_polarity,comment_subjectivity,parent_polarity,parent_subjectivity
48471,0,republicans,defend hell care jeb bush backer think,2,0,0,0.0,0.0,0.0,0.0
121646,1,look middle east india find social moral,antigay dominant position latin america africa...,15,4,2,0.011111,0.105556,0.083333,0.1375
80134,0,forget dick,yeah honestly care care cheat jfk serial phila...,6,4,2,0.0,0.0,0.6,0.9
215846,1,set prejudice aside little while,major issue hit she,13,1,2,-0.1875,0.5,0.0625,0.5
251282,1,justice serve,new york police officer prison violently push ...,6,1,4,0.0,0.0,-0.331818,0.727273


In [None]:
comments.to_csv('../Datasets/sentiment_test.csv', index=False)

In [None]:
cleaned = pd.read_csv('../Datasets/sentiment_test.csv')
cleaned.head()

Unnamed: 0,label,comment,parent_comment,word_count,capital_count,punc_count,comment_polarity,comment_subjectivity,parent_polarity,parent_subjectivity
0,0,actually supporter sane see medium ap tweet,tell blame medium deplorable coverage,17,4,5,0.0,0.1,-0.6,0.9
1,0,score seven ago,gettysburg address 100 day administration vess...,6,5,5,0.0,0.0,0.0,0.0
2,0,make party power steal election something side...,encourage supporter observe polling place elec...,38,1,6,-0.25,0.25,0.0,0.0
3,0,will democrats cave compromise party no,new judiciary dem warn forget gop treat garland,13,5,2,0.0,0.0,0.136364,0.454545
4,0,gerrymander sub gerrymandering actually apply ...,pure republicans huge gain state legislature 2...,24,1,2,0.0,0.1,0.228571,0.7
