## Importing the necessary libraries

#### Basic Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from string import punctuation
from collections import Counter

#### Scikit Learn Libraries

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TODO: consider trying CountVectorizer as an alternative to TfidfVectorizer.

## Stating the path of the data files 

#### Path of the training / testing data

In [3]:
train_data_path = "../data/data/train/train_cleaned.csv"
test_data_path = "../data/data/test/test_cleaned.csv"

#### Path of the lexicons

In [4]:
emo_lexicon_path = "../data/external dataset/NRC-AffectIntensity-Lexicon.txt"
block_terms_path = "../data/external dataset/Terms-To-Block.txt"
swear_words_path = "../data/external dataset/swearWords.txt"
google_bad_words_path = "../data/external dataset/google-bad-words.txt"
github_bad_words_path = "../data/external dataset/bad-words-github.txt"
bad_words_path = "../data/external dataset/bad-words.txt"

## Helper Functions

In [None]:
def check_tags(comment, tag):
    """Measure how often a placeholder tag occurs in a comment.

    The comment is lower-cased before counting, so ``tag`` has to be given
    in lower case to match. Occurrences are counted as substrings, while the
    denominator is the number of pieces obtained by splitting on single
    spaces.

    Returns:
        tuple: ``(occurrences / pieces, 1 if the tag occurs else 0)``.
    """
    n_pieces = len(comment.split(" "))
    n_hits = comment.lower().count(tag)
    ratio = float(n_hits) / float(n_pieces)
    present = 1 if n_hits > 0 else 0
    return ratio, present

In [None]:
def check_emotions(comment, emotion):
    """Measure how many space-separated tokens appear in an emotion lexicon.

    Args:
        comment: text to inspect; split on single spaces, case is kept as-is.
        emotion: mapping from term to score. A token counts as a hit when
            ``emotion.get(token)`` is not ``None``.

    Returns:
        tuple: ``(hits / number of tokens, 1 if there is any hit else 0)``.
    """
    tokens = comment.split(" ")
    hits = sum(1 for token in tokens if emotion.get(token) is not None)
    return float(hits) / float(len(tokens)), int(hits > 0)

In [None]:
def check_swear_words(comment, word_lst):
    """Measure how many space-separated tokens belong to a word collection.

    Matching is exact (case-sensitive) membership of each token in
    ``word_lst``.

    Returns:
        tuple: ``(matching tokens / number of tokens,
        1 if any token matches else 0)``.
    """
    tokens = comment.split(" ")
    n_matches = sum(1 for token in tokens if token in word_lst)
    ratio = float(n_matches) / float(len(tokens))
    return ratio, int(n_matches > 0)

In [None]:
def count_punc(comment):
    """Return how many characters of ``comment`` are in ``string.punctuation``."""
    return sum(1 for char in comment if char in punctuation)

#### Defining the Emotion Lexicon class

In [None]:
class EmoLex:
    """Emotion lexicon loaded from a tab-separated file.

    The file must provide the columns ``term``, ``score`` and
    ``AffectDimension``. For each of the four dimensions ``anger``, ``fear``,
    ``joy`` and ``sadness`` the instance exposes:

    * ``self.<dimension>``      -- the matching rows of the lexicon frame
    * ``self.<dimension>_dict`` -- a ``{term: score}`` mapping of those rows

    ``self.emoPath`` keeps the path and ``self.emoLex`` the full frame.
    """

    def __init__(self, emoPath):
        """Read the lexicon at ``emoPath`` and build the per-emotion lookups."""
        self.emoPath = emoPath
        self.emoLex = pd.read_csv(self.emoPath, sep="\t")

        self.anger, self.anger_dict = self._select_emotion("anger")
        self.fear, self.fear_dict = self._select_emotion("fear")
        self.joy, self.joy_dict = self._select_emotion("joy")
        self.sadness, self.sadness_dict = self._select_emotion("sadness")

    def _select_emotion(self, emotion):
        """Return ``(rows, {term: score})`` for one ``AffectDimension`` value."""
        rows = self.emoLex.loc[self.emoLex.loc[:, "AffectDimension"] == emotion, :]
        # Pair the two columns in one pass instead of indexing row by row;
        # as before, a repeated term keeps its last score.
        lookup = dict(zip(rows.loc[:, "term"], rows.loc[:, "score"]))
        return rows, lookup
        

#### Defining the vulgarities class

In [None]:
class Vulgarities:
    """Container for five offensive-word lexicons.

    Each ``*_path`` argument is stored unchanged, and the file it points to
    is read (no header row, single ``words`` column) into the matching
    ``*_list`` attribute. Despite the ``_list`` suffix these attributes are
    sets.
    """

    def __init__(self, block_terms_path, swear_words_path,
                 google_bad_words_path, github_bad_words_path, bad_words_path):
        """Remember the five paths and load one word set from each file."""
        self.block_terms_path = block_terms_path
        self.swear_words_path = swear_words_path
        self.google_bad_words_path = google_bad_words_path
        self.github_bad_words_path = github_bad_words_path
        self.bad_words_path = bad_words_path

        self.block_terms_list = self._read_word_set(self.block_terms_path)
        self.swear_words_list = self._read_word_set(self.swear_words_path)
        self.google_bad_words_list = self._read_word_set(self.google_bad_words_path)
        self.github_bad_words_list = self._read_word_set(self.github_bad_words_path)
        self.bad_words_list = self._read_word_set(self.bad_words_path)

    @staticmethod
    def _read_word_set(path):
        """Read a headerless word file and return its ``words`` column as a set."""
        frame = pd.read_csv(path, header=None, names=["words"])
        return set(frame.loc[:, "words"])
        
        

#### Defining the data class

In [None]:
class Data:
    """A comment dataset loaded from CSV plus the features derived from it.

    ``self.df`` holds the loaded frame and receives every hand-crafted
    feature column; ``self.bow`` (created by ``addBOWFeatures``) holds the
    ``id`` column next to the vectorised text. The feature methods expect
    the columns ``id`` and ``comment_text``.
    """

    def __init__(self, path, type_):
        """Load the CSV at ``path``.

        Args:
            path: location of the CSV file.
            type_: caller-supplied label for the dataset (e.g. "train");
                stored as ``self.type_`` and not otherwise used here.
        """
        self.path = path
        self.type_ = type_
        self.df = pd.read_csv(path)
        # pandas reads an empty CSV field back as NaN (a float), which breaks
        # every string-based feature below; treat a missing comment as "".
        if "comment_text" in self.df.columns:
            self.df["comment_text"] = self.df["comment_text"].fillna("")

    def addBOWFeatures(self, vectorizer):
        """Vectorise ``comment_text`` with an already fitted ``vectorizer``.

        Stores the dense result, prefixed by the ``id`` column, in ``self.bow``.
        """
        sparse_matrix = vectorizer.transform(self.df.loc[:, "comment_text"])
        # Reuse the frame's index so the concat below pairs rows one-to-one.
        bow_df = pd.DataFrame(sparse_matrix.toarray(), index=self.df.index)
        self.bow = pd.concat([self.df.loc[:, "id"], bow_df], axis=1)

    def addTagFeatures(self):
        """Add 0/1 columns flagging the presence of each placeholder tag."""
        comments = self.df.loc[:, "comment_text"]
        tag_columns = (
            ("pres_links", "<link>"),
            ("pres_image", "<image>"),
            ("pres_user", "<user>"),
            ("pres_date", "(utc)"),
        )
        for column, tag in tag_columns:
            self.df.loc[:, column] = comments.apply(lambda text: check_tags(text, tag)[1])

    def addEmotionFeatures(self, emotion):
        """Add 0/1 columns flagging the presence of words from each emotion dict.

        ``emotion`` must expose ``anger_dict``, ``fear_dict``, ``joy_dict``
        and ``sadness_dict``.
        """
        comments = self.df.loc[:, "comment_text"]
        emotion_columns = (
            ("pres_anger", emotion.anger_dict),
            ("pres_fear", emotion.fear_dict),
            ("pres_joy", emotion.joy_dict),
            ("pres_sadness", emotion.sadness_dict),
        )
        for column, lookup in emotion_columns:
            self.df.loc[:, column] = comments.apply(lambda text: check_emotions(text, lookup)[1])

    def addVulgaritiesFeatures(self, terms):
        """Add 0/1 columns flagging the presence of words from each word set.

        ``terms`` must expose the five ``*_list`` attributes used below.
        """
        comments = self.df.loc[:, "comment_text"]
        vulgarity_columns = (
            ("pres_block_terms", terms.block_terms_list),
            ("pres_swear_words", terms.swear_words_list),
            ("pres_google_bad_words", terms.google_bad_words_list),
            ("pres_github_bad_words", terms.github_bad_words_list),
            ("pres_bad_words", terms.bad_words_list),
        )
        for column, words in vulgarity_columns:
            self.df.loc[:, column] = comments.apply(lambda text: check_swear_words(text, words)[1])

    def addPunctuationCount(self):
        """Add ``punc_count`` and ``punc_count_normed`` (count / text length)."""
        comments = self.df.loc[:, "comment_text"]
        self.df.loc[:, "punc_count"] = comments.apply(count_punc)
        lengths = comments.str.len()
        # A zero-length comment has no punctuation: report 0.0 instead of
        # dividing by zero.
        ratio = self.df.loc[:, "punc_count"] / lengths.where(lengths > 0)
        self.df.loc[:, "punc_count_normed"] = ratio.fillna(0.0)

    def writeFeatures(self, path_manual_features, path_bow_features):
        """Write ``self.bow`` and ``self.df`` to CSV without the index column.

        ``addBOWFeatures`` must have been called first so that ``self.bow``
        exists.
        """
        self.bow.to_csv(path_bow_features, index=False)
        self.df.to_csv(path_manual_features, index=False)
    

## Defining the main function

In [None]:
def _add_all_features(data, vectorizer, vulgarities_lexicon, emo_lexicon,
                      path_manual_features, path_bow_features):
    """Run every feature step on one dataset and write both feature files."""
    data.addBOWFeatures(vectorizer)
    data.addTagFeatures()
    data.addVulgaritiesFeatures(vulgarities_lexicon)
    data.addEmotionFeatures(emo_lexicon)
    data.addPunctuationCount()
    data.writeFeatures(path_manual_features, path_bow_features)


def main(train_data, test_data, vectorizer):
    """Build and persist the feature files for the training and testing data.

    Args:
        train_data: ``Data`` instance holding the training comments.
        test_data: ``Data`` instance holding the testing comments.
        vectorizer: already fitted text vectorizer used for the
            bag-of-words features.

    Returns:
        tuple: ``(train_data, test_data)`` with their features added.

    The lexicon locations come from the module-level ``*_path`` variables.
    """
    vulgarities_lexicon = Vulgarities(block_terms_path, swear_words_path, google_bad_words_path, github_bad_words_path, bad_words_path)
    emo_lexicon = EmoLex(emo_lexicon_path)

    # Both datasets go through the same helper so their feature columns
    # cannot drift apart. The earlier train-only call to
    # `performSentimentAnalysis()` was dropped: `Data` defines no such
    # method, so it raised AttributeError before any training feature
    # was written.
    print("adding features for training data")
    _add_all_features(train_data, vectorizer, vulgarities_lexicon, emo_lexicon,
                      "../data/data/train/train_data_manual.csv", "../data/data/train/train_data_bow.csv")
    print("done adding features for training data")

    print("adding features for testing data")
    _add_all_features(test_data, vectorizer, vulgarities_lexicon, emo_lexicon,
                      "../data/data/test/test_data_manual.csv", "../data/data/test/test_data_bow.csv")
    print("done adding features for testing data")

    return train_data, test_data
    

In [None]:
if __name__ == "__main__":

    # Load both datasets from the configured CSV paths.
    train_data = Data(train_data_path, "train")
    test_data = Data(test_data_path, "test")

    # The vocabulary is fitted on the training and testing comments together.
    comments = train_data.df.loc[:, "comment_text"].tolist() + test_data.df.loc[:, "comment_text"].tolist()

    # The n-gram range is a tuning knob worth experimenting with.
    vectorizer = TfidfVectorizer(
        min_df=0.05,
        max_df=0.75,
        lowercase=True,
        ngram_range=(1, 2),
    )
    vectorizer.fit(comments)

    train_data, test_data = main(train_data, test_data, vectorizer)
    

In [4]:
from afinn import Afinn

In [10]:
# Create the AFINN scorer (constructed with no arguments).
afinn = Afinn()
# NOTE: this rebinds `train_data` to a plain DataFrame read straight from the
# CSV; the cell that runs `main` bound the same name to a `Data` object.
train_data = pd.read_csv(train_data_path)

In [11]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true....",1,0,0,0,0,0
1,27450690,""" Please do not vandalize pages, as you did wi...",0,0,0,0,0,0
2,54037174,""" """"Points of interest"""" I removed the """"point...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [13]:
# Score every comment with AFINN and keep the result in a new column.
train_data.loc[:, "sentiment_val"] = train_data.loc[:, "comment_text"].apply(afinn.score)

In [29]:
# Preview the first 11 rows, which now include the `sentiment_val` column.
train_data.head(11)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sentiment_val
0,22256635,"Nonsense? kiss off, geek. what I said is true....",1,0,0,0,0,0,2.0
1,27450690,""" Please do not vandalize pages, as you did wi...",0,0,0,0,0,0,0.0
2,54037174,""" """"Points of interest"""" I removed the """"point...",0,0,0,0,0,0,4.0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,3.0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,0.0
5,82428052,Fried chickens Is dat sum fried chickens?,0,0,0,0,0,0,0.0
6,87311443,Why can you put English for example on some pl...,0,0,0,0,0,0,2.0
7,114749757,Guy Fawkes im a resident in bridgwater and i g...,0,0,0,0,0,0,4.0
8,138560519,as far as nicknames go this article is embarra...,0,0,0,0,0,0,-3.0
9,139353149,Woodland Meadows Good to hear that you correct...,0,0,0,0,0,0,3.0


In [27]:
# Show the value stored at row position 10, column position 1.
train_data.iloc[10,1]

'" Well I just finished a good bit of editing. I haven\'t had a chance to go through and do all the tagging and referencing yet since work calls but I am extremely confident in the scientific factual basis of everything stated in here. The referencing can pretty much be all made to the book ""Particle Imaging Velocimetry A practical guide"" by Raffel, Willert, Wereley and Kompenhans if someone can add it. Also, not sure how to exactly do that as it all can be found in numerous scientific journals, databases, etc. but is so conveniently placed all in that one book. The only thing I am unsure about as I have no experience with is Molecular Tagging Velocimetry. To put it simply, I would like to keep technique defects in the summaries but I don\'t know what this one\'s is. To put is bluntly, the description as it is now is too good to be true. I know it has defects and problems, I don\'t know what they are. The fact is the technique was developed in 97ish and is not in widespread use. If t

In [21]:
import nltk

In [22]:
from nltk.corpus import sentiwordnet as swn
#nltk.download('averaged_perceptron_tagger')

In [23]:
# Hand-written sample text used to try out sentence-level sentiment scoring.
data = "You are a great great great person! What is wrong with you!! What is wrong with you!!!!! Very good review! Nice and friendly place with excellent food and friendly and helpful staff. You need a car though. The children wants to go back! Playground and animals entertained them and they felt like at home. I also recommend the dinner! Great value for the price!"
# Split the sample into sentences; later cells score each sentence separately.
sentences = nltk.sent_tokenize(data)

In [24]:
# Break every sentence into word-level tokens (one token list per sentence).
stokens = list(map(nltk.word_tokenize, sentences))

In [25]:
# Display the per-sentence token lists.
stokens

[['You', 'are', 'a', 'great', 'great', 'great', 'person', '!'],
 ['What', 'is', 'wrong', 'with', 'you', '!', '!'],
 ['What', 'is', 'wrong', 'with', 'you', '!', '!', '!', '!', '!'],
 ['Very', 'good', 'review', '!'],
 ['Nice',
  'and',
  'friendly',
  'place',
  'with',
  'excellent',
  'food',
  'and',
  'friendly',
  'and',
  'helpful',
  'staff',
  '.'],
 ['You', 'need', 'a', 'car', 'though', '.'],
 ['The', 'children', 'wants', 'to', 'go', 'back', '!'],
 ['Playground',
  'and',
  'animals',
  'entertained',
  'them',
  'and',
  'they',
  'felt',
  'like',
  'at',
  'home',
  '.'],
 ['I', 'also', 'recommend', 'the', 'dinner', '!'],
 ['Great', 'value', 'for', 'the', 'price', '!']]

In [26]:
# Part-of-speech tag each tokenised sentence; one list of (token, tag) pairs per sentence.
taggedlist = [nltk.pos_tag(sentence_tokens) for sentence_tokens in stokens]
print(taggedlist)

[[('You', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('great', 'JJ'), ('great', 'JJ'), ('great', 'JJ'), ('person', 'NN'), ('!', '.')], [('What', 'WP'), ('is', 'VBZ'), ('wrong', 'JJ'), ('with', 'IN'), ('you', 'PRP'), ('!', '.'), ('!', '.')], [('What', 'WP'), ('is', 'VBZ'), ('wrong', 'JJ'), ('with', 'IN'), ('you', 'PRP'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.'), ('!', '.')], [('Very', 'RB'), ('good', 'JJ'), ('review', 'NN'), ('!', '.')], [('Nice', 'NNP'), ('and', 'CC'), ('friendly', 'JJ'), ('place', 'NN'), ('with', 'IN'), ('excellent', 'JJ'), ('food', 'NN'), ('and', 'CC'), ('friendly', 'JJ'), ('and', 'CC'), ('helpful', 'JJ'), ('staff', 'NN'), ('.', '.')], [('You', 'PRP'), ('need', 'VBP'), ('a', 'DT'), ('car', 'NN'), ('though', 'NN'), ('.', '.')], [('The', 'DT'), ('children', 'NNS'), ('wants', 'VBZ'), ('to', 'TO'), ('go', 'VB'), ('back', 'RB'), ('!', '.')], [('Playground', 'NN'), ('and', 'CC'), ('animals', 'NNS'), ('entertained', 'VBD'), ('them', 'PRP'), ('and', 'CC'), ('they', 'PRP'), (

In [27]:
wnl = nltk.WordNetLemmatizer()

# Part-of-speech tag prefixes mapped to the POS letters passed to
# SentiWordNet; the first matching prefix wins.
tag_prefix_to_pos = (("NN", "n"), ("JJ", "a"), ("V", "v"), ("R", "r"))

score_list = []
for tagged_sentence in taggedlist:
    sentence_scores = []
    for token, tag in tagged_sentence:
        lemma = wnl.lemmatize(token)
        pos = next((letter for prefix, letter in tag_prefix_to_pos if tag.startswith(prefix)), '')
        if pos == '':
            # Tokens outside the four mapped word classes are not scored.
            continue
        synsets = list(swn.senti_synsets(lemma, pos))
        if len(synsets) == 0:
            continue
        # Word score = mean of (positive - negative) over all of its senses.
        polarity = sum(syn.pos_score() - syn.neg_score() for syn in synsets)
        sentence_scores.append(polarity / len(synsets))
    score_list.append(sentence_scores)

print(score_list)

[[0.009615384615384616, 0.2916666666666667, 0.2916666666666667, 0.2916666666666667, 0.0], [0.009615384615384616, -0.5972222222222222], [0.009615384615384616, -0.5972222222222222], [0.125, 0.6130952380952381, 0.025], [0.0, 0.1875, 0.0, 1.0, -0.041666666666666664, 0.1875, 0.25, 0.0], [-0.08333333333333333, 0.0], [0.03125, -0.075, 0.0125, 0.0], [0.0, -0.375, 0.08333333333333333, 0.015625, 0.027777777777777776], [0.0, 0.25, 0.0], [0.0, 0.10416666666666667, 0.08928571428571429]]


In [28]:
# Average the word scores of each sentence. A sentence in which no word
# could be scored has an empty score list; report it as neutral (0.0)
# instead of dividing by zero, so there is still exactly one value per
# sentence for the cells that index this list by sentence position.
sentence_sentiment = []

for score_sent in score_list:
    if score_sent:
        sentence_sentiment.append(sum(score_sent) / len(score_sent))
    else:
        sentence_sentiment.append(0.0)
print("Sentiment for each sentence for:" + data + "\n")
print(sentence_sentiment)

Sentiment for each sentence for:You are a great great great person! What is wrong with you!! What is wrong with you!!!!! Very good review! Nice and friendly place with excellent food and friendly and helpful staff. You need a car though. The children wants to go back! Playground and animals entertained them and they felt like at home. I also recommend the dinner! Great value for the price!

[0.17692307692307696, -0.2938034188034188, -0.2938034188034188, 0.2543650793650794, 0.19791666666666666, -0.041666666666666664, -0.007812499999999999, -0.04965277777777778, 0.08333333333333333, 0.06448412698412699]


In [29]:
# Print each sentence, padded to 80 columns, next to its sentiment value.
for index, sentence in enumerate(sentences):
    print(sentence.ljust(80) + str(sentence_sentiment[index]))

You are a great great great person!                                             0.17692307692307696
What is wrong with you!!                                                        -0.2938034188034188
What is wrong with you!!!!!                                                     -0.2938034188034188
Very good review!                                                               0.2543650793650794
Nice and friendly place with excellent food and friendly and helpful staff.     0.19791666666666666
You need a car though.                                                          -0.041666666666666664
The children wants to go back!                                                  -0.007812499999999999
Playground and animals entertained them and they felt like at home.             -0.04965277777777778
I also recommend the dinner!                                                    0.08333333333333333
Great value for the price!                                                      0.06448412698412

### Binary Sentiment ###

In [30]:
import nltk
from nltk.corpus import sentiwordnet as swn

In [142]:
# Sample text mixing positive words, insults and obfuscated spellings; the
# following cells score it word by word.
data = "Congratulations. You are a great! You suck! Asshole! Fucker! Fuck. Bitches. Gay. Sissy. Motherfuckers. PENIS. Fker. a$$hole"

In [143]:
# Tokenise the whole sample at once (no sentence splitting in this section).
tokens = nltk.word_tokenize(data)

In [144]:
# Tag the full token list once; it is kept as the single element of a list.
taggedlist = [nltk.pos_tag(tokens)]
print(taggedlist)

[[('Congratulations', 'NNS'), ('.', '.'), ('You', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('great', 'JJ'), ('!', '.'), ('You', 'PRP'), ('suck', 'VBP'), ('!', '.'), ('Asshole', 'NN'), ('!', '.'), ('Fucker', 'NN'), ('!', '.'), ('Fuck', 'NNP'), ('.', '.'), ('Bitches', 'NNP'), ('.', '.'), ('Gay', 'NNP'), ('.', '.'), ('Sissy', 'NNP'), ('.', '.'), ('Motherfuckers', 'NNP'), ('.', '.'), ('PENIS', 'NNP'), ('.', '.'), ('Fker', 'NNP'), ('.', '.'), ('a', 'DT'), ('$', '$'), ('$', '$'), ('hole', 'NN')]]


In [145]:
wnl = nltk.WordNetLemmatizer()

# Part-of-speech tag prefixes mapped to the POS letters passed to
# SentiWordNet; the first matching prefix wins.
tag_prefix_to_pos = (("NN", "n"), ("JJ", "a"), ("V", "v"), ("R", "r"))

score_list = []
word_list = []
for token, tag in taggedlist[0]:
    lemma = wnl.lemmatize(token)
    pos = next((letter for prefix, letter in tag_prefix_to_pos if tag.startswith(prefix)), '')
    if pos == '':
        # Tokens outside the four mapped word classes are not scored.
        continue
    synsets = list(swn.senti_synsets(lemma, pos))
    if len(synsets) == 0:
        # Words without any SentiWordNet entry are left out of both lists.
        continue
    print(lemma, pos)
    # Word score = mean of (positive - negative) over all of its senses.
    polarity = sum(syn.pos_score() - syn.neg_score() for syn in synsets)
    score_list.append(polarity / len(synsets))
    word_list.append(lemma)
word_list

Congratulations n
are v
great a
suck v
Asshole n
Fucker n
Fuck n
Bitches n
Gay n
Sissy n
Motherfuckers n
PENIS n
hole n


['Congratulations',
 'are',
 'great',
 'suck',
 'Asshole',
 'Fucker',
 'Fuck',
 'Bitches',
 'Gay',
 'Sissy',
 'Motherfuckers',
 'PENIS',
 'hole']

In [146]:
# Print each scored word, padded to 20 columns, next to its score.
for index, word in enumerate(word_list):
    print(word.ljust(20) + str(score_list[index]))

Congratulations     0.5416666666666666
are                 0.009615384615384616
great               0.2916666666666667
suck                -0.07142857142857142
Asshole             -0.375
Fucker              -0.0625
Fuck                0.0
Bitches             -0.1875
Gay                 0.125
Sissy               0.0
Motherfuckers       -0.625
PENIS               0.0
hole                -0.078125


In [147]:
# Overall sentiment = mean of the per-word scores. When no word could be
# scored the list is empty; report neutral (0.0) instead of dividing by zero.
sentence_sentiment = sum(score_list) / len(score_list) if score_list else 0.0

print(sentence_sentiment)

-0.03320037334460412


In [148]:
# Show the sample text (padded to 80 columns) and, on the next line, its overall sentiment.
print(data.ljust(80), sentence_sentiment, sep="\n")

Congratulations. You are a great! You suck! Asshole! Fucker! Fuck. Bitches. Gay. Sissy. Motherfuckers. PENIS. Fker. a$$hole
-0.03320037334460412
