## Importing the necessary libraries

#### Basic Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from string import punctuation
from collections import Counter

#### Scikit Learn Libraries

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TODO: consider trying CountVectorizer as an alternative to TfidfVectorizer.

## Paths to the data files

#### Path of the training / testing data

In [3]:
train_data_path = "../../data/data/train/train_cleaned.csv"
test_data_path = "../../data/data/test/test_cleaned.csv"

#### Path of the lexicons

In [4]:
emo_lexicon_path = "../../data/external dataset/NRC-AffectIntensity-Lexicon.txt"
block_terms_path = "../../data/external dataset/Terms-To-Block.txt"
swear_words_path = "../../data/external dataset/swearWords.txt"
google_bad_words_path = "../../data/external dataset/google-bad-words.txt"
github_bad_words_path = "../../data/external dataset/bad-words-github.txt"
bad_words_path = "../../data/external dataset/bad-words.txt"

## Helper Functions

In [5]:
def check_tags(comment, tag):
    """Measure how often ``tag`` occurs in ``comment``.

    The comment is lower-cased before counting, so ``tag`` must be supplied in
    lower case to match anything. Occurrences are counted as non-overlapping
    substrings and normalised by the number of pieces produced by splitting
    the original comment on single spaces.

    Returns:
        A ``(ratio, present)`` tuple: occurrences per space-separated piece as
        a float, and ``1`` if the tag occurs at least once, otherwise ``0``.
    """
    n_tokens = len(comment.split(" "))
    n_hits = comment.lower().count(tag)
    present = 1 if n_hits > 0 else 0
    # float() keeps this a true division regardless of interpreter version.
    return float(n_hits) / n_tokens, present

In [6]:
def check_emotions(comment, emotion):
    """Count the tokens of ``comment`` that appear in an emotion lexicon.

    Args:
        comment: Text to inspect; it is split on single spaces and the pieces
            are looked up as-is (no case folding).
        emotion: Mapping from term to score. A term whose stored value is
            ``None`` is treated as absent.

    Returns:
        A ``(ratio, present)`` tuple: matched tokens divided by the total
        number of tokens as a float, and ``1`` if at least one token matched,
        otherwise ``0``.
    """
    tokens = comment.split(" ")
    matched = 0
    for token in tokens:
        if emotion.get(token) is not None:
            matched += 1
    return float(matched) / len(tokens), int(matched > 0)

In [7]:
def check_swear_words(comment, word_lst):
    """Count the tokens of ``comment`` that are members of ``word_lst``.

    Args:
        comment: Text to inspect; it is split on single spaces and each piece
            is tested for exact membership (no case folding, so multi-word
            entries in ``word_lst`` can never match).
        word_lst: Container of terms supporting the ``in`` operator.

    Returns:
        A ``(ratio, present)`` tuple: matching tokens divided by the total
        number of tokens as a float, and ``1`` if at least one token matched,
        otherwise ``0``.
    """
    tokens = comment.split(" ")
    hits = sum(1 for token in tokens if token in word_lst)
    return float(hits) / len(tokens), int(hits > 0)

In [8]:
def count_punc(comment):
    """Return how many characters of ``comment`` are punctuation.

    A character counts when it is contained in ``string.punctuation``; every
    occurrence is counted, so ``"!!"`` contributes 2.
    """
    return sum(1 for char in comment if char in punctuation)

#### Defining the Emotion Lexicon class

In [9]:
class EmoLex:
    """Emotion lexicon loaded from a tab-separated file.

    The file must provide the columns ``term``, ``score`` and
    ``AffectDimension``. Rows are grouped by the four affect dimensions
    ``anger``, ``fear``, ``joy`` and ``sadness``.

    Attributes:
        emoPath: Path the lexicon was read from.
        emoLex: The full lexicon as a DataFrame.
        anger, fear, joy, sadness: Rows of ``emoLex`` for each dimension.
        anger_dict, fear_dict, joy_dict, sadness_dict: ``term -> score``
            lookups for each dimension. If a term appears more than once
            within a dimension, the last row wins.
    """

    def __init__(self, emoPath):
        """Read the lexicon at ``emoPath`` and build the per-emotion lookups."""
        self.emoPath = emoPath
        self.emoLex = pd.read_csv(self.emoPath, sep="\t")

        self.anger = self._rows_for("anger")
        self.fear = self._rows_for("fear")
        self.joy = self._rows_for("joy")
        self.sadness = self._rows_for("sadness")

        self.anger_dict = self._term_scores(self.anger)
        self.fear_dict = self._term_scores(self.fear)
        self.joy_dict = self._term_scores(self.joy)
        self.sadness_dict = self._term_scores(self.sadness)

    def _rows_for(self, dimension):
        """Return the lexicon rows whose ``AffectDimension`` equals ``dimension``."""
        mask = self.emoLex.loc[:, "AffectDimension"] == dimension
        return self.emoLex.loc[mask, :]

    @staticmethod
    def _term_scores(rows):
        """Build a ``term -> score`` dict from ``rows`` in a single pass.

        Zipping the two columns avoids materialising one Series per row, which
        is what positional ``iloc`` access inside a Python loop does.
        """
        return dict(zip(rows["term"], rows["score"]))
        

#### Defining the vulgarities class

In [10]:
class Vulgarities:
    """Collection of offensive-word lexicons, each loaded from its own file.

    Every file is read with ``pd.read_csv`` without a header row into a single
    column named ``words``. The ``*_list`` attributes are, despite their
    names, ``set`` objects holding the distinct values of that column.
    """

    def __init__(self, block_terms_path, swear_words_path,
                 google_bad_words_path, github_bad_words_path, bad_words_path):
        """Remember the five source paths and load each one into a set."""
        self.block_terms_path = block_terms_path
        self.swear_words_path = swear_words_path
        self.google_bad_words_path = google_bad_words_path
        self.github_bad_words_path = github_bad_words_path
        self.bad_words_path = bad_words_path

        self.block_terms_list = self._read_words(self.block_terms_path)
        self.swear_words_list = self._read_words(self.swear_words_path)
        self.google_bad_words_list = self._read_words(self.google_bad_words_path)
        self.github_bad_words_list = self._read_words(self.github_bad_words_path)
        self.bad_words_list = self._read_words(self.bad_words_path)

    @staticmethod
    def _read_words(path):
        """Return the distinct entries of the header-less word file at ``path``."""
        frame = pd.read_csv(path, header=None, names=["words"])
        return set(frame.loc[:, "words"])
        
        

#### Defining the data class

In [11]:
class Data:
    """A comment dataset plus the engineered features derived from it.

    Attributes:
        path: Location of the CSV the frame was loaded from.
        df: The loaded frame. The feature methods read its ``comment_text``
            column and append new columns to it in place.
        bow: Set by ``addBOWFeatures``; the ``id`` column followed by one
            integer-labelled column per vectorizer feature.
    """

    def __init__(self, path, type_):
        """Load the CSV at ``path``.

        ``type_`` is accepted for interface compatibility but is not used.
        """
        self.path = path
        self.df = pd.read_csv(path)

    def _add_presence_flags(self, scorer, columns):
        """Add one 0/1 column per ``(column_name, resource)`` pair.

        ``scorer(comment, resource)`` must return a ``(ratio, present)`` tuple;
        only the ``present`` flag is stored.
        """
        comments = self.df.loc[:, "comment_text"]
        for column, resource in columns:
            self.df[column] = comments.apply(
                lambda text, resource=resource: scorer(text, resource)[1]
            )

    def addBOWFeatures(self, vectorizer):
        """Transform the comments with a fitted ``vectorizer`` and store ``self.bow``.

        The dense feature frame reuses ``self.df``'s index so that the
        column-wise concat with ``id`` lines up row for row even when the
        frame does not carry a default 0..n-1 index.
        """
        sparse_matrix = vectorizer.transform(self.df.loc[:, "comment_text"])
        bow_df = pd.DataFrame(sparse_matrix.toarray(), index=self.df.index)
        self.bow = pd.concat([self.df.loc[:, "id"], bow_df], axis=1)

    def addTagFeatures(self):
        """Flag comments containing the link, image, user or date placeholders."""
        self._add_presence_flags(check_tags, [
            ("pres_links", "<link>"),
            ("pres_image", "<image>"),
            ("pres_user", "<user>"),
            ("pres_date", "(utc)"),
        ])

    def addEmotionFeatures(self, emotion):
        """Flag comments containing a term from each of ``emotion``'s four dicts."""
        self._add_presence_flags(check_emotions, [
            ("pres_anger", emotion.anger_dict),
            ("pres_fear", emotion.fear_dict),
            ("pres_joy", emotion.joy_dict),
            ("pres_sadness", emotion.sadness_dict),
        ])

    def addVulgaritiesFeatures(self, terms):
        """Flag comments containing a token from each of ``terms``' five word sets."""
        self._add_presence_flags(check_swear_words, [
            ("pres_block_terms", terms.block_terms_list),
            ("pres_swear_words", terms.swear_words_list),
            ("pres_google_bad_words", terms.google_bad_words_list),
            ("pres_github_bad_words", terms.github_bad_words_list),
            ("pres_bad_words", terms.bad_words_list),
        ])

    def addPunctuationCount(self):
        """Add the raw punctuation count and the count per character.

        The normalised column is computed column-wise, so a zero-length
        comment yields NaN instead of raising ``ZeroDivisionError``.
        """
        comments = self.df.loc[:, "comment_text"]
        self.df["punc_count"] = comments.apply(count_punc)
        self.df["punc_count_normed"] = self.df["punc_count"] / comments.str.len()

    def writeFeatures(self, path_manual_features, path_bow_features):
        """Write ``self.bow`` and ``self.df`` to CSV without the index column.

        ``addBOWFeatures`` must have been called first, otherwise ``self.bow``
        does not exist.
        """
        self.bow.to_csv(path_bow_features, index=False)
        self.df.to_csv(path_manual_features, index=False)
    

## Defining the main function

In [12]:
def main(train_data, test_data, vectorizer):
    """Build and persist every feature set for the train and test data.

    Loads the vulgarity and emotion lexicons from the module-level paths, then
    runs the same pipeline on both datasets: bag-of-words, tag, vulgarity,
    emotion and punctuation features, followed by writing the two CSV outputs.

    Args:
        train_data: ``Data`` instance holding the training comments.
        test_data: ``Data`` instance holding the testing comments.
        vectorizer: Already-fitted vectorizer passed to ``addBOWFeatures``.

    Returns:
        The same ``(train_data, test_data)`` objects, mutated in place.
    """
    vulgarities_lexicon = Vulgarities(block_terms_path, swear_words_path, google_bad_words_path, github_bad_words_path, bad_words_path)
    emo_lexicon = EmoLex(emo_lexicon_path)

    # NOTE(review): the bag-of-words outputs go to "../data/..." while every
    # other path in this file starts with "../../data/..." - confirm that the
    # different directory is intentional.
    jobs = (
        (
            train_data,
            "adding features for training data",
            "done adding features for training data",
            "../../data/data/train/train_data_manual.csv",
            "../data/data/train/train_data_bow.csv",
        ),
        (
            test_data,
            "adding features for testing data",
            "done adding features for testing data",
            "../../data/data/test/test_data_manual.csv",
            "../data/data/test/test_data_bow.csv",
        ),
    )

    for dataset, start_msg, done_msg, manual_path, bow_path in jobs:
        print(start_msg)
        dataset.addBOWFeatures(vectorizer)
        dataset.addTagFeatures()
        dataset.addVulgaritiesFeatures(vulgarities_lexicon)
        dataset.addEmotionFeatures(emo_lexicon)
        dataset.addPunctuationCount()
        dataset.writeFeatures(manual_path, bow_path)
        print(done_msg)

    return train_data, test_data
    

In [13]:
if __name__ == "__main__":

    # Load the cleaned train / test comment files.
    train_data = Data(train_data_path, "train")
    test_data = Data(test_data_path, "test")

    # Unigram + bigram TF-IDF, keeping terms found in at least 5% and at most
    # 75% of the documents.
    vectorizer = TfidfVectorizer(
        min_df=0.05,
        max_df=0.75,
        lowercase=True,
        ngram_range=(1, 2),  # play around with the ngram
    )

    # NOTE(review): the vectorizer is fit on train and test comments together,
    # so vocabulary and IDF weights are informed by the test set - confirm
    # this is acceptable for the intended evaluation.
    comments = list(train_data.df.loc[:, "comment_text"])
    comments.extend(test_data.df.loc[:, "comment_text"])
    vectorizer.fit(comments)

    train_data, test_data = main(train_data, test_data, vectorizer)
    

adding features for training data
done adding features for training data
adding features for testing data
done adding features for testing data
