## Importing the necessary libraries 

In [1]:
import pandas as pd
import numpy as np
from textstat.textstat import textstat
import sys
import os
import string
import math
import random
import re
from sklearn.metrics import make_scorer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

In [2]:
# Module-level NLTK helpers. `stemmer` is the one apply_stem() relies on;
# `lemmatizer` is instantiated here but not referenced by the cells shown below.
lemmatizer = WordNetLemmatizer()
#stemmer = PorterStemmer()
stemmer = SnowballStemmer("english")

#### Adding the liblinear path

#### Importing the classifier

#### Importing sklearn libraries

In [3]:
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline, FeatureUnion
# `sklearn.grid_search` was removed in scikit-learn 0.20; `model_selection` provides the
# same class and exists in every release that also ships MLPClassifier (imported below).
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import FunctionTransformer

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier



## Stating the path of the data

#### Given Dataset

In [51]:
# CSV locations, relative to the notebook's working directory.
train_path = "../data/data/train.csv"
test_path = "../data/data/test.csv"

In [52]:
# Load the training set as a plain DataFrame and preview the first rows.
trainData = pd.read_csv(train_path)
trainData.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [53]:
# Load the test set as a plain DataFrame and preview the first rows.
testData = pd.read_csv(test_path)
testData.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


#### Data Cleaning

In [8]:
# Path of the emotion lexicon; FeaturesEngineer reads this file as tab-separated text.
emotion_lexicon_path = "../data/features/NRC-AffectIntensity-Lexicon.txt"

In [25]:
# Patterns that identify image/media references inside a comment.
#
# regex_image1: a whitespace-free token, or an "Image:"/"File:"-prefixed name (which may
# contain spaces), ending in a known media extension. The dot before each extension is
# escaped so that only a literal "." matches; unescaped, strings such as " png" or "agif"
# were also treated as file names.
regex_image1 = re.compile(r"((\S*)|((Image:|File:).*))(\.gif|\.png|\.tiff|\.jpg|\.jpeg|\.svg|\.ogg|\.pdf|\.m4a)", re.IGNORECASE)
# regex_image2: a whole "[WIKI_LINK: Image...]" tag. The quantifier is lazy (like the link
# patterns) so that a tag ends at its own closing bracket instead of the last "]" in the text.
regex_image2 = re.compile(r"(\[wiki_link: image.*?\])", re.IGNORECASE)
# Applied in this order by replace_image_multiple_regex().
# NOTE(review): regex_image1 runs first and rewrites the file name inside a wiki tag, so
# regex_image2 may rarely see an intact tag -- confirm whether the order should be swapped.
regex_list_image = [regex_image1, regex_image2]

In [10]:
# Patterns that identify hyperlinks inside a comment. Raw strings are used so the regex
# escapes (\[, \], \S) reach the `re` module unchanged instead of relying on Python's
# deprecated handling of unknown string escapes; re.IGNORECASE already covers the former
# inline (?i) flag.
regex_links1 = re.compile(r"(\[wiki_link:.*?\])", re.IGNORECASE)       # [WIKI_LINK: ...] tags
regex_links2 = re.compile(r"(\[EXTERNA(L)?_LINK:.*?\])", re.IGNORECASE)  # [EXTERNAL_LINK: ...] / [EXTERNA_LINK: ...] tags
regex_links3 = re.compile(r"((http)|(www))\S*", re.IGNORECASE)          # bare URLs starting with http or www
regex_list_link = [regex_links1, regex_links2, regex_links3]

In [11]:
# Matches a single newline character (the IGNORECASE flag has no effect on this pattern).
regex_new_line = re.compile("\n", re.IGNORECASE)

In [12]:
# One or more consecutive whitespace characters; written as a raw string so that "\s" is
# passed to the regex engine verbatim rather than treated as an (invalid) string escape.
regex_multiple_white_space = re.compile(r"\s+", re.IGNORECASE)

In [13]:
# Patterns that identify user references. Raw strings keep the regex escapes (\., \[, \s,
# \w, \|, \]) intact instead of relying on Python's deprecated unknown-escape handling.
# regex_user1: dotted-quad IPv4-looking token (four groups of 1-3 digits).
regex_user1 = re.compile(r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}", re.IGNORECASE)
# regex_user2: opening of a wiki user link such as "[[User:Name|" or "[User talk:Name]]".
regex_user2 = re.compile(r"\[{1,2}(User:|User Talk:|User talk:|User_talk:)\s?\w*(\||\]{1,2})?", re.IGNORECASE)
regex_list_user = [regex_user1, regex_user2]

In [49]:
# Matches an HTML/wiki-table attribute whose value is wrapped in doubled double quotes,
# e.g.  style=""color:red;""  . Written as a raw string so "\s" reaches the regex engine
# verbatim; the quote characters no longer need backslashes inside the single-quoted literal.
regex_css = re.compile(r'(style|class|cellpadding|cellspacing|width|colspan|rowspan|valign|align|id)\s?=\s?"".*?""', re.IGNORECASE)

### Defining helper functions

In [14]:
def remove_new_line(comment, regex):
    """Replace every match of ``regex`` in ``comment`` with a single space.

    Parameters
    ----------
    comment : str
        Text to process.
    regex : compiled pattern
        Pattern whose matches are replaced (the notebook passes ``regex_new_line``).

    Returns
    -------
    str
        The processed text, or ``""`` when ``comment`` is not a string (for example a
        NaN float); in that case the offending value is printed for inspection.
    """
    try:
        return regex.sub(" ", comment)
    except TypeError:
        # Only the "not a string" failure is tolerated; anything else (including
        # KeyboardInterrupt) now propagates instead of being silently swallowed.
        print(comment)
        return ""

In [15]:
def replace_image_multiple_regex(comment, regexList):
    """Substitute the ``<image>`` placeholder for every match of each pattern.

    The patterns in ``regexList`` are applied one after another, each on the
    output of the previous one; the final text is returned.
    """
    text = comment
    for pattern in regexList:
        text = pattern.sub("<image>", text)
    return text

In [16]:
def replace_link_multiple_regex(comment, regexList):
    """Substitute the ``<link>`` placeholder for every match of each pattern.

    The patterns in ``regexList`` are applied one after another, each on the
    output of the previous one; the final text is returned.
    """
    text = comment
    for pattern in regexList:
        text = pattern.sub("<link>", text)
    return text

In [17]:
def remove_mutiple_white_space(comment, regex):
    """Replace each match of ``regex`` with one space, then trim both ends."""
    collapsed = regex.sub(" ", comment)
    return collapsed.strip()

In [18]:
def replace_user_multiple_regex(comment, regexList):
    """Substitute the ``<user>`` placeholder for every match of each pattern.

    The patterns in ``regexList`` are applied one after another, each on the
    output of the previous one; the final text is returned.
    """
    text = comment
    for pattern in regexList:
        text = pattern.sub("<user>", text)
    return text

In [29]:
def replace_css(comment, regex):
    """Return ``comment`` with every match of ``regex`` replaced by ``<css>``."""
    return regex.sub("<css>", comment)

In [19]:
def apply_stem(comment):
    """Stem every space-separated token of ``comment`` with the module-level ``stemmer``.

    Tokens are produced by splitting on a single space and re-joined with a
    single space, so the number of tokens is unchanged.
    """
    return " ".join(stemmer.stem(token) for token in comment.split(" "))

In [54]:
# Clean the training comments step by step; each step overwrites "comment_text".
print("Removing empty data...")
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment, which recent pandas versions warn about and
# which leaves the frame untouched once Copy-on-Write is enabled.
trainData["comment_text"] = trainData["comment_text"].fillna("<empty>")
print("Removing new line...")
trainData["comment_text"] = trainData["comment_text"].apply(lambda x: remove_new_line(x, regex_new_line))
print("Removing white spaces...")
trainData["comment_text"] = trainData["comment_text"].apply(lambda x: remove_mutiple_white_space(x, regex_multiple_white_space))
print("Removing images...")
trainData["comment_text"] = trainData["comment_text"].apply(lambda x: replace_image_multiple_regex(x, regex_list_image))
print("Removing links...")
trainData["comment_text"] = trainData["comment_text"].apply(lambda x: replace_link_multiple_regex(x, regex_list_link))
print("Removing users...")
trainData["comment_text"] = trainData["comment_text"].apply(lambda x: replace_user_multiple_regex(x, regex_list_user))
print("Removing css...")
trainData["comment_text"] = trainData["comment_text"].apply(lambda x: replace_css(x, regex_css))
print("Done!")

Removing empty data...
Removing new line...
Removing white spaces...
Removing images...
Removing links...
Removing users...
Removing css...
Done!


In [42]:
# Manual check of regex_css on a wiki-table header: print the sample before and after
# replace_css() so the substitutions can be inspected by eye.
testString = "{| style=\"\"background-color:#F5FFFA; padding:0;\"\" cellpadding=\"\"0\"\" style=\"\"border:1px solid #084080; background-color:#F5FFFA; vertical-align:top; color:#000000;\"\"|"
print(testString)
print(replace_css(testString, regex_css))

{| style=""background-color:#F5FFFA; padding:0;"" cellpadding=""0"" style=""border:1px solid #084080; background-color:#F5FFFA; vertical-align:top; color:#000000;""|
{| <css> <css> <css>|


In [77]:
# Clean the test comments with the same sequence of steps used for the training comments,
# so that both sets go through identical preprocessing.
print("Removing empty data...")
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment, which recent pandas versions warn about and
# which leaves the frame untouched once Copy-on-Write is enabled.
testData["comment_text"] = testData["comment_text"].fillna("<empty>")
print("Removing new line...")
testData["comment_text"] = testData["comment_text"].apply(lambda x: remove_new_line(x, regex_new_line))
print("Removing white spaces...")
testData["comment_text"] = testData["comment_text"].apply(lambda x: remove_mutiple_white_space(x, regex_multiple_white_space))
print("Removing images...")
testData["comment_text"] = testData["comment_text"].apply(lambda x: replace_image_multiple_regex(x, regex_list_image))
print("Removing links...")
testData["comment_text"] = testData["comment_text"].apply(lambda x: replace_link_multiple_regex(x, regex_list_link))
print("Removing users...")
testData["comment_text"] = testData["comment_text"].apply(lambda x: replace_user_multiple_regex(x, regex_list_user))
# The CSS step was applied to the training data only; without it the two sets contain
# different tokens for the same markup.
print("Removing css...")
testData["comment_text"] = testData["comment_text"].apply(lambda x: replace_css(x, regex_css))
print("Done!")

Removing empty data...
Removing new line...
Removing white spaces...
Removing images...
Removing links...
Removing users...
Saving csv...
Saved...


In [56]:
# Add a stemmed copy of each cleaned training comment (the test-set variant stays disabled).
#testData["stemmed_text"] = testData["comment_text"].apply(lambda x: apply_stem(x));
trainData["stemmed_text"] = trainData["comment_text"].apply(apply_stem)

In [55]:
# Persist the cleaned training frame next to the notebook, without the index column.
# The equivalent export for the test frame is currently disabled.
#testData.to_csv("test_cleaned.csv", encoding = "utf-8", index = False)
trainData.to_csv("train_cleaned.csv", encoding = "utf-8", index = False)

#### Data Cleaning Methods

In [None]:
def clean_comment(comment): # cleaning to be filled up
    """Normalise one raw comment cell to a string.

    Parameters
    ----------
    comment : object
        Raw cell value: normally a ``str``, possibly ``None`` or a NaN float
        for missing data.

    Returns
    -------
    str
        ``""`` for ``None``/NaN, otherwise ``str(comment)``.
    """
    if comment is None:
        return ""
    try:
        if math.isnan(comment):
            return ""
    except TypeError:
        # math.isnan() only accepts real numbers; a string (the normal case)
        # lands here and is simply passed through below.
        pass
    return str(comment)

### Textstat Features

In [None]:
def average_syllable(comment):
    """Return the mean ``textstat.syllable_count`` over the space-separated words.

    Words for which ``textstat.syllable_count`` raises are skipped. When no
    word could be counted the function returns ``0.0``.
    """
    syllable_count = []
    for word in comment.split(" "):
        try:
            syllable_count.append(textstat.syllable_count(word))
        except Exception:
            # Best effort: skip words textstat cannot handle, but let
            # KeyboardInterrupt/SystemExit through (a bare except would not).
            continue

    if len(syllable_count) == 0:
        return 0.0
    return np.average(syllable_count)

### Emotion Features

#### Simple translator to remove all punctuation

In [None]:
# Translation table for str.translate() that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)

#### To get the average emotion score

In [None]:
def avg_emotion(comment, dict_):
    """Average the lexicon scores of the words in ``comment``.

    Parameters
    ----------
    comment : str
        Text to score. Punctuation is stripped with the module-level
        ``translator`` and words are lower-cased before lookup. A non-string
        value is treated as an empty comment.
    dict_ : dict
        Mapping from word to score; words that are missing (or mapped to
        ``None``) are ignored.

    Returns
    -------
    float
        Mean score of the matched words, or ``0.0`` when nothing matched.
    """
    try:
        comment = comment.translate(translator)
    except (AttributeError, TypeError):
        # Non-string input (e.g. NaN) has no usable .translate(); score it as empty text.
        comment = ""

    score_lst = []
    for word in comment.split(" "):
        score = dict_.get(word.lower())  # single lookup per word
        if score is not None:
            score_lst.append(score)

    if len(score_lst) == 0:
        return 0.0
    return np.average(score_lst)

### Boolean Features

In [5]:
def check_file(comment):
    """Return 1 if ``comment`` mentions an image file extension, else 0.

    The check is a case-insensitive substring search for ".jpg", ".png",
    ".jpeg", ".gif" or ".tiff". Every extension includes its leading dot so
    that ordinary words (e.g. "plaintiff") are not mistaken for file names.
    """
    extensions = (".jpg", ".png", ".jpeg", ".gif", ".tiff")
    lowered = comment.lower()  # lower-case once instead of once per extension
    return int(any(extension in lowered for extension in extensions))

### Defining the Classes 

#### Data File Class

In [None]:
class DataFile:
    """Holds a DataFrame loaded from a CSV file together with the path it came from."""

    def __init__(self, path):
        """Read ``path`` with pandas and replace NaN cells with empty strings."""
        self.__path = path
        self.__df = pd.read_csv(path).replace(np.nan, '', regex=True)

    def dataCleaning(self):
        """Run ``clean_comment`` over the "comment_text" column and store the result in place."""
        cleaned = self.__df.loc[:, "comment_text"].apply(clean_comment)
        self.__df.loc[:, "comment_text"] = cleaned

    def getDF(self):
        """Return the wrapped DataFrame (the same object, not a copy)."""
        return self.__df

    def updateDF(self, new_df):
        """Replace the wrapped DataFrame with ``new_df``."""
        self.__df = new_df
    

#### Feature Engineering Class 

In [None]:
class FeaturesEngineer:
    """Adds hand-crafted feature columns to a DataFrame that has a "comment_text" column.

    The emotion lexicon is read once at construction time. It must be a
    tab-separated file with the columns "term", "score" and "AffectDimension";
    rows whose dimension is "anger", "fear", "joy" or "sadness" are kept both as
    DataFrames (``self.anger`` ...) and as word -> score dictionaries
    (``self.anger_dict`` ...).

    Every ``create*``/``*Features`` method adds its columns to the frame it is
    given and returns that same frame.
    """

    def __init__(self, emotion_lexicon_path):
        self.emotion_lexicon_path = emotion_lexicon_path
        self.emotion_lexicon = pd.read_csv(self.emotion_lexicon_path, sep = "\t")

        self.anger = self._rows_for("anger")
        self.fear = self._rows_for("fear")
        self.joy = self._rows_for("joy")
        self.sadness = self._rows_for("sadness")

        self.anger_dict = self._term_scores(self.anger)
        self.fear_dict = self._term_scores(self.fear)
        self.joy_dict = self._term_scores(self.joy)
        self.sadness_dict = self._term_scores(self.sadness)

    def _rows_for(self, dimension):
        """Return the lexicon rows whose "AffectDimension" equals ``dimension``."""
        mask = self.emotion_lexicon.loc[:, "AffectDimension"] == dimension
        return self.emotion_lexicon.loc[mask, :]

    @staticmethod
    def _term_scores(rows):
        """Build a term -> score dict from lexicon rows (a later duplicate term wins)."""
        # Column-wise zip replaces the former per-row .iloc loop, which was very slow.
        return dict(zip(rows.loc[:, "term"], rows.loc[:, "score"]))

    def createTextStatFeatures(self, df):
        """Add length, syllable, sentence and readability columns computed with textstat."""

        df.loc[:,'comment_len'] = df.loc[:,'comment_text'].apply(lambda x: len(x))
        df.loc[:,'comment_avg_syllable'] = df.loc[:,'comment_text'].apply(lambda x: average_syllable(x))
        df.loc[:,'comment_syllable'] = df.loc[:,'comment_text'].apply(lambda x: textstat.syllable_count(x))
        df.loc[:,'comment_num_sent'] = df.loc[:, 'comment_text'].apply(lambda x: textstat.sentence_count(x))
        # NOTE(review): assumes textstat.sentence_count never returns 0 -- confirm for empty comments.
        df.loc[:, "comment_word_per_sent"] = df.loc[:, "comment_text"].apply(lambda x: textstat.lexicon_count(x) / textstat.sentence_count(x))
        df.loc[:,'comment_flesch_reading_ease'] = df.loc[:,'comment_text'].apply(lambda x: textstat.flesch_reading_ease(x))
        df.loc[:,'comment_flesch_kincaid_grade'] = df.loc[:,'comment_text'].apply(lambda x: textstat.flesch_kincaid_grade(x))

        return df

    def createEmotionFeatures(self, df):
        """Add the average anger/fear/joy/sadness score of each comment."""

        df.loc[:, "avg_anger"] = df.loc[:, "comment_text"].apply(lambda x: avg_emotion(x, self.anger_dict))
        df.loc[:, "avg_fear"] = df.loc[:, "comment_text"].apply(lambda x: avg_emotion(x, self.fear_dict))
        df.loc[:, "avg_joy"] = df.loc[:, "comment_text"].apply(lambda x: avg_emotion(x, self.joy_dict))
        df.loc[:, "avg_sadness"] = df.loc[:, "comment_text"].apply(lambda x: avg_emotion(x, self.sadness_dict))

        return df

    def booleanFeatures(self, df):
        """Add 0/1 indicator columns.

        The original definition was an incomplete stub (``def booleanFeatures()``
        with no colon or body), which made the whole class a syntax error. It now
        applies ``check_file`` to every comment and stores the result in a
        "has_file" column.
        NOTE(review): the column name "has_file" was chosen here -- rename if a
        downstream consumer expects something else.
        """
        df.loc[:, "has_file"] = df.loc[:, "comment_text"].apply(check_file)

        return df

    def intensifiersFeatures(self, df):
        """Placeholder: copies "comment_text" into "emphasized_words" unchanged.

        Returns the frame, like the other feature methods (it previously
        returned ``None``).
        """

        df.loc[:, "emphasized_words"] = df.loc[:, "comment_text"].apply(lambda x: x)

        return df


### Defining the Evaluation Metric

In [None]:
def loss_func(y_label, y_proba):
    """Evaluation metric: delegate to sklearn's ``log_loss(y_label, y_proba)`` and return its value."""
    return log_loss(y_label, y_proba)

### Classification pipeline

#### Function to get the engineered features 

In [None]:
def get_manual_features(df):
    """Return a copy of ``df`` without the id, text and label columns.

    Whatever columns remain are treated as the engineered features. ``df``
    itself is left untouched.
    """
    non_feature_columns = [
        "id",
        "comment_text",
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
    return df.drop(non_feature_columns, axis = 1)
    

#### Turning to FT

In [None]:
# Wrap get_manual_features as a scikit-learn transformer; validate=False is passed so the
# input is handed to the function without sklearn's array validation.
get_manual_features_ft = FunctionTransformer(get_manual_features, validate = False)

### To do grid search for the best C 

#### Splitting the data into n folds for cross-validation

In [None]:
def split_data(num, cv):
    """Randomly partition the row indices ``0 .. num-1`` into ``cv`` folds.

    Parameters
    ----------
    num : int
        Number of rows.
    cv : int
        Number of folds. The first ``cv - 1`` folds hold ``num // cv`` indices
        each; the last fold takes whatever is left.

    Returns
    -------
    (index_train, index_test) : tuple of two lists
        ``index_test[i]`` is the list of held-out indices of fold ``i`` (random
        order); ``index_train[i]`` is every other index in ascending order.
        The test folds are disjoint and together cover all indices.

    Notes
    -----
    Uses the global ``random`` state; call ``random.seed`` beforehand for
    reproducible folds.
    """
    num_points = num // cv
    # One shuffle followed by slicing replaces the repeated "sample from what is left"
    # passes, which rescanned a growing list for every index.
    shuffled = random.sample(range(num), num)

    index_train = []
    index_test = []
    for i in range(cv):
        start = i * num_points
        stop = num if i == cv - 1 else start + num_points
        picks = shuffled[start:stop]
        held_out = set(picks)  # O(1) membership tests below
        index_test.append(picks)
        index_train.append([x for x in range(num) if x not in held_out])

    return index_train, index_test

#### Getting the best C

In [None]:
def best_C(min_c, max_c, step, data, label, cv):
    """Cross-validate over a range of log2(C) exponents and return the best one.

    Parameters
    ----------
    min_c, max_c, step : numbers
        Passed to ``np.arange`` to build the exponents that are tried.
    data : pandas.DataFrame
        Rows to split; must contain the ``label`` column.
    label : str
        Name of the target column.
    cv : int
        Number of folds, passed to ``split_data``.

    Returns
    -------
    tuple
        ``(exponent, average_loss)`` for the exponent with the lowest mean
        ``loss_func`` value across the folds.

    Notes
    -----
    Relies on a module-level ``pipe`` object providing ``fit`` and
    ``predict_proba``; it is not defined in the cells visible here.
    NOTE(review): the candidate value ``2.0 ** c`` is never handed to ``pipe``,
    so every exponent is currently evaluated with the same model -- wire it in
    (e.g. via ``pipe.set_params``) once the pipeline's step names are known.
    The full rows, including the label columns, are passed to ``pipe`` as
    features; presumably the pipeline drops them itself -- confirm.
    """
    index_train, index_test = split_data(data.shape[0], cv)
    c_range = np.arange(min_c, max_c, step)
    loss_lst_compiled = []
    for c in c_range:
        # The former `param = "-c ... -s " + str(self.algo)` line was removed: it was
        # unused and raised NameError because `self` does not exist in this function.
        loss_lst = []
        for train_, test_ in zip(index_train, index_test):
            rows_train = data.iloc[train_, :]
            rows_test = data.iloc[test_, :]

            pipe.fit(rows_train, rows_train.loc[:, label])
            proba = pipe.predict_proba(rows_test)

            loss_lst.append(loss_func(rows_test.loc[:, label], proba))

        loss_lst_compiled.append(np.average(loss_lst))

    # Stable sort by loss: on ties the smaller exponent (earlier in c_range) wins.
    ranked = sorted(zip(c_range, loss_lst_compiled), key=lambda x: x[1])
    return ranked[0]


### Defining the main function 

In [None]:
def main():
    """Run the C search for every label and print per-label and average log loss."""

    labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # `trainData` is a plain DataFrame after the cleaning cells above, but a DataFile
    # wrapper when the (currently commented-out) setup in the __main__ cell is enabled.
    # Accept both instead of failing with AttributeError on `.getDF()`.
    train_df = trainData.getDF() if hasattr(trainData, "getDF") else trainData

    loss_compiled = []
    for label in labels:
        best_c, best_rate = best_C(-5,5,1, train_df, label, 5)
        print('Log loss for "{}" based on 5 fold CV: {}, with log2c = {}'.format(label, best_rate, best_c))
        loss_compiled.append(best_rate)

    print(loss_compiled)
    avg_loss = np.average(loss_compiled)
    print('Average Log loss = {}'.format(avg_loss))
    

#### Running the main function 

In [None]:
if __name__ == "__main__":
    # Entry point of the notebook. The feature-engineering steps are commented out, so
    # the three print() calls below report progress for work that is not performed in
    # this cell; only main() actually runs.
    
    #extractor = FeaturesEngineer(emotion_lexicon_path)
    #trainData = DataFile(path = train_path)
    print("Done reading in train data")
    #testData = DataFile(path= test_path)
    #print("Done reading in test data")
    
    #trainData.updateDF(extractor.createEmotionFeatures(trainData.getDF()))
    print("Done creating emotion features for train data")
    #testData.updateDF(extractor.createEmotionFeatures(testData.getDF()))
    #print("Done creating emotion features for test data")
    
    #trainData.updateDF(extractor.createTextStatFeatures(trainData.getDF()))
    print("Done creating textstat features for train data")
    #testData.updateDF(extractor.createTextStatFeatures(testData.getDF()))
    #print("Done creating textstat features for test data")
    
    main()
    

In [None]:
# Cache the training frame with its engineered features. `trainData` is a plain
# DataFrame unless the DataFile-based setup is enabled, so unwrap it only when needed.
train_df_to_cache = trainData.getDF() if hasattr(trainData, "getDF") else trainData
train_df_to_cache.to_csv("../data/features/train_data_cache.csv", index = False)

In [None]:
# The original cell ended with a dangling `max_iter=` (a syntax error). 100 is
# scikit-learn's documented default, so this builds the estimator with default behaviour;
# raise it if the solver reports convergence warnings.
LogisticRegression(max_iter=100)