## Importing the necessary libraries 

In [1]:
import pandas as pd
import numpy as np
from textstat.textstat import textstat
import sys
import os
import string
import math
import random
from sklearn.metrics import make_scorer

#### Adding the liblinear path

#### Importing the classifier

#### Importing sklearn libraries

In [2]:
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import FunctionTransformer

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier



## Stating the path of the data

#### Given Dataset

In [3]:
# Relative paths to the train and test CSV files of the given dataset.
train_path = "../data/data/train.csv"
test_path = "../data/data/test.csv"

#### Additional Data

In [4]:
# Relative path to the NRC Affect Intensity lexicon file; it is read as a
# tab-separated table by FeaturesEngineer.
emotion_lexicon_path = "../data/features/NRC-AffectIntensity-Lexicon.txt"

### Defining helper functions

#### Data Cleaning Methods

In [5]:
def clean_comment(comment):
    """Return ``comment`` as a string, mapping missing values to ``""``.

    Parameters
    ----------
    comment : object
        A raw cell value from the ``comment_text`` column. Usually a ``str``,
        but may be ``None`` or a float NaN when the cell was empty.

    Returns
    -------
    str
        ``""`` for ``None`` / NaN, otherwise ``str(comment)``.

    TODO: real text cleaning is still to be filled in here.
    """
    if comment is None:
        return ""
    # math.isnan only accepts real numbers and raises TypeError on strings,
    # so the NaN test must be restricted to floats (np.float64 subclasses float).
    if isinstance(comment, float) and math.isnan(comment):
        return ""
    return str(comment)

### Textstat Features

In [6]:
def average_syllable(comment):
    """Return the mean syllable count per space-separated token of ``comment``.

    Parameters
    ----------
    comment : str
        The comment text. It is split on single spaces, so consecutive spaces
        produce empty tokens which are also passed to textstat.

    Returns
    -------
    float
        Average of ``textstat.syllable_count`` over the tokens that could be
        counted, or ``0.0`` when no token could be counted.
    """
    syllable_counts = []
    for word in comment.split(" "):
        try:
            syllable_counts.append(textstat.syllable_count(word))
        except Exception:
            # Best effort: skip any token textstat cannot process, but do not
            # swallow KeyboardInterrupt / SystemExit like a bare except would.
            continue

    if not syllable_counts:
        return 0.0
    return np.average(syllable_counts)

### Emotion Features

#### Simple translator to remove all punctuation

In [7]:
# Translation table for str.translate that deletes every character listed in
# string.punctuation (ASCII punctuation) and leaves all other characters as is.
translator = str.maketrans('', '', string.punctuation)

#### To get the average emotion score

In [8]:
def avg_emotion(comment, dict_):
    """Return the mean lexicon score of the words in ``comment``.

    Punctuation is stripped with the module-level ``translator`` table, the
    text is lower-cased and split on any run of whitespace (spaces, tabs,
    newlines), and each resulting word is looked up in ``dict_``.

    Parameters
    ----------
    comment : str
        The comment text. Non-string values (e.g. a float NaN for a missing
        comment) are treated as containing no words.
    dict_ : dict
        Mapping ``word -> score`` for one emotion. Words that are absent, or
        mapped to ``None``, are ignored.

    Returns
    -------
    float
        Average score of the matched words, or ``0.0`` if nothing matched.
    """
    if not isinstance(comment, str):
        return 0.0

    # split() without arguments breaks on every kind of whitespace and never
    # yields empty tokens, so multi-line comments are tokenised word by word.
    words = comment.translate(translator).lower().split()
    scores = [score for score in map(dict_.get, words) if score is not None]

    if not scores:
        return 0.0
    return np.average(scores)

### Defining the Classes 

#### Data File Class

In [9]:
class DataFile:
    """A CSV file loaded into a pandas DataFrame.

    The frame is kept in a private attribute; read it with ``getDF`` and
    replace it with ``updateDF``.
    """

    def __init__(self, path):
        """Read the CSV at ``path`` and blank out every missing cell."""
        self.__path = path
        # Missing values (NaN) are replaced by empty strings right after loading.
        self.__df = pd.read_csv(path).replace(np.nan, '', regex=True)

    def dataCleaning(self):
        """Run ``clean_comment`` over every value of the ``comment_text`` column."""
        cleaned = self.__df.loc[:, "comment_text"].apply(lambda text: clean_comment(text))
        self.__df.loc[:, "comment_text"] = cleaned

    def getDF(self):
        """Return the DataFrame currently held by this object."""
        return self.__df

    def updateDF(self, new_df):
        """Replace the held DataFrame with ``new_df``."""
        self.__df = new_df
    

#### Feature Engineering Class 

In [10]:
class FeaturesEngineer:
    """Adds hand-crafted text features to a DataFrame of comments.

    Two feature families are produced, each written as new columns onto the
    frame that is passed in (the frame is modified in place and returned):

    * emotion features -- the average lexicon score of a comment's words for
      anger, fear, joy and sadness;
    * textstat features -- length, syllable, sentence and readability
      statistics computed with ``textstat``.

    Attributes
    ----------
    emotion_lexicon_path : path the lexicon was read from.
    emotion_lexicon : the full lexicon as a DataFrame.
    anger, fear, joy, sadness : lexicon rows of the given affect dimension.
    anger_dict, fear_dict, joy_dict, sadness_dict : ``{term: score}`` lookups
        built from the rows above.
    """

    def __init__(self, emotion_lexicon_path):
        """Load the tab-separated lexicon and build one lookup per emotion.

        The file must provide the columns ``term``, ``score`` and
        ``AffectDimension``.
        """
        self.emotion_lexicon_path = emotion_lexicon_path
        self.emotion_lexicon = pd.read_csv(self.emotion_lexicon_path, sep = "\t")

        self.anger = self._rows_for("anger")
        self.fear = self._rows_for("fear")
        self.joy = self._rows_for("joy")
        self.sadness = self._rows_for("sadness")

        self.anger_dict = self._term_scores(self.anger)
        self.fear_dict = self._term_scores(self.fear)
        self.joy_dict = self._term_scores(self.joy)
        self.sadness_dict = self._term_scores(self.sadness)

    def _rows_for(self, dimension):
        """Return the lexicon rows whose ``AffectDimension`` equals ``dimension``."""
        lexicon = self.emotion_lexicon
        return lexicon.loc[lexicon.loc[:, "AffectDimension"] == dimension, :]

    @staticmethod
    def _term_scores(rows):
        """Map each ``term`` in ``rows`` to its ``score`` (a repeated term keeps its last score)."""
        # Built from whole columns instead of row-by-row .iloc access, which
        # is far slower on a lexicon with thousands of entries.
        return dict(zip(rows.loc[:, "term"], rows.loc[:, "score"]))

    def createTextStatFeatures(self, df):
        """Add the ``comment_*`` textstat columns to ``df`` and return it.

        ``df`` must contain a ``comment_text`` column of strings; it is
        modified in place.
        """
        text = df.loc[:, 'comment_text']

        df.loc[:,'comment_len'] = text.apply(len)
        df.loc[:,'comment_avg_syllable'] = text.apply(average_syllable)
        df.loc[:,'comment_syllable'] = text.apply(textstat.syllable_count)
        df.loc[:,'comment_num_sent'] = text.apply(textstat.sentence_count)
        # Average number of words per sentence.
        df.loc[:, "comment_word_per_sent"] = text.apply(
            lambda x: textstat.lexicon_count(x) / textstat.sentence_count(x)
        )
        df.loc[:,'comment_flesch_reading_ease'] = text.apply(textstat.flesch_reading_ease)
        df.loc[:,'comment_flesch_kincaid_grade'] = text.apply(textstat.flesch_kincaid_grade)

        return df

    def createEmotionFeatures(self, df):
        """Add ``avg_anger``, ``avg_fear``, ``avg_joy`` and ``avg_sadness`` to ``df``.

        Each column holds ``avg_emotion`` of the row's ``comment_text`` against
        the matching lexicon lookup. ``df`` is modified in place and returned.
        """
        lexicons = (
            ("avg_anger", self.anger_dict),
            ("avg_fear", self.fear_dict),
            ("avg_joy", self.joy_dict),
            ("avg_sadness", self.sadness_dict),
        )
        for column, lexicon in lexicons:
            # Bind the lexicon as a default so each lambda keeps its own dict.
            df.loc[:, column] = df.loc[:, "comment_text"].apply(
                lambda x, lexicon=lexicon: avg_emotion(x, lexicon)
            )

        return df


### Defining the Evaluation Metric

In [11]:
def loss_func(y_label, y_proba):
    """Evaluation metric: the log loss of ``y_proba`` against ``y_label``.

    Both arguments are forwarded unchanged to ``sklearn.metrics.log_loss``
    and its result is returned.
    """
    score = log_loss(y_label, y_proba)
    return score

### Classification pipeline

#### Function to get the engineered features 

In [12]:
def get_manual_features(df):
    """Return ``df`` without the identifier, raw text and label columns.

    What remains are the engineered feature columns. Columns from the drop
    list that are not present are ignored, so the function also works on
    frames without label columns (such as a test set). ``df`` itself is not
    modified.
    """
    col = ["id", "comment_text", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ]
    # errors="ignore": a missing label column must not raise KeyError.
    df_sieved = df.drop(col, axis = 1, errors = "ignore")

    return df_sieved
    

#### Wrapping the function in a FunctionTransformer (FT)

In [13]:
# Wrap get_manual_features as a transformer so it can be used as a Pipeline
# step; validate = False turns off FunctionTransformer's input validation.
get_manual_features_ft = FunctionTransformer(get_manual_features, validate = False)

### To do grid search for the best C 

#### Splitting the data for n-fold CV

In [14]:
def split_data(num, cv, seed=None):
    """Randomly partition the row positions ``0 .. num-1`` into ``cv`` folds.

    Every fold except the last holds ``num // cv`` positions; the last fold
    also receives the remainder. Each position appears in exactly one test
    fold.

    Parameters
    ----------
    num : int
        Number of rows to split.
    cv : int
        Number of folds.
    seed : int, optional
        When given, the shuffle uses its own ``random.Random(seed)`` so the
        split is reproducible. When ``None`` the module-level ``random``
        generator is used.

    Returns
    -------
    (index_train, index_test) : tuple of two lists of ``cv`` lists
        ``index_test[i]`` holds the held-out positions of fold ``i`` (in
        random order) and ``index_train[i]`` all other positions in
        ascending order.
    """
    rng = random if seed is None else random.Random(seed)

    # Shuffle once and cut into consecutive slices instead of repeatedly
    # sampling from "everything not picked yet", which is quadratic in num.
    shuffled = list(range(num))
    rng.shuffle(shuffled)

    fold_size = num // cv
    index_train = []
    index_test = []

    for fold in range(cv):
        start = fold * fold_size
        # The last fold runs to the end so it absorbs the remainder.
        stop = num if fold == cv - 1 else start + fold_size
        picks = shuffled[start:stop]
        held_out = set(picks)  # set membership keeps the filter below linear
        remaining = [x for x in range(num) if x not in held_out]
        index_test.append(picks)
        index_train.append(remaining)

    return index_train, index_test

#### Getting the best C

In [15]:
def best_C(min_c, max_c, step, data, label, cv):
    """Grid-search the logistic-regression ``C`` with ``cv``-fold cross-validation.

    For every exponent ``c`` in ``np.arange(min_c, max_c, step)`` (``max_c``
    itself is excluded) a pipeline of ``get_manual_features_ft`` followed by
    ``LogisticRegression(solver="newton-cg", C=2.0 ** c)`` is fitted on each
    training fold and scored with ``loss_func`` on the matching held-out fold.

    Parameters
    ----------
    min_c, max_c, step : numbers
        Range of base-2 exponents to try.
    data : pandas.DataFrame
        Frame holding the features and the ``label`` column.
    label : str
        Name of the target column.
    cv : int
        Number of folds passed to ``split_data``.

    Returns
    -------
    tuple
        ``(c, average_loss)`` for the exponent with the lowest loss averaged
        over the folds.
    """
    index_train, index_test = split_data(data.shape[0], cv)
    c_range = np.arange(min_c, max_c, step)
    avg_losses = []

    for c in c_range:
        logreg = LogisticRegression(solver = "newton-cg", C = 2.0 ** c)
        pipe = Pipeline([
            ("getmanualfeatures", get_manual_features_ft),
            ("classifier", logreg)
        ])

        fold_losses = []
        for i in range(cv):
            # Slice each fold once and take the label column from that slice.
            features_train = data.iloc[index_train[i], :]
            labels_train = features_train.loc[:, label]

            features_test = data.iloc[index_test[i], :]
            labels_test = features_test.loc[:, label]

            pipe.fit(features_train, labels_train)
            # Only the probabilities are scored; hard predictions are not needed.
            proba = pipe.predict_proba(features_test)

            fold_losses.append(loss_func(labels_test, proba))

        avg_losses.append(np.average(fold_losses))

    # Rank the (exponent, loss) pairs by loss and return the best one.
    ranked = sorted(zip(c_range, avg_losses), key = lambda pair: pair[1])
    return ranked[0]


### Defining the main function 

In [None]:
def main():
    """Tune C for each of the six labels and print the cross-validated log loss.

    Reads the training frame from the module-level ``trainData`` object. For
    each label ``best_C`` is run over exponents -3 to 2 with 5 folds, the best
    result is printed, and finally the average of the six best losses is
    printed.
    """
    target_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    per_label_loss = []

    for target in target_columns:
        log2c, cv_loss = best_C(-3, 3, 1, trainData.getDF(), target, 5)
        print('Log loss for "{}" based on 5 fold CV: {}, with log2c = {}'.format(target, cv_loss, log2c))
        per_label_loss.append(cv_loss)

    print('Average Log loss = {}'.format(np.average(per_label_loss)))
    

#### Running the main function 

In [None]:
if __name__ == "__main__":
    
    # Build the emotion lookups once, then load the training set.
    extractor = FeaturesEngineer(emotion_lexicon_path)
    trainData = DataFile(path = train_path)
    print("Done reading in train data")
    # Test-set processing is currently disabled (here and in the steps below).
    #testData = DataFile(path= test_path)
    #print("Done reading in test data")
    
    # Each step adds feature columns to the frame and stores it back.
    trainData.updateDF(extractor.createEmotionFeatures(trainData.getDF()))
    print("Done creating emotion features for train data")
    #testData.updateDF(extractor.createEmotionFeatures(testData.getDF()))
    #print("Done creating emotion features for test data")
    
    trainData.updateDF(extractor.createTextStatFeatures(trainData.getDF()))
    print("Done creating textstat features for train data")
    #testData.updateDF(extractor.createTextStatFeatures(testData.getDF()))
    #print("Done creating textstat features for test data")
    
    # main() reads the module-level trainData assigned above.
    main()
    

Done reading in train data
Done creating emotion features for train data
Done creating textstat features for train data
