## Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import string
import math
import random
from sklearn.metrics import log_loss

## Adding Liblinear to the path

In [2]:
paths = ["C:/Users/serena/Downloads/liblinear-2.20/python"] ## do change this :) 
for path in paths:
    if path not in sys.path:
        sys.path.insert(0, path)

## Importing the classifier

In [3]:
from liblinear import * 
from liblinearutil import *

## Stating the path of the datasets

#### BOW Features

In [4]:
train_data_bow_path = "../../data/data/train/train_data_bow.csv"

test_data_bow_path = "../../data/data/test/test_data_bow.csv"

#### Manual Features

In [5]:
train_data_manual_path = "../../data/data/train/train_data_manual.csv"

test_data_manual_path = "../../data/data/test/test_data_manual.csv"

## Some helper functions

#### Getting the log loss

In [6]:
def loss_func(y_label, y_proba):
    return log_loss(y_label, y_proba)

#### Creating the feature vectors

In [7]:
def get_feature_vector(bow_array, manual_array):
    
    num_obs = len(bow_array)
    max_bow_features = len(bow_array[0])
    features_compiled = []
    
    for i in range(0, num_obs):
        
        features = {}
        bow_obs = bow_array[i]
        manual_obs = manual_array[i]
        
        for index,value in enumerate(bow_obs):   
            if value != 0:
                features[(index + 1)] = value
        
        for index,value in enumerate(manual_obs):
            if value != 0:
                features[(index + 1 + max_bow_features)] = value
        
        features_compiled.extend([features])
    
    return features_compiled

#### Creating a range of values

In [8]:
def frange(start, end, step):
    val = []
    current = start
    val.extend([current])
    while current < end:
        add = current * step
        val.extend([add])
        current = add
    return val

#### Splitting the training data into k blocks

In [9]:
def generateSplits(num, nr_folds):
    
    samples_split = []
    
    choices = [i for i in range(0,num)]
    
    num_samples = int(num/ nr_folds)
    num_samples_last = num - (int(num/ nr_folds) * (nr_folds - 1))
    
    for i in range(0, nr_folds):
        
        if i == (nr_folds -1):
            picks = random.sample(choices, num_samples_last)
            samples_split.append(picks)
            choices = [x for x in choices if x not in picks]
        
        else:
            picks = random.sample(choices, num_samples)
            samples_split.append(picks)
            choices = [x for x in choices if x not in picks]

    return samples_split

## Creating data class

In [10]:
class data: 
    
    def __init__(self, path_bow, path_manual, type_):
        self.path_bow = path_bow
        self.path_manual = path_manual
        
        self.bow_df = pd.read_csv(path_bow)
        self.manual_df = pd.read_csv(path_manual)
        self.num_obs = self.bow_df.shape[0]
        self.type_ = type_
        
        if type_ == "train":
            
            self.bow = np.array(self.bow_df.iloc[:, 1:])
            self.manual = np.array(self.manual_df.iloc[:, 8:])
            self.features = get_feature_vector(self.bow, self.manual)
            
            self.toxic = list(self.manual_df.loc[:, "toxic"])
            self.severe_toxic = list(self.manual_df.loc[:, "severe_toxic"])
            self.obscene = list(self.manual_df.loc[:, "obscene"])
            self.threat = list(self.manual_df.loc[:, "threat"])
            self.insult = list(self.manual_df.loc[:, "insult"])
            self.identity_hate = list(self.manual_df.loc[:, "identity_hate"])
        
        elif type_ == "test":
            
            self.bow = np.array(self.bow_df.iloc[:, 1:])
            self.manual = np.array(self.manual_df.iloc[:, 2:])
            self.features = get_feature_vector(self.bow, self.manual)
            
    
    def getLabels(self, label):
        
        if label == "toxic":
            return self.toxic
        
        elif label == "severe_toxic":
            return self.severe_toxic
        
        elif label == "obscene":
            return self.obscene
        
        elif label == "threat":
            return self.threat
        
        elif label == "insult":
            return self.insult
        
        elif label == "identity_hate":
            return self.identity_hate
        
            

## Creating the classifier class

In [15]:
class Classifier:
    
    def __init__(self, algo, nr_folds, min_c, max_c, type_):
        
        self.algo = algo
        self.nr_folds = nr_folds
        self.min_c = min_c
        self.max_c = max_c
        self.c_range = frange(min_c, max_c, 2)
        self.type_ = type_
        
        self.best_c = 0
        self.best_rate = 0   
    
    def getBestC(self, data):
        
        type_ = self.type_
        features_ = data.features
        labels_ = data.getLabels(type_)
        
        samples_split = generateSplits(data.num_obs, self.nr_folds)
        print("done split")
        
        loss_master = []
        
        for c in self.c_range:
            
            loss = []
            
            for i in range(0, self.nr_folds):
                
                test_index = samples_split[i]
                train_index = [x for j in range(0, self.nr_folds) for x in samples_split[j] if j != i]
                
                train_labels = [labels_[x] for x in train_index]
                train_features = [features_[x] for x in train_index]
                
                test_labels = [labels_[x] for x in test_index]
                test_features = [features_[x] for x in test_index]
                
                param = "-c " + str(c) + " -s " + str(self.algo)
                model = train(train_labels, train_features, param)
                
                labels, acc, proba = predict(test_labels, test_features, model, "-b 1")
                
                index = model.get_labels().index(1)
                loss_current = loss_func(labels, [x[index] for x in proba])
                
                loss.extend([loss_current])
            
            loss_mean = np.mean(loss)
            loss_master.extend([loss_mean])
        
        results_zipped = [x for x in zip(self.c_range, loss_master)]
        results_zipped = sorted(results_zipped, key= lambda x: x[1], reverse= False)
        
        self.best_c = results_zipped[0][0]
        self.best_rate = results_zipped[0][1]
        
        print("best c for %s is %f with lowest log loss of: %f "%(label,self.best_c, self.best_rate))
        
    def trainModel(self, data): 
        
        self.getBestC(data)
        
        type_ = self.type_
        features_ = data.features
        labels_ = data.getLabels(type_)
        
        param = "-c " + str(self.best_c) + " -s " + str(self.algo)
        model = train(labels_, features_, param)
        
        print("Training model with label = %s"%type_)
        
        self.__model = model
    
    def predict(self, test_data):
        
        features_ = test_data.features
        labels_ = np.array([0] * len(features_))
        
        labels, acc, proba = predict(labels_, features_, self.__model, "-b 1")
        index = self.__model.get_labels().index(1)

        return [x[index] for x in proba], self.best_rate


## Defining the main function

#### Algo Codes

In [12]:
def main(train_data, test_data, classifier, label):
    
    print("Training for: %s"%label)
   
    classifier.trainModel(train_data)
    proba, log_loss = classifier.predict(test_data)
    print("")
    
    return proba, log_loss
    

In [None]:
if __name__ == "__main__":
    
    #train_data = data(train_data_bow_path, train_data_manual_path, "train")
    
    #test_data = data(test_data_bow_path, test_data_manual_path, "test")
    
    labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    classifiers = dict()
    loss = []
    
    results = test_data.manual_df.loc[:, "id"]
    
    proba_master = []
    
    for label in labels:
        
        classifiers[label] = Classifier(0, 7, 0.03125, 64, label)
        classifier = classifiers.get(label)
        
        proba_class,loss_class = main(train_data, test_data, classifier, label)
        proba_master.extend([proba_class])
        loss.extend([loss_class])
        
        results = pd.concat([results, pd.DataFrame(proba_class, columns = [label])], axis = 1)
        
    #results = results["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ]
    
    results.to_csv("../../data/output/output - LogReg.csv", index = False, float_format= "%.10f", encoding = "utf-8")
    
    print("Average log loss = %f"%np.average(loss))

Training for: toxic
done split
best c for toxic is 64.000000 with lowest log loss of: 0.071412 
Training model with label = toxic

Training for: severe_toxic
done split
best c for severe_toxic is 64.000000 with lowest log loss of: 0.017384 
Training model with label = severe_toxic

Training for: obscene
done split
best c for obscene is 64.000000 with lowest log loss of: 0.040114 
Training model with label = obscene

Training for: threat
done split
best c for threat is 64.000000 with lowest log loss of: 0.009765 
Training model with label = threat

Training for: insult
done split
best c for insult is 64.000000 with lowest log loss of: 0.048080 
Training model with label = insult

Training for: identity_hate
