In [5]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import *

In [None]:
# Column that holds the comment text fed to the preprocessors/vectorizers.
COMMENT = 'comment_text'

# Target columns; one classifier is fitted per label further down.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Problem understanding

# Dataset generation

In [None]:
# Read the four competition CSV files, in order; the paths are relative to
# the notebook's working directory.
train, test, test_labels, sample_submission = map(pd.read_csv, (
    'jigsaw-toxic-comment-classification-challenge/train.csv',
    'jigsaw-toxic-comment-classification-challenge/test.csv',
    'jigsaw-toxic-comment-classification-challenge/test_labels.csv',
    'jigsaw-toxic-comment-classification-challenge/sample_submission.csv',
))

# Metric definition

# Validation strategy

# Data processing (extract useful information)

In [16]:
import re
import textwrap
import unicodedata
import warnings
from string import ascii_letters, digits, punctuation

from bs4 import BeautifulSoup

# Silence UserWarning messages whose originating module matches 'bs4'
# (BeautifulSoup), so they do not flood the cell output.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')


class BaseFilter:
    """Identity filter at the root of the filter hierarchy.

    Calling an instance returns the text unchanged. The class mainly exists
    to supply a readable ``__repr__`` for the interactive Python prompt.
    """

    def __call__(self, text: str) -> str:
        # No transformation: hand the input straight back.
        return text

    def __repr__(self):
        return '<{}>'.format(type(self).__name__)


class GeneralFilter(BaseFilter):
    """Regex-substitution filter.

    The methods read ``self.pattern`` (an object with a ``sub`` method, such
    as a compiled regular expression) and ``self.repl`` (the replacement),
    neither of which is set here; subclasses assign them.
    """

    def __call__(self, s: str, count: int = 0):
        """Return ``s`` with matches of ``self.pattern`` replaced by ``self.repl``.

        ``count`` is forwarded unchanged to ``self.pattern.sub``.
        """
        replacement = self.repl
        return self.pattern.sub(replacement, s, count)

    def __repr__(self):
        return '<{} repl=`{}`>'.format(type(self).__name__, self.repl)


class EmailFilter(GeneralFilter):
    """Replace substrings that look like e-mail addresses.

    ``replace`` is either a fixed replacement string or an iterator; with an
    iterator, every match is replaced by the iterator's next item.
    """

    def __init__(self, replace=' '):
        self.expression = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
        self.pattern = re.compile(self.expression)
        # Anything that is not a plain string is treated as an iterator of
        # replacement tags and consumed one item per match.
        self.repl = replace if isinstance(replace, str) else (lambda match: next(replace))


class URLFilter(GeneralFilter):
    """Replace URL-like substrings.

    A match is one of the scheme prefixes below (or ``www.``) followed by one
    or more characters drawn from ``valid_uri_characters``.

    Parameters
    ----------
    replace:
        Either a fixed replacement string or an iterator; with an iterator,
        every match is replaced by the iterator's next item.
    allow_common_typoes:
        When true, the lenient scheme list is used, which also accepts
        ``http``/``https`` written without the leading ``h`` (``ttp://``).
        When false, only the exact scheme names are accepted.
    """

    # copied from string_constants.py
    # Regex fragments. Raw strings keep the ``\/`` escapes without relying on
    # Python's deprecated handling of unknown string escapes, and the dot in
    # ``www.`` is escaped so that it only matches a literal dot.
    schemes = [r'{}:\/\/'.format(name) for name in
               ("http", "https", "ftp", "sftp", "irc", "magnet", "file", "data")] + [r'www\.']
    schemes_lenient = [r'{}:\/\/'.format(name) for name in
                       ("h?ttp", "h?ttps", "ftp", "sftp", "irc", "magnet", "file", "data")] + [r'www\.']

    # from RFC
    gen_delims = ":/?#[]@"
    sub_delims = "!$&'()*+;=%"
    reserved = gen_delims + sub_delims
    unreserved = ascii_letters + digits + "-._~"
    valid_uri_characters = reserved + unreserved

    def __init__(self, replace=' ', allow_common_typoes=True):

        if isinstance(replace, str):
            self.repl = replace
        else:
            self.repl = lambda match: next(replace) # for iterator tag

        # Tolerating typos means using the lenient prefixes; the strict list
        # is the fallback when typos are not allowed.
        scheme_alternatives = self.schemes_lenient if allow_common_typoes else self.schemes
        self.expression = r"((?:{})[{}]+)".format("|".join(scheme_alternatives),
                                                  re.escape(self.valid_uri_characters))
        self.pattern = re.compile(self.expression)


class EscapeCharFilter(GeneralFilter):
    """Replace carriage-return, newline, tab and vertical-tab characters."""

    def __init__(self, replace:str=' '):
        # The control characters are embedded literally inside a character class.
        control_chars = '\r\n\t\v'
        self.expression = r'([{}])'.format(control_chars)
        self.pattern = re.compile(self.expression)
        self.repl = replace


class InvisableCharFilter(GeneralFilter):
    """Replace U+00A0 (no-break space), U+00AD (soft hyphen) and U+200C
    (zero-width non-joiner); by default they are removed outright.
    """

    def __init__(self, replace:str='', ):
        # Kept as regex escapes (raw string) and resolved by the ``re`` module.
        invisible_chars = r'\xa0\xad\u200c'
        self.expression = r'([{}])'.format(invisible_chars)
        self.pattern = re.compile(self.expression)
        self.repl = replace


class KeyWordsFilter(GeneralFilter):
    """Replace every occurrence of the given keywords.

    Each keyword is inserted into the pattern as-is (no escaping), wrapped in
    its own group, and the groups are joined into a single alternation.
    """

    def __init__(self, replace:str=' ', keywords=('mailto',)):
        grouped_keywords = '|'.join('({})'.format(kw) for kw in keywords)
        self.expression = r'({})'.format(grouped_keywords)
        self.pattern = re.compile(self.expression)
        self.repl = replace


class MultiSpacesFilter(GeneralFilter):
    """Collapse each run of whitespace into ``replace`` (a single space by default)."""

    def __init__(self, replace:str=' '):
        whitespace_run = r'(\s+)'
        self.expression = whitespace_run
        self.pattern = re.compile(whitespace_run)
        self.repl = replace


class FilterNonChars(GeneralFilter):
    """Replace each run of non-word characters (regex ``\\W+``) with ``replace``."""

    def __init__(self, replace:str=' '):
        non_word_run = r'(\W+)'
        self.expression = non_word_run
        self.pattern = re.compile(non_word_run)
        self.repl = replace
        
        
class HTMLFilter(GeneralFilter):
    """Replace anything between ``<`` and the next ``>`` (tag-like spans);
    by default the span is removed.
    """

    def __init__(self, replace:str=''):
        tag_like = "<[^>]*>"
        self.expression = tag_like
        self.pattern = re.compile(tag_like)
        self.repl = replace

class HTML2Text:
    """Extract the text content of an HTML string with BeautifulSoup.

    Parameters
    ----------
    filters:
        Tag names whose elements are removed from the parsed document before
        the text is extracted.
    """

    def __init__(self, filters=('script', 'style', 'meta', 'noscript')):
        self.filters = filters

    def extract_text_from_html(self, html:str):
        """Parse ``html`` with the lxml parser and return its text.

        Errors raised while parsing are propagated to the caller.
        """
        soup = BeautifulSoup(html, features='lxml')
        for tag in soup(self.filters): # remove all javascript and stylesheet code
            tag.decompose()
        return soup.get_text()

    def __call__(self, html:str):
        """Best-effort extraction: return an empty string if extraction fails.

        Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt``
        and ``SystemExit`` still stop a long-running preprocessing job.
        """
        try:
            return self.extract_text_from_html(html)
        except Exception:
            return str()



class UnicodeStandarizer:
    """Callable that applies a Unicode normalization form to text.

    ``method`` is the form name passed to ``unicodedata.normalize``
    (NFD, NFC, NFKD or NFKC); the default is NFKC.
    """

    def __init__(self, method='NFKC'):
        self.method = method

    def __call__(self, text:str):
        form = self.method
        return unicodedata.normalize(form, text)


class Stripper:
    """Callable that trims leading and trailing whitespace from text."""

    @staticmethod
    def __call__(text:str):
        stripped = text.strip()
        return stripped


class Lowercasting:
    """Callable that converts text to lower case."""

    @staticmethod
    def __call__(text:str):
        lowered = text.lower()
        return lowered


class SequentialProcessor(BaseFilter):
    """Sequential processor: chains several callables into one text processor.

    The processors are applied in the order given; each one receives the
    output of the previous one.

    Usage::
        text_preprocessor = SequentialProcessor(
            UnicodeStandarizer(),
            HTML2Text(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
        )
    """

    def __init__(self, *processors):
        self.pipeline = processors

    def __repr__(self):
        if len(self.pipeline) == 1:
            # Show the single child itself rather than the repr of the
            # enclosing one-element tuple (which would read ``(<X>,)``).
            children_desc = repr(self.pipeline[0])
        else:
            # use textwrap to support nested processors
            children_desc = '\n' + ',\n'.join(
                textwrap.indent(repr(p), ' ' * 4)
                for p in self.pipeline)
        return '<{clsn} [{child}]>'.format(clsn=type(self).__name__,
                                           child=children_desc)

    def __call__(self, text: str):
        """Run ``text`` through every processor in order and return the result."""
        for processor in self.pipeline:
            text = processor(text)
        return text

class GeneralProcessor(SequentialProcessor):
    """Subclass of ``SequentialProcessor`` that adds no behavior of its own."""

class KeepWordSpace(GeneralFilter):
    """Replace every single character that is neither a word character nor
    whitespace, so only words and spacing remain.
    """

    def __init__(self, replace:str=' '):
        not_word_or_space = r'([^\w\s])'
        self.expression = not_word_or_space
        self.pattern = re.compile(not_word_or_space)
        self.repl = replace
        
class WordModelPreprocessor(GeneralProcessor):
    """Fixed pipeline: Unicode normalization, HTML-to-text, URL removal,
    escape-character removal, punctuation removal, lower-casing, whitespace
    collapsing and stripping, applied in that order.
    """

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            URLFilter(),
            EscapeCharFilter(),
            KeepWordSpace(),
            Lowercasting(),
            MultiSpacesFilter(),
            Stripper(),
        )

class LIDFilter(GeneralFilter):
    """Single-pass filter whose pattern is the alternation of the e-mail,
    URL, escape-character and non-word expressions, tried in that order.
    """

    def __init__(self, replace=' '):
        self.repl = replace
        # Instantiate the component filters only to borrow their expressions.
        component_filters = [EmailFilter(), URLFilter(), EscapeCharFilter(), FilterNonChars()]
        self.expression = '|'.join(component.expression for component in component_filters)
        self.pattern = re.compile(self.expression)

class TextPreprocessor(GeneralProcessor):
    """Fixed pipeline: Unicode normalization, HTML-to-text, invisible-character
    removal, the combined LID filter, lower-casing, keyword removal, whitespace
    collapsing and stripping, applied in that order.

    NOTE: a class with the same name is defined again in a later cell and
    replaces this definition once that cell has run.
    """

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
        )

In [19]:
class TextPreprocessor(GeneralProcessor):
    """Fixed pipeline: Unicode normalization, HTML-to-text, URL removal,
    e-mail removal, invisible-character removal, the combined LID filter,
    lower-casing, keyword removal, whitespace collapsing and stripping,
    applied in that order.
    """

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            URLFilter(),
            EmailFilter(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
        )

In [20]:
# ``Pool`` is not imported anywhere else in the notebook; import it here so
# the cell runs on a fresh kernel.
from multiprocessing import Pool

# Clean the comment column of both frames in parallel worker processes.
# The cleaned text overwrites the original column in place.
text_preprocessor = TextPreprocessor()
with Pool() as pool:
    train[COMMENT] = pool.map(text_preprocessor, train[COMMENT])
    test[COMMENT] = pool.map(text_preprocessor, test[COMMENT])

In [21]:
train.loc[0, COMMENT]  # preprocessed text of the row labelled 0

'explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalisms just closure on some gas after i voted at new york dolls fac and please don t remove the template from the talk page since i m retired now 89 205 38 27'

# Data understanding & visualization

# De-noising (without dropping data)

# Feature engineering

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features: the vectorizer is fitted on the training
# comments only and then reused to transform the test comments.
vec = TfidfVectorizer(
    max_features=16384,
)

x_train, x_test = vec.fit_transform(train[COMMENT]), vec.transform(test[COMMENT])

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF features over 2- to 4-character n-grams: fitted on
# the training comments only, then reused to transform the test comments.
vec = TfidfVectorizer(
    max_features=16384,
    analyzer='char',
    ngram_range=(2, 4),
)

x_train_char, x_test_char = vec.fit_transform(train[COMMENT]), vec.transform(test[COMMENT])

# Offline augmentation

# Standardization

# Scaling

# Normalization

# Feature selection

# Data selection

# Optimization

In [64]:
# baseline model with preprocess.

from sklearn.linear_model import LogisticRegression

# One independent classifier per label, trained on ``x_train``; the
# probability of the positive class becomes that label's submission column.
models = {}
for label in LABELS:
    models[label] = LogisticRegression()
    models[label].fit(x_train, train[label])
    sample_submission[label] = models[label].predict_proba(x_test)[:, 1]

sample_submission.to_csv('submission.csv', index=False)

# ~ 97.43% (0.01% improved)



In [65]:
# baseline model with preprocess, 2-4 char grams.

from sklearn.linear_model import LogisticRegression

# One independent classifier per label, trained on ``x_train_char``; the
# probability of the positive class becomes that label's submission column.
models = {}
for label in LABELS:
    models[label] = LogisticRegression()
    models[label].fit(x_train_char, train[label])
    sample_submission[label] = models[label].predict_proba(x_test_char)[:, 1]

sample_submission.to_csv('submission.csv', index=False)

# ~ 97.53% (0.1 % improved)



In [73]:
# 2-4 char grams with preprocess; LogisticRegression with C=3 and the saga solver.

from sklearn.linear_model import LogisticRegression

# One independent classifier per label, trained on ``x_train_char``; the
# probability of the positive class becomes that label's submission column.
models = {}
for label in LABELS:
    models[label] = LogisticRegression(C=3, n_jobs=-1, solver='saga')
    models[label].fit(x_train_char, train[label])
    sample_submission[label] = models[label].predict_proba(x_test_char)[:, 1]

sample_submission.to_csv('submission.csv', index=False)

# 97.55% 



# Parameter tuning

# Online augmentation

# Model selection / blending

# Post-processing

# Evaluation

# Reasoning

# Monitoring