In [9]:
# import os
# print(os.listdir("../input/tinyversions"))

import numpy as np 
import pandas as pd 
import string

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer



In [2]:
# The six binary labels to predict; each one is read as a column of `train`.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Local copies of the data. When running on Kaggle the same files are at
#   ../input/tinyversions/tiny_train.csv                                   (small sample)
#   ../input/jigsaw-toxic-comment-classification-challenge/train.csv      (full train)
#   ../input/jigsaw-toxic-comment-classification-challenge/test.csv       (test)
# Missing values are replaced with a single space.
train = pd.read_csv('data/tiny_train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

# Bad-word lexicon: first column of a header-less CSV, as a plain Python list.
badwords = pd.read_csv('data/bad-words.csv', header=None).iloc[:, 0].tolist()

# Raw comment text for each split, plus both splits stacked together.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [3]:
# Word-level TF-IDF: unigrams only, vocabulary capped at 20,000 terms.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
)

# Character-level TF-IDF: 1- to 4-character n-grams, capped at 30,000 terms.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=30000,
    strip_accents='unicode',
    sublinear_tf=True,
)


In [4]:
# Parts of Speech Tag Count
class PoS_TagFeatures(TransformerMixin, BaseEstimator):
    """Stateless transformer that counts nouns, adjectives and verbs per document.

    Each document is tokenised on whitespace, stripped of ASCII punctuation and
    tagged with ``nltk.pos_tag``. ``transform`` returns one dict per document so
    the result can be fed straight into a ``DictVectorizer``.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    which scikit-learn needs in order to clone the transformer.
    """

    # Tags (as returned by ``pos_tag``) that are counted in each group.
    NOUN_TAGS = frozenset(('NN', 'NNP', 'NNPS', 'NNS'))
    ADJECTIVE_TAGS = frozenset(('JJ', 'JJR', 'JJS'))
    VERB_TAGS = frozenset(('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'))

    # Translation table that deletes every character in string.punctuation.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def tag_PoS(self, text):
        """Return ``[noun_count, adjective_count, verb_count]`` for one document.

        Parameters
        ----------
        text : str
            The raw document.

        Returns
        -------
        list of int
            Counts of tokens tagged as noun, adjective and verb, in that order.
        """
        # split() with no argument splits on any run of whitespace, so words
        # separated by newlines or tabs are not glued into a single token.
        tokens = [tok.translate(self._PUNCT_TABLE) for tok in text.split()]
        # Tokens made only of punctuation are empty after stripping; drop them.
        tokens = [tok for tok in tokens if tok]

        tags = [tag for _, tag in pos_tag(tokens)]
        noun_count = sum(tag in self.NOUN_TAGS for tag in tags)
        adjective_count = sum(tag in self.ADJECTIVE_TAGS for tag in tags)
        verb_count = sum(tag in self.VERB_TAGS for tag in tags)
        return [noun_count, adjective_count, verb_count]

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, posts):
        """Map each document in ``posts`` to a dict of PoS counts.

        Returns a list of dicts with keys ``'nouns'``, ``'adjectives'`` and
        ``'verbs'``, one per input document and in input order.
        """
        return [{'nouns': counts[0],
                 'adjectives': counts[1],
                 'verbs': counts[2]}
                for counts in map(self.tag_PoS, posts)]
    
# Chain the PoS counter with a DictVectorizer so the per-document dicts are
# turned into a dense numeric matrix.
posTag_vectorizer = Pipeline(steps=[
    ('parts_of_speech', PoS_TagFeatures()),
    ('dictVect', DictVectorizer(sparse=False)),
])

In [15]:
# Bad Words Occurrence Count
class BadWords_Features(TransformerMixin, BaseEstimator):
    """Stateless transformer that counts bad-word occurrences per document.

    The lexicon is read from the module-level ``badwords`` list. ``transform``
    returns one dict per document so the result can be fed straight into a
    ``DictVectorizer``.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    which scikit-learn needs in order to clone the transformer.
    """

    def badWordCount(self, text):
        """Return the raw and length-normalised bad-word counts for one document.

        Parameters
        ----------
        text : str
            The raw document.

        Returns
        -------
        list
            ``[count, count / number_of_words, count / number_of_characters]``.
            A ratio is reported as 0.0 when its denominator is zero.
        """
        # NOTE: str.count matches substrings case-sensitively, so a lexicon
        # entry is also counted when it appears inside a longer word.
        badCount = sum(text.count(w) for w in badwords)

        # Empty or whitespace-only documents have no words (and possibly no
        # characters); guard the divisions instead of raising ZeroDivisionError.
        n_words = len(text.split())
        n_chars = len(text)
        per_word = badCount / n_words if n_words else 0.0
        per_char = badCount / n_chars if n_chars else 0.0
        return [badCount, per_word, per_char]

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, posts):
        """Map each document in ``posts`` to a dict of bad-word features.

        Returns a list of dicts with keys ``'badwordcount'``,
        ``'normByTotalWords'`` and ``'normByTotalChars'``, one per input
        document and in input order.
        """
        return [{'badwordcount': badCounts[0],
                 'normByTotalWords': badCounts[1],
                 'normByTotalChars': badCounts[2]}
                for badCounts in map(self.badWordCount, posts)]
    
# Chain the bad-word counter with a DictVectorizer so the per-document dicts
# are turned into a dense numeric matrix.
badWord_vectorizer = Pipeline(steps=[
    ('bad_words', BadWords_Features()),
    ('dictVect', DictVectorizer(sparse=False)),
])


In [46]:

class ExtractedFeatures(TransformerMixin, BaseEstimator):
    """Stateless transformer that extracts simple surface statistics per document.

    ``transform`` returns one dict per document so the result can be fed
    straight into a ``DictVectorizer``.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    which scikit-learn needs in order to clone the transformer.
    """

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, posts):
        """Map each document in ``posts`` to a dict of surface statistics.

        Returns a list of dicts, one per input document and in input order,
        with keys:

        * ``'length'`` - number of characters in the document;
        * ``'num_sentences'`` - number of ``'.'`` characters, used as a rough
          stand-in for the sentence count.
        """
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]
    
# Chain the surface-statistics extractor with a DictVectorizer so the
# per-document dicts are turned into a dense numeric matrix.
featureVectorizer = Pipeline(steps=[
    ('extractor', ExtractedFeatures()),
    ('dictVect', DictVectorizer(sparse=False)),
])

In [16]:
# Stack the word TF-IDF, character TF-IDF, PoS-count and bad-word features
# side by side into a single feature matrix.
combined_features = FeatureUnion([
    ("word", word_vectorizer),
    ("char", char_vectorizer),
    ("pos_tags", posTag_vectorizer),
    ("bad_word", badWord_vectorizer),
])

# fit_transform fits every sub-transformer and produces the training matrix in
# one pass, instead of processing the corpus once in fit() and again in
# transform(). None of the sub-transformers use labels, so no y is passed.
train_features = combined_features.fit_transform(train_text)

print(train_features.shape)

(20, 6806)


In [None]:
# Project the test comments into the feature space that `combined_features`
# was fitted on; the prediction step below needs this matrix.
test_features = combined_features.transform(test_text)

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for clas in classes:
    train_target = train[clas]
    # 'sag' is a stochastic solver; fix the seed so repeated runs of the
    # notebook give the same scores and predictions.
    classifier = LogisticRegression(solver='sag', random_state=0)

    # Mean ROC-AUC over 3 cross-validation folds for this label.
    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(clas, cv_score))

    # Refit on the full training set, then store the positive-class
    # probability for every test comment.
    classifier.fit(train_features, train_target)
    submission[clas] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))
submission.head()

In [None]:
# Write the per-class probabilities to 'submission.csv' in the working
# directory, without the DataFrame index column.
submission.to_csv('submission.csv', index=False)