In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [2]:
# Load the competition's train and test splits straight from the zipped CSV files.
train, test = (
    pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'),
    pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'),
)

In [3]:
import string, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import TweetTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [5]:
# Shared text-processing helpers, created once and reused by preprocess() for every comment.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
# Stored as a set so the per-token stop-word membership test is a hash lookup.
stop_word = set(stopwords.words('english'))
# Note: this is a plain string, so `token in punctuation` is a substring test.
punctuation = string.punctuation

def preprocess(data):
    """Clean one raw comment and return its lemmatised tokens as a list of strings.

    The text is lower-cased, stripped of URLs, @mentions and '#' characters, and
    every remaining non-letter is replaced by a space before tokenising. Each token
    is then lemmatised as a verb, as a noun and as an adjective, in that order.
    Before each of those three steps the token's current form is discarded if it is
    a stop word or is found in `punctuation`.

    Args:
        data: the comment text (a string).

    Returns:
        List of lemmatised tokens, in their original order.
    """
    text = data.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub('[^a-zA-Z]', ' ', text)

    lemmas = []
    for token in tokenizer.tokenize(text):
        for pos in ("v", "n", "a"):
            # Re-check before every pass: the previous lemma may itself be a stop word.
            if token in stop_word or token in punctuation:
                break
            token = lemmatizer.lemmatize(token, pos)
        else:
            # The token survived all three passes without being filtered out.
            lemmas.append(token)
    return lemmas

In [6]:
# Skip the first two columns of the training frame; each remaining column is a target label.
labels = train.columns[2:]
# Label-only view of the training frame, indexed by label name in the training loop.
train_class = train[labels]

print(labels)

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')


In [7]:
# Word-level TF-IDF over unigrams and bigrams. Tokenisation is delegated to preprocess().
# token_pattern is set to None because scikit-learn ignores it whenever a custom tokenizer
# is supplied, and otherwise warns that the parameter will not be used.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,                # ignore terms that appear in fewer than 3 documents
    max_df=0.9,              # ignore terms that appear in more than 90% of documents
    tokenizer=preprocess,
    token_pattern=None,
    stop_words="english",    # scikit-learn's built-in list, applied to preprocess() output
    analyzer="word"
)
# Character-level TF-IDF over 2- to 5-character n-grams, capped at 50,000 features.
char_vectorizer = TfidfVectorizer(ngram_range=(2, 5), max_features=50000, analyzer="char")

# Comment text of each split, plus both splits stacked (train rows first) for fitting.
train_text, test_text = train["comment_text"], test["comment_text"]
all_text = pd.concat([train_text, test_text])

In [8]:
# Learn the word/bigram vocabulary and IDF weights from train and test text together,
# then encode each split separately with the fitted vectorizer.
word_vectorizer.fit(all_text)
train_word_vector, test_word_vector = (
    word_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)



In [9]:
# Learn the character n-gram vocabulary and IDF weights from train and test text together,
# then encode each split separately with the fitted vectorizer.
char_vectorizer.fit(all_text)
train_char_vector, test_char_vector = (
    char_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)

In [10]:
# Place the word features and the character features side by side for each split.
# format="csr" pins the result type: probability() selects rows with a boolean mask, which
# needs a row-indexable sparse format, and hstack's default output format is unspecified.
train_vector = hstack([train_word_vector, train_char_vector], format="csr")
test_vector = hstack([test_word_vector, test_char_vector], format="csr")

In [11]:
# Sanity check: the feature matrix and a label column should have the same number of rows.
for shape in (train_vector.shape, train_class["toxic"].shape):
    print(shape)

(159571, 520760)
(159571,)


In [12]:
def probability(train_vector, y_i, y):
    """Return the add-one smoothed per-feature average over rows labelled `y_i`.

    Args:
        train_vector: feature matrix that supports boolean-mask row indexing
            and `.sum(axis=0)`.
        y_i: the label value whose rows are selected.
        y: array of labels, one per row of `train_vector`.

    Returns:
        (column sums of the selected rows + 1) / (number of selected rows + 1).
    """
    selected = (y == y_i)
    feature_totals = train_vector[selected].sum(axis=0)
    n_selected = selected.sum()
    # Add one to numerator and denominator so a feature never gets a zero value.
    return (feature_totals + 1) / (n_selected + 1)

In [13]:
def NaiveBayes(train_vector, y):
    """Scale features by the log-ratio of their smoothed class-1 and class-0 averages.

    Args:
        train_vector: feature matrix; must provide `.multiply()` and be accepted
            by `probability()`.
        y: array of 0/1 labels, one per row of `train_vector`.

    Returns:
        Tuple `(nb, log_prob)`: `nb` is `train_vector` multiplied element-wise by
        `log_prob`, and `log_prob` is the per-feature log-ratio, which the caller
        needs again to scale other data the same way.
    """
    positive = probability(train_vector, 1, y)
    negative = probability(train_vector, 0, y)
    log_prob = np.log(positive / negative)
    return train_vector.multiply(log_prob), log_prob

In [14]:
# Single classifier instance; the training loop below calls fit() on it once per label.
# C is scikit-learn's inverse regularisation strength; max_iter caps the solver's iterations.
model = LogisticRegression(C=0.1, max_iter=2000)

In [15]:
# Start from the sample submission file; its label columns are overwritten below.
# The absolute input path matches the one used when loading the train and test files.
submission = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

# Fit one binary classifier per label on the Naive-Bayes-scaled features.
for label in labels:
    y = train_class[label].values
    nb, log_prob = NaiveBayes(train_vector, y)

    model.fit(nb, y)

    # `nb` already is train_vector scaled by log_prob, so reuse it instead of recomputing.
    y_pred = model.predict(nb)
    print('Training accuracy for {} is {}'.format(label, accuracy_score(y, y_pred)))

    # Test features are scaled with this label's log-ratios before scoring;
    # column 1 of predict_proba is the probability of the positive class.
    test_y_prob = model.predict_proba(test_vector.multiply(log_prob))[:, 1]
    submission[label] = test_y_prob

Training accuracy for toxic is 0.9620106410312651
Training accuracy for severe_toxic is 0.9923733009130732
Training accuracy for obscene is 0.981262259433105
Training accuracy for threat is 0.9976875497427478
Training accuracy for insult is 0.9750518577937094
Training accuracy for identity_hate is 0.9935577266545926


In [16]:
# Preview the first rows of the filled-in submission frame.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999439,0.128199,0.99915,0.010256,0.96973,0.118215
1,0000247867823ef7,0.025719,0.003005,0.010681,0.001024,0.012646,0.003113
2,00013b17ad220c46,0.026739,0.003238,0.010651,0.001034,0.010653,0.002692
3,00017563c3f7919a,0.00781,0.001782,0.005738,0.001415,0.006651,0.001993
4,00017695ad8997eb,0.032932,0.002264,0.010005,0.000965,0.011337,0.002083


In [17]:
# Write the predictions to submission.csv in the current directory, without the index column.
submission.to_csv('submission.csv',index=False)