In [60]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [61]:
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
COMMENT = 'comment_text'

In [62]:
train = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')
test = pd.read_csv('jigsaw-toxic-comment-classification-challenge/test.csv')
submission = pd.read_csv('jigsaw-toxic-comment-classification-challenge/sample_submission.csv')

In [4]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()
vec = TfidfVectorizer(max_features=16384)

In [5]:
x_train = vec.fit_transform(train[COMMENT])
x_test = vec.transform(test[COMMENT])

In [6]:
models = {l: LogisticRegression() for l in LABELS}
[models[l].fit(x_train, train[l]) for l in LABELS]
for l in LABELS:
    submission[l] = models[l].predict_proba(x_test)[:, 1]
submission.to_csv('1.submission.csv', index=False)

# ~ 97.42%



In [7]:
for l in LABELS:
    submission[l] = models[l].predict_proba(x_test)[:, 1]
submission.loc[:, 'severe_toxic'][submission['toxic'] < 0.2] = 0
submission.to_csv('2.submission.csv', index=False)
# ~ 97.46%

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2


In [9]:
keras_input = keras.Input(shape=[x_test.shape[1]])
l = layers.Dense(1024, activation='relu', kernel_regularizer=l2(1e-5))(keras_input)
l = layers.Dense(512, activation='relu', kernel_regularizer=l2(1e-5))(l)
l = layers.Dense(128, activation='relu', kernel_regularizer=l2(1e-5))(l)
keras_output = layers.Dense(6, activation='sigmoid')(l)
model = keras.Model(keras_input, keras_output)
model.compile(keras.optimizers.Adam(3e-4), loss='binary_crossentropy', metrics=['acc'])

In [10]:
model.fit(x_train.todense(), 
          train[LABELS], 
          validation_split=0.05,
          epochs=10, 
          batch_size=128,
          shuffle=True, 
          callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, verbose=1), ])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 00003: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f61cc5306a0>

In [12]:
y_pred = model.predict(x_test.todense())

In [13]:
for i, l in enumerate(LABELS):
    submission[l] = y_pred[:, i]
submission.to_csv('3.submission.csv', index=False)
# ~ 97.37%

In [16]:
from sklearn.ensemble import RandomForestRegressor
models = {l: RandomForestRegressor(n_estimators=1, n_jobs=-1, max_depth=100) for l in LABELS}

for l in LABELS:
    print(l)
    models[l].fit(x_train, train[l])
    submission[l] = models[l].predict(x_test)
submission.to_csv('4.submission.csv', index=False)

# ~ 58.21% (might dure to max_depth and n_estimators limit for training time constrain)

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [None]:
models = {l: LogisticRegression(class_weight='balanced') for l in LABELS}
for l in LABELS:
    models[l].fit(x_train, train[l])
    submission[l] = models[l].predict_proba(x_test)[:, 1]
submission.to_csv('5.submission.csv', index=False)

# ~ 97.49%

In [58]:
model = sklearn.ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=10)
model.fit(x_train, train[LABELS])
y_pred = model.predict_proba(x_test)[0][:, 1]
for i, l in enumerate(LABELS):
    submission[l] = y_pred
submission.to_csv('6.submission.csv', index=False)
# ~ 94.15%

In [53]:
model = sklearn.ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=1000)
model.fit(x_train, train[LABELS])
y_pred = model.predict_proba(x_test)[0][:, 1]
for i, l in enumerate(LABELS):
    submission[l] = y_pred
submission.to_csv('7.submission.csv', index=False)
# ~ 95.66%