# _ prepare colab environment _

In [31]:
# check running acceleration (GPU / TPU)

## import
import os
import pprint
import tensorflow as tf

## check gpu
print('---gpu check---')
print(tf.test.gpu_device_name())
### '' (empty string) means no gpu
### '/device:GPU:0' means a gpu is in use

## check tpu
print('\n---tpu check---')
### The TPU endpoint is read from the COLAB_TPU_ADDR environment variable;
### when the variable is missing the runtime is treated as having no TPU.
### Only that lookup decides the "disabled" branch, so a KeyError raised
### while talking to the TPU is no longer misreported as "tpu disabled".
colab_tpu_addr = os.environ.get('COLAB_TPU_ADDR')
if colab_tpu_addr is None:
    print('/tpu:disabled')
else:
    tpu_address = 'grpc://' + colab_tpu_addr
    print ('TPU address is', tpu_address)

    ### open a session against the TPU endpoint just to list its devices
    with tf.Session(tpu_address) as session:
        devices = session.list_devices()
        print('TPU devices:')
        pprint.pprint(devices)

print('\n_check complete_')

# mount google drive

## install drive lib
## NOTE(review): PyDrive is not referenced anywhere else in the visible notebook and the
## mount below goes through google.colab.drive — confirm whether this install is still needed.
!pip install -U -q PyDrive
## mount drive
## the project and embedding folders used by later cells live under /content/drive/
from google.colab import drive
drive.mount('/content/drive/')

---gpu check---
/device:GPU:0

---tpu check---
/tpu:disabled

_check complete_
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [32]:
# set working dir

## import
import os

## set project working folder
### tfolder: project folder (train/test csv, saved models, submissions)
### pfolder: folder holding the pre-trained embedding files
tfolder = '/content/drive/My Drive/ml/project/toxic/'
pfolder = '/content/drive/My Drive/ml/nlppre/'

## change dir
### later cells read and write several files by bare file name, so the
### project folder is made the current working directory
os.chdir(tfolder)

## check position
print('\n---position---')
!pwd

## check file
print('\n---folder files---')
!ls -al


---position---
/content/drive/My Drive/ml/project/toxic

---folder files---
total 1768136
-rw------- 1 root root  21693763 Apr 29 03:32 s1_sub.csv
-rw------- 1 root root 656423264 May 10 21:37 s2_1_best.h5
-rw------- 1 root root      8071 May 10 21:42 s2_1_model.yaml
-rw------- 1 root root  21569176 May 10 21:43 s2_1_sub.csv
-rw------- 1 root root 656423784 May 10 23:21 s2_2_best.h5
-rw------- 1 root root      8110 May 10 23:27 s2_2_model.yaml
-rw------- 1 root root  17594737 May 10 23:27 s2_2_sub.csv
-rw------- 1 root root      4525 Apr 29 08:25 s2_model.yaml
-rw------- 1 root root  21985027 Apr 29 08:25 s2_sub.csv
-rw------- 1 root root 257006320 May 12 08:02 s3_best.h5
-rw------- 1 root root      4619 May 12 08:15 s3_model.yaml
-rw------- 1 root root  22408304 May 12 08:30 s3_sub.csv
-rw------- 1 root root   6279782 Apr 24 06:16 sample_submission.csv
-rw------- 1 root root  60354593 Apr 24 06:16 test.csv
-rw------- 1 root root  68802655 Apr 24 06:17 train.csv


In [33]:
# switch to the pre-trained embedding folder
# NOTE(review): this cell was titled "download fasttext", but nothing is downloaded here —
# it only changes the working directory to pfolder and prints it.

os.chdir(pfolder)
print('\n---position---')
!pwd


---position---
/content/drive/My Drive/ml/nlppre


In [34]:
# prepare prefiles(if need)

## change dir
os.chdir(pfolder)

## unzip file if need
# !unzip -qq glove.6B.zip

## check position
print('\n---position---')
!pwd

## check file
print('\n---folder files---')
!ls -al

## change dir
os.chdir(tfolder)

## check position
print('\n---position---')
!pwd

## check file
print('\n---folder files---')
!ls -al


---position---
/content/drive/My Drive/ml/nlppre

---folder files---
total 6527539
-rw------- 1 root root 1037962819 Apr 28 03:27 glove.6B.300d.txt
-rw------- 1 root root 5646236541 Oct 24  2015 glove.840B.300d.txt

---position---
/content/drive/My Drive/ml/project/toxic

---folder files---
total 1768136
-rw------- 1 root root  21693763 Apr 29 03:32 s1_sub.csv
-rw------- 1 root root 656423264 May 10 21:37 s2_1_best.h5
-rw------- 1 root root      8071 May 10 21:42 s2_1_model.yaml
-rw------- 1 root root  21569176 May 10 21:43 s2_1_sub.csv
-rw------- 1 root root 656423784 May 10 23:21 s2_2_best.h5
-rw------- 1 root root      8110 May 10 23:27 s2_2_model.yaml
-rw------- 1 root root  17594737 May 10 23:27 s2_2_sub.csv
-rw------- 1 root root      4525 Apr 29 08:25 s2_model.yaml
-rw------- 1 root root  21985027 Apr 29 08:25 s2_sub.csv
-rw------- 1 root root 257006320 May 12 08:02 s3_best.h5
-rw------- 1 root root      4619 May 12 08:15 s3_model.yaml
-rw------- 1 root root  22408304 May 12 08

In [35]:
# check env

## keras
## print the installed keras version so the run can be reproduced later
import keras
print('\n---keras version: {}---'.format(keras.__version__))


---keras version: 2.2.4---


# _ solution brief _

- project - Toxic Comment Classification
- data - kaggle
- score - 0.96286 (kaggle private)
- solution3 - glove840b+LSTM
- solution brief 
    - glove840b
    - LSTM
- finding - 基本是最好得分了（又一个更好一点）改进思路：
    - embedding 使用 fasttext
    - model 使用 lstm
    - solution2 关闭

# I - prepare data

In [0]:
# import base libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint as p

# import machine-learning libraries
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_files

# import deep-learning libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential

from keras.layers import Input, Embedding, Flatten,Dense, Dropout, SpatialDropout1D, BatchNormalization
from keras.layers import MaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.layers import Conv1D, LSTM, Bidirectional, GRU

from keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from keras.models import model_from_json

# widen the column display so long comment text is shown instead of being truncated
pd.options.display.max_colwidth = 500

# render matplotlib figures inline in the notebook
%matplotlib inline

In [0]:
# import files
## Both csv files live in the project folder (tfolder).
## os.path.join keeps the path valid whether or not tfolder ends with a slash;
## the stray bare `filepath` expressions (no effect mid-cell) were removed.

## read test
file = 'test.csv'
filepath = os.path.join(tfolder, file)
test = pd.read_csv(filepath)

## read train
file = 'train.csv'
filepath = os.path.join(tfolder, file)
train = pd.read_csv(filepath)

In [38]:
# check files

test.head(1)
## Row 6 is a benign (non-toxic) comment.
## In the official test_labels file, -1 is a possible value; 0 marks non-offensive language.

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"


In [39]:
train.head(1)
## Note: row 7 shows what the processed target data of train looks like.
## Every negative category that applies to a comment is marked with 1.

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0


# II - word embedding

In [0]:
# get comment
## pull the raw comment text column out of the train and test frames
train_comment = train['comment_text']
test_comment = test['comment_text']

In [41]:
# get label
## Select the six target columns by name instead of by position (iloc[:, 2:]),
## so the labels stay correct even if the csv gains or reorders columns.
## These are the same six columns, in the same order, that the submission is written with.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_label = train[label_columns]
train_label.head(1)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0


In [42]:
# check data
## for train and then test: pretty-print the first comment and show the series shape
for comments in (train_comment, test_comment):
    p(comments[0])
    print(comments.shape)

## label matrix shape (rows, number of target columns)
print(train_label.shape)

('Explanation\n'
 'Why the edits made under my username Hardcore Metallica Fan were reverted? '
 "They weren't vandalisms, just closure on some GAs after I voted at New York "
 "Dolls FAC. And please don't remove the template from the talk page since I'm "
 'retired now.89.205.38.27')
(159571,)
("Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and "
 'hating you sad mofuckas...i should bitch slap ur pethedic white faces and '
 'get you to kiss my ass you guys sicken me. Ja rule is about pride in da '
 'music man. dont diss that shit on him. and nothin is wrong bein like tupac '
 'he was a brother too...fuckin white boys get things right next time.,')
(153164,)
(159571, 6)


In [0]:
# --- 6B 300D ---
# In testing, 300D vectors scored clearly better than 50D,
# which also shows that picking a suitable dimension matters a lot.

# vectorize the words
### para
max_feature = 30000
### max_length is the number of tokens kept per comment
max_length = 300

## prepare tokenizer
## https://keras.io/preprocessing/text/
## stripping odd characters and lowercasing raises the score a little
t = Tokenizer(
    num_words=max_feature,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)

## split the training comments into words and build the word index
## - feeding 1000 sentences (train_short in the test file) gave 10007 words
## - when printed, the listing is elided with "..." after 1000 entries
## - feeding the full training set gives a bit over 210k words
## - num_words can be used to cap the vocabulary (it did not appear to take
##   effect here; to be investigated)
## NOTE(review): presumably num_words only limits the indices produced by
## texts_to_sequences and never shrinks word_index — confirm against the Keras docs.
t.fit_on_texts(train_comment)

## vocab_size follows the full word index built above (+1 for the padding index 0)
vocab_size = len(t.word_index) + 1

## integer encode the documents (train first, then test)
encoded_train, encoded_test = (
    t.texts_to_sequences(texts) for texts in (train_comment, test_comment)
)

## pad documents to a max length of 300 words, zeros appended at the end
padded_train, padded_test = (
    pad_sequences(encoded, maxlen=max_length, padding='post')
    for encoded in (encoded_train, encoded_test)
)

In [44]:
# check padded
## for each padded matrix: show its shape and pretty-print one sample row
for padded, row in ((padded_train, 99), (padded_test, 100)):
    print(padded.shape)
    p(padded[row])
## the output shows that positions beyond a comment's own words are filled with 0

(159571, 300)
array([    6,    40,    33,    42,   173,   282,   145,    89,    26,
          22,     6,    96,     5,   672,   884,    16,   528,     2,
          33,    57,     4,    18,    57,  2292,     8,    39,    73,
         371,     4,     6,   361,     2,    16,   101,  1463,    21,
         567,    37,     6,   292,    18,     5,   672,    50,    96,
          48,    11,  1018,   439,  3246, 11844,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [45]:
# check word list
len(t.word_index)
## Note: this is the number of distinct words counted in the fitted texts.
## It is unrelated to max_length, because pad_sequences runs afterwards.

210337

# III - modeling s3

In [46]:
# build the embedding layer weights (300D)
## For the 840B file the vectors must be streamed with `with open`
## (an earlier version of this code raised an error on the large file).


def load_glove_vectors(path, dim):
    """Read a GloVe text file into a ``{token: vector}`` dict.

    Each line is expected to be ``token v1 ... v_dim`` separated by single
    spaces. The line is split from the right at most ``dim`` times, so the
    last ``dim`` fields are always the coefficients and everything before
    them is the token, even when the token itself contains a space.

    Args:
        path: path of the GloVe text file (read as utf8).
        dim: number of coefficients per vector.

    Returns:
        dict mapping each token to a float32 numpy array of length ``dim``.
        Lines that do not have a token plus exactly ``dim`` fields are skipped.
    """
    vectors = {}
    with open(path, encoding='utf8') as f:
        for line in f:
            values = line.rstrip().rsplit(' ', dim)
            if len(values) != dim + 1:
                # blank or truncated line: nothing usable to store
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


## initialise parameters
### para
embed_demention = 300

## read the file
file = 'glove.840B.300d.txt'
filepath = os.path.join(pfolder, file)

embeddings_index = load_glove_vectors(filepath, embed_demention)
print('Loaded %s word vectors.' % len(embeddings_index))

## create the embedding matrix: row i holds the vector of the word with index i;
## words missing from GloVe (and the padding row 0) stay all-zero
embedding_matrix = np.zeros((vocab_size, embed_demention))

## note: the vector length must equal the matrix width (300)
for word, i in t.word_index.items():
    if i >= vocab_size:
        # defensive: never write past the matrix if vocab_size is ever capped
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 2196016 word vectors.


In [47]:
# check embedding_matrix
embedding_matrix.shape

(210338, 300)

In [0]:
# split train and val
## hold out 10% of the padded training data for validation;
## shuffling uses a fixed seed so the split is repeatable

x_train, x_val, y_train, y_val = train_test_split(
    padded_train,
    train_label,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [90]:
# model 3.2
## update from 3.1
## drop regulation
## NOTE(review): the header used to say "keep only one cnn", but the stack below
## contains three Conv1D + MaxPool1D stages — confirm which one is intended.

### para
cnnfilter=128
grufilter=256
kernel=3

# model
model = Sequential()

## embedding layer
### frozen pre-trained vectors; the width and sequence length reuse the values
### defined with the embedding matrix and the padding step (both 300)
e = Embedding(vocab_size, embed_demention, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)

## dropout
model.add(SpatialDropout1D(0.1))

## normalization
#model.add(BatchNormalization())

## layers
### bidirectional GRU that returns the full sequence so the convolutions can run over it
model.add(Bidirectional(GRU(grufilter, return_sequences=True,dropout=0.1,recurrent_dropout=0.1)))

### three Conv1D + MaxPool1D stages; the filter count doubles at each stage
### (cnnfilter, cnnfilter*2, cnnfilter*4)
for width_multiplier in (1, 2, 4):
    model.add(Conv1D(cnnfilter*width_multiplier, kernel, padding='same', activation='relu'))
    model.add(MaxPool1D())

model.add(Flatten())
model.add(Dense(6, activation='sigmoid'))
#### 6 target labels, so the output Dense layer has 6 units

## compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
## summarize the model
### summary() prints the table itself; wrapping it in print() only adds a stray "None"
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 300, 300)          63101400  
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 300, 300)          0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 300, 512)          855552    
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 300, 128)          196736    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 150, 128)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 150, 256)          98560     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 75, 256)           0         
__________

In [0]:
# set name
## run tag shared by every artifact written for this experiment
name = 's3_2'

## artifact file names, in order:
## model yaml, weights, submission csv, best checkpoint
mname, wname, sname, bname = (
    name + suffix
    for suffix in ('_model.yaml', '_weight.h5', '_sub.csv', '_best.h5')
)

In [0]:
# callback
## update2.1

from keras.callbacks import EarlyStopping, ModelCheckpoint

## stop training once val_loss has not improved for 2 epochs
stop_on_plateau = EarlyStopping(monitor='val_loss', patience=2)

## keep only the best model so far (lowest val_loss) in the file named by bname
save_best = ModelCheckpoint(filepath=bname, monitor='val_loss', save_best_only=True)

callbacks = [stop_on_plateau, save_best]

In [0]:
# checkpoint
# checkpoint = ModelCheckpoint(bname, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [80]:
# fit the model

### para
batch_size = 256
epochs = 20

## train against the held-out validation split; the callbacks list decides
## when to stop early and which checkpoint is kept
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
    verbose=2,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/20
 - 489s - loss: 0.0677 - acc: 0.9770 - val_loss: 0.0469 - val_acc: 0.9825
Epoch 2/20
 - 483s - loss: 0.0482 - acc: 0.9821 - val_loss: 0.0445 - val_acc: 0.9830
Epoch 3/20
 - 482s - loss: 0.0449 - acc: 0.9827 - val_loss: 0.0426 - val_acc: 0.9834
Epoch 4/20
 - 478s - loss: 0.0422 - acc: 0.9837 - val_loss: 0.0425 - val_acc: 0.9836
Epoch 5/20
 - 481s - loss: 0.0401 - acc: 0.9844 - val_loss: 0.0418 - val_acc: 0.9844
Epoch 6/20
 - 477s - loss: 0.0384 - acc: 0.9850 - val_loss: 0.0412 - val_acc: 0.9843
Epoch 7/20
 - 477s - loss: 0.0363 - acc: 0.9855 - val_loss: 0.0409 - val_acc: 0.9841
Epoch 8/20
 - 476s - loss: 0.0345 - acc: 0.9861 - val_loss: 0.0415 - val_acc: 0.9841
Epoch 9/20
 - 476s - loss: 0.0330 - acc: 0.9868 - val_loss: 0.0418 - val_acc: 0.9836
Epoch 10/20
 - 475s - loss: 0.0312 - acc: 0.9873 - val_loss: 0.0438 - val_acc: 0.9829
Epoch 11/20
 - 476s - loss: 0.0297 - acc: 0.9878 - val_loss: 0.0440 - val_acc: 0.9835


<keras.callbacks.History at 0x7f03d6cb72e8>

In [81]:
# predicting

### para
predict_size=256

## load best
### restore the checkpoint written by ModelCheckpoint before scoring the test set
model.load_weights(bname)

## predicting
print('Predicting....')
y_pred = model.predict(
    padded_test,
    batch_size=predict_size,
    verbose=1,
)

Predicting....


In [82]:
## fill the sample submission template with the predictions and save it under sname
target_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

submission = pd.read_csv('sample_submission.csv')
submission[target_columns] = y_pred
submission.to_csv(sname, index=False)

print('- submission saved as {}'.format(sname))

- submission saved as s3_2_sub.csv


In [0]:
## save model
### serialise the architecture to yaml first, then write it to the file named by mname
model_yaml = model.to_yaml()
with open(mname, "w") as yaml_file:
    yaml_file.write(
        model_yaml
    )

# IV - conclusion

- epoch 20 很快就开始抖动了，设定了 EarlyStopping
- 成绩不高应该是模型问题，单纯cnn的方式需要改进
- 发现增加层数（每层 filter*2）会提高一点成绩

# _ kaggle _

In [84]:
# set kaggle(run once)

## make dir
!mkdir ~/.kaggle

## copy token
!cp '/content/drive/My Drive/ml/kaggle.json' ~/.kaggle/

## check file
!ls ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists
kaggle.json


In [0]:
# search for competition(data the sameway)
#!kaggle competitions list -s toxic

# check competition score
#!kaggle competitions submissions -c jigsaw-toxic-comment-classification-challenge

In [86]:
# submit(once)
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f '/content/drive/My Drive/ml/project/toxic/s3_2_sub.csv' -m 'sub3:glove840B+BiGRU'

  0% 0.00/19.7M [00:00<?, ?B/s] 38% 7.38M/19.7M [00:00<00:00, 76.8MB/s] 50% 9.73M/19.7M [00:00<00:00, 45.0MB/s] 61% 12.0M/19.7M [00:00<00:00, 22.5MB/s]100% 19.7M/19.7M [00:02<00:00, 8.52MB/s]
Successfully submitted to Toxic Comment Classification Challenge

In [88]:
# check competition score
## list the submissions made to this competition together with their public/private scores
!kaggle competitions submissions -c jigsaw-toxic-comment-classification-challenge

fileName      date                 description                  status    publicScore  privateScore  
------------  -------------------  ---------------------------  --------  -----------  ------------  
s3_2_sub.csv  2019-05-12 12:52:01  sub3:glove840B+BiGRU         complete  0.97875      0.97887       
s3_1_sub.csv  2019-05-12 11:04:51  sub3:glove840B+BiGRU         complete  0.97918      0.97809       
s3_sub.csv    2019-05-12 08:30:41  sub3:glove840B+BiGRU         complete  0.97993      0.97828       
s2_2_sub.csv  2019-05-11 09:10:01  sub2_2:glove840B+CNN_update  complete  0.96766      0.96938       
s2_2_sub.csv  2019-05-10 23:28:39  sub2_2:glove840B+CNN_update  complete  0.96766      0.96938       
s2_2_sub.csv  2019-05-10 23:04:25  sub2_2:glove840B+CNN_update  complete  0.95929      0.96174       
s2_2_sub.csv  2019-05-10 22:47:04  sub2_2:glove840B+CNN_update  complete  0.96413      0.96396       
s2_1_sub.csv  2019-05-10 21:43:48  sub2_1:glove6B+CNN_update    complete  0.96291 

# _ resources _

> 主要参考资料：