# Capestone Project Solution2 Testing
## / Toxic Comment Classification /

- - -
<ul>
<li><a href="#prepare">I 环境准备</a></li>
<li><a href="#wrangling">II 向量化</a></li>
<li><a href="#keras">III Keras Testing</a></li>
<li><a href="#vec">IV Keras Vec Testing</a></li>
<li><a href="#cnn">V CNN</a></li>
<li><a href="#conclusions">VI 结论</a></li>
</ul>

<a id='intro'></a>

<center><a id='prepare'>I 环境准备</a></center>

In [26]:
# prpare env 

# 用这个框对你计划使用的所有数据包进行设置
# 导入语句
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 设置参数显示长文本
pd.options.display.max_colwidth = 500

# 行内显示
%matplotlib inline

# 机器学习库
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [27]:
# import files

test = pd.read_csv('test.csv')
## found utf8 content
## -1 是可能的选择，0为非攻击性语言

test_labels = pd.read_csv('test_labels.csv')

train = pd.read_csv('train.csv')
## 1 是标记为恶毒的分类

In [28]:
# check files

test.head(1)
## 注意第6行是良好评论数据

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"


In [29]:
test_labels.head(1)
## 注意在test_labels中，提示了良好评论的分类（全为0）

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1


In [30]:
train.head(1)
## 注意第7行是train的目标数据处理结果
## 将涉及到的负面类型通过1来标记

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0


<center><a id='wrangling'>II 向量化</a></center>

In [31]:
# vectorize

## set classes
class_list = list(train.columns[2:])
class_list

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [32]:
# get comment
train_comment = train.comment_text
test_comment = test.comment_text

In [33]:
train_comment[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [34]:
train_comment.shape

(159571,)

<center><a id='eda'>IV Keras Test</a></center>

In [35]:
# import
from sklearn.datasets import load_files       
from keras.utils import np_utils
from keras.layers import Embedding
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer

import numpy as np
from glob import glob

In [36]:
# set embedding
input_dim = 200
output_dim = 32
input_length = 50

embedding = Embedding(input_dim, output_dim, input_length=input_length)

In [37]:
# build cnn
model = Sequential()

In [38]:
# ---testing---

# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = np.array([1,1,1,1,1,0,0,0,0,0])
## label 是正面评价的单词
## 来源：https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [39]:
# integer encode the documents
vocab_size = 50
## 设定50 防止 one_hot hash冲突（50比实际的词要多）
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

[[23, 16], [38, 39], [48, 12], [16, 39], [34], [32], [41, 12], [42, 38], [41, 39], [7, 14, 16, 2]]


In [40]:
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
## 将每个输入都扩展为4个单词，没有的用零补充

[[23 16  0  0]
 [38 39  0  0]
 [48 12  0  0]
 [16 39  0  0]
 [34  0  0  0]
 [32  0  0  0]
 [41 12  0  0]
 [42 38  0  0]
 [41 39  0  0]
 [ 7 14 16  2]]


In [41]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
# vocab_size 对应的是 Embedding 的层数，input_length为4，维度为8
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_4 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.999998


<center><a id='vec'>IV Keras Vec Testing</a></center>

In [43]:
# 向量化词

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [44]:
# 创建 embedding 层

## 导入pre-training
embeddings_index = dict()
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

## 创建嵌入矩阵
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [45]:
# 创建模型

model = Sequential()
## 设置embedding层
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
## compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
## summarize the model
print(model.summary())
## fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
## evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 4, 50)             750       
_________________________________________________________________
flatten_5 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 201       
Total params: 951
Trainable params: 201
Non-trainable params: 750
_________________________________________________________________
None
Accuracy: 89.999998


In [46]:
# 50维的不是很理想，换300维的试试
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [47]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# define model
model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 4, 300)            4500      
_________________________________________________________________
flatten_6 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 1201      
Total params: 5,701
Trainable params: 1,201
Non-trainable params: 4,500
_________________________________________________________________
None
Accuracy: 100.000000


In [None]:
<center><a id='cnn'>V Glove Testing</a></center>

In [48]:
## 检查输入
train_comment.shape

(159571,)

In [49]:
## 检查输入
test_comment.shape

(153164,)

In [61]:
# 使用上述方法完成 solution2 使用 6B 300D 的预训练数据

# 向量化词

## prepare tokenizer
t = Tokenizer()
t.fit_on_texts(train_comment)
## 这里是对输入句子进行拆词
## 在这个数据里，输入了1000句（train_short)，结果是10007个
## 如果输出的话在1000之后会有个...
## 全部输入的话是21万多
vocab_size = len(t.word_index) + 1
## 根据上面设置 vocab_size

## integer encode the documents
encoded_train = t.texts_to_sequences(train_comment)

## pad documents to a max length of 100 words
max_length = 50
## max_length 就是每个commet要处理的单词数
padded_training = pad_sequences(encoded_train, maxlen=max_length, padding='post')

In [62]:
print(padded_training[99])
## 可以看出来，100个词的 padded 数据，没有的用0填充了

[   40    33    42   173   282   145    89    26    22     6    96     5
   672   884    16   528     2    33    57     4    18    57  2292     8
    39    73   371     4     6   361     2    16   101  1463    21   567
    37     6   292    18     5   672    50    96    48    11  1018   439
  3246 11844]


In [63]:
len(t.word_index)
## 注意这里是先将词统计完的计数
## 与 max_length 无关，因为 pad_sequences 是后发生的

210337

In [64]:
# 创建 embedding 层
# （50D）

## 导入pre-training
embeddings_index = dict( )
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

## 创建嵌入矩阵
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [65]:
embedding_matrix.shape

(210338, 50)

In [72]:
# prepare label
train.head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0


In [71]:
# 创建模型(basic)
# 注意本代码框不再执行，eporch50时间较长

model = Sequential()
## 设置embedding层
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=50, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
## compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
## summarize the model
print(model.summary())
## fit the model
model.fit(padded_training, train.toxic, epochs=50, verbose=0)
## evaluate the model
loss, accuracy = model.evaluate(padded_training, train.toxic, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 50, 50)            10516900  
_________________________________________________________________
flatten_13 (Flatten)         (None, 2500)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 2501      
Total params: 10,519,401
Trainable params: 2,501
Non-trainable params: 10,516,900
_________________________________________________________________
None
Accuracy: 93.387270


In [75]:
model.fit

<bound method Model.fit of <keras.engine.sequential.Sequential object at 0x1a55c0bbe0>>

In [76]:
# 因为使用的是 Transder Learning，Epoch设置为1测试
# 变化不大

model = Sequential()
## 设置embedding层
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=50, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
## compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
## summarize the model
print(model.summary())
## fit the model
model.fit(padded_training, train.toxic, epochs=1, verbose=0)
## evaluate the model
loss, accuracy = model.evaluate(padded_training, train.toxic, verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 50, 50)            10516900  
_________________________________________________________________
flatten_14 (Flatten)         (None, 2500)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 2501      
Total params: 10,519,401
Trainable params: 2,501
Non-trainable params: 10,516,900
_________________________________________________________________
None
Accuracy: 93.144744


<center><a id='conclusions'>VI 结论</a></center>

In [73]:
# output submission
filename = 'submission_s2_1.csv'
submission.to_csv(filename, index=False)
print('Complete: output file saved as {}'.format(filename))

NameError: name 'submission' is not defined

> 主要参考资料：
1. [项目建议中的LR + 词袋模式](https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams)
2. [Cross-validation Performance](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

> 小结：
1. Solution1 为 LR + CBOW 的方式进行多分类计算
2. 输出结果是每个分类的可能性[0,1]

> Kaggle Score:
1. 0.97576