In [10]:

import numpy as np
import keras

import os
import json

INPUT_DIR = './'


Using TensorFlow backend.


# 数据处理

In [2]:
# Load the JSON-lines dataset (one JSON object per line) and split the
# records into two lists according to their label.
TrollsData = [] # records whose label is 1
NonTrollsData = [] # records whose label is 0
# `with` closes the file even when a line fails to parse; the explicit
# encoding keeps the result independent of the platform default.
with open(INPUT_DIR+'Dataset for Detection of Cyber-Trolls.json', encoding='utf-8') as f:
    for i in f:
        # A blank line (e.g. a trailing newline) is not a record; skip it
        # instead of letting json.loads raise.
        if not i.strip():
            continue
        temp = json.loads(i)
        content = temp['content']
        # The label is stored as the first element of annotation.label.
        label = int(temp['annotation']['label'][0])
        Data = {
            "content":content,
            "annotation":label
        }
        if label == 0:
            NonTrollsData.append(Data)
        else:
            TrollsData.append(Data)
print('数据总数：%d, 欺凌数据个数：%d, 友好数据个数：%d' % (len(TrollsData)+len(NonTrollsData),len(TrollsData),len(NonTrollsData)))

数据总数：20001, 欺凌数据个数：7822, 友好数据个数：12179


可以看到数据不是很均衡，因此我们对欺凌数据进行过采样：
先复制一份欺凌样本，再给每条副本随机拼接两个单词，从而生成一份新的样本。

In [3]:
import random
import copy

# Oversample the troll class: take a shuffled deep copy of the troll records
# and lengthen every copy with two randomly drawn words.
new_temp_TrollsData = copy.deepcopy(TrollsData)
random.shuffle(new_temp_TrollsData)

for i, augmented in enumerate(new_temp_TrollsData):
    # NOTE(review): the words are drawn from TrollsData[i] (original order),
    # while `augmented` is position i of the shuffled copy, so the two are
    # usually different records -- confirm this cross-record mix is intended.
    content = TrollsData[i]['content'].split(' ')
    content_len = len(content)
    for draw in range(2):
        r = random.randint(0, content_len - 1)
        augmented['content'] += ' ' + content[r]

# Extend the original list in place with the augmented copies.
TrollsData += new_temp_TrollsData
print('现在欺凌数据个数:%d'%len(TrollsData))

现在欺凌数据个数:15644


欺凌数据的个数现在又超过了友好数据的个数，我们把它裁剪到和友好数据一样多。

In [4]:
# Trim the oversampled troll records so both classes end up the same size.
# The target size is taken from NonTrollsData rather than hard-coded, so the
# cell stays correct if the dataset changes.
TrollsData = TrollsData[:len(NonTrollsData)]
print('数据总数：%d, 欺凌数据个数：%d, 友好数据个数：%d' % (len(TrollsData)+len(NonTrollsData),len(TrollsData),len(NonTrollsData)))

数据总数：24358, 欺凌数据个数：12179, 友好数据个数：12179


把两类数据整合到一起，然后打乱顺序。

In [5]:
# Merge both classes into one new list, randomise its order in place and
# show the first four records together with the total count.
Data = [*TrollsData, *NonTrollsData]
random.shuffle(Data)
print(Data[0:4], len(Data))

[{'content': 'lmfaoooooo. JEALOUS bitches make me sick!(; so nicoo  ahaah what th fxck is up! :D', 'annotation': 0}, {'content': 'is officially too gay to function! be owners', 'annotation': 1}, {'content': 'the nicest thing someone has ever done for would probaly be  when sam my sister just told me for no reason that i was a really good person and stuff', 'annotation': 0}, {'content': '" Fuck Sarah Palin  the political version on Sanjaya".....perfection.', 'annotation': 0}] 24358


In [None]:
# Shuffle the combined records in place once more (the cell that builds Data
# already calls random.shuffle on it).
random.shuffle(Data)

接下来创建词表

In [None]:
import re
# Count how often every token occurs across all records.
WordFre = {} # word -> number of occurrences
for d in Data:
    content = keras.preprocessing.text.text_to_word_sequence(d['content'])
    for c in content:
        if c == '':
            continue
        word = c.lower()
        # dict.get with a default replaces the `== None` check-then-initialise
        # pattern; the resulting counts are the same.
        WordFre[word] = WordFre.get(word, 0) + 1

In [None]:
# Build the word -> index table from the frequency counts.
threshold = 10 # words seen fewer than this many times are left out
WordIdx = {}
indx = 0
# Most frequent words first, so they receive the smallest indices (from 1).
for key, fre in sorted(WordFre.items(),key=lambda x:x[1],reverse=True):
    if fre < threshold:
        # The items are sorted by descending frequency, so every remaining
        # word is below the threshold as well.
        break
    indx += 1
    WordIdx[key] = indx
# Special tokens take the two indices right after the last real word.
# Fix: the original line ended with a trailing comma, which stored the
# 1-tuple (indx+1,) instead of an int.
WordIdx['_PAD_'] = indx+1
WordIdx['_UNK_'] = indx+2


In [None]:
# Number of entries in the vocabulary table, special tokens included.
WordIdxLen = len(WordIdx)

In [9]:
# Display the index assigned to the unknown-word token.
WordIdx['_UNK_']

2631

In [None]:
# Persist the vocabulary so it can be reloaded at prediction time.
# The `with` block closes (and therefore flushes) the file deterministically;
# the original passed an unclosed handle straight to json.dump.
with open('word.json','w') as vocab_file:
    json.dump(WordIdx,vocab_file)

# 开始构建模型

In [25]:
from keras.models import Sequential
from keras.layers import Dense, Activation,Embedding,Bidirectional,GRU,Flatten,Dropout
from keras import regularizers

# Binary classifier: embedding -> bidirectional GRU -> two regularised dense
# layers -> single sigmoid unit.
# NOTE(review): mask_zero=True concerns index 0, whereas the padding cell
# pads with WordIdx['_PAD_'] -- confirm whether padding is meant to be masked.
model = Sequential([
    Embedding(input_dim=WordIdxLen+1, output_dim=50, mask_zero=True),
    Bidirectional(GRU(64, return_sequences=False, dropout=0.4)),
    Dense(units=32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.2, noise_shape=None, seed=None),
    Dense(units=16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(units=1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
model.summary()

# Show the model's input shape.
print(model.input_shape)
# Run two dummy index sequences through the model and show the output shape.
sample_batch = np.asarray([
    [1,2,3,4,5,6,7],
    [2,3,4,5,6,7,8],
])
model.predict(sample_batch).shape

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          131600    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               44160     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 180,433
Trainable params: 180,433
Non-trainable params: 0
_________________________________________________________________
(Non

(2, 1)

# 开始fit数据

In [None]:
# Tokenize every record's content with text_to_word_sequence and record the
# length of the longest token list.
max_str_len = 0
for d in Data:
    # The content is replaced in place, so only raw strings are tokenized:
    # re-running this cell then leaves already-tokenized records untouched
    # instead of failing on a list.
    if isinstance(d['content'], str):
        d['content'] = keras.preprocessing.text.text_to_word_sequence(d['content'])
    max_str_len = max(max_str_len, len(d['content']))


In [20]:
[WordIdx.get(i,WordIdx['_UNK_']) for i in Data[10]['content']] # inspect one example of the token -> index conversion

[27, 79, 10, 8, 6, 2, 29]

In [None]:
# Turn every tokenized record into an index vector padded to max_str_len.
# The special-token lookups do not change per record, so they are done once.
pad = WordIdx.get('_PAD_')
unk = WordIdx['_UNK_']
# Words missing from the vocabulary map to the unknown-word index.
vectors = [[WordIdx.get(i, unk) for i in d['content']] for d in Data]
TrainLabel = [d['annotation'] for d in Data]
# One pad_sequences call for the whole dataset instead of one call per
# record; list() keeps TrainData a list of per-record arrays as before.
TrainData = list(keras.preprocessing.sequence.pad_sequences(sequences=vectors,maxlen=max_str_len,value=pad))


In [None]:
TrainData[:1],TrainLabel[:10] # inspect the padding result

In [None]:
# Split into training and test sets: the first SPILT fraction of the records
# is used for training, the remainder for testing.
SPILT = 0.9
trainNum = int(len(TrainData)*SPILT)
finalTrainData, finalTestData = TrainData[:trainNum], TrainData[trainNum:]
finalTrainLable, finalTestLabel = TrainLabel[:trainNum], TrainLabel[trainNum:]

In [None]:
# Train the model on the training split; validation_split=0.05 is passed so
# that part of the training data is used for validation.
model.fit(np.asarray(finalTrainData),np.asarray(finalTrainLable),batch_size=500, epochs=100,validation_split=0.05)

In [None]:
model.save('model.h5') # training finished; save the model

# 模型测试

In [None]:
model2 = keras.models.load_model('model.h5') # load the saved model

In [None]:
model2.evaluate(np.asarray(finalTestData),np.asarray(finalTestLabel)) # evaluate the model on the test split

In [13]:
# Try the reloaded model on a custom sentence.
# The vocabulary is read back from disk inside a `with` block so the file
# handle is closed; the original left the handle from open() unclosed.
with open('word.json') as vocab_file:
    WordIdx = json.load(vocab_file)
s = '''I love you'''
s_q = keras.preprocessing.text.text_to_word_sequence(s)
# Words missing from the vocabulary map to the unknown-word index.
s_v = [WordIdx.get(i,WordIdx['_UNK_']) for i in s_q]
print(s_v)
model2.predict(np.asarray([s_v]))

[1, 65, 2]


array([[0.03189052]], dtype=float32)