数据集路径

In [1]:
TRAIN_QLW_PATH = './data/qlw-train.json'
TRAIN_RUMOUR_PATH = './data/train.json'
DEV_PATH = './data/dev.json'
TEST_PATH = './data/test-unlabelled.json'

In [2]:
import json
import os

def import_from_json(file_path):
    with open(file_path) as json_file:
        data_dict = json.load(json_file)
        data_text = []
        labels = []
        for key in data_dict.keys():
            data_text.append(data_dict[key]['text'].lower()) 
            try:
                labels.append(int(data_dict[key]['label']))  
            except KeyError:
                pass
        return data_text, labels

def import_from_qlw(file_path):
    with open(file_path) as json_file:
        data_dict = json.load(json_file)
        data_text = []
        labels = []
        for key in data_dict.keys():
            data_text.append(data_dict[key].lower()) 
            try:
                labels.append(0)
            except KeyError:
                pass
        return data_text, labels

2 导入数据

2.1 导入网上爬的不是谣言的训练集

In [3]:
train_nonrumour, train_nonrumour_label = import_from_qlw(TRAIN_QLW_PATH)
print('训练集长度为: ', len(train_nonrumour))
# print(train_nonrumour[0])

训练集长度为:  2786


2.2 导入老师给是谣言的训练集

In [4]:
train_rumour, train_rumour_label= import_from_json(TRAIN_RUMOUR_PATH)
print('训练集长度为: ', len(train_rumour))
# print(train_rumour[0])

训练集长度为:  1168


2.3 导入测试集

In [5]:
test, _ = import_from_json(TEST_PATH)
print('测试集长度为: ', len(test))
# print(test[0])

测试集长度为:  1410


2.4 导入开发集

In [6]:
dev, dev_label = import_from_json(DEV_PATH)
print('开发集长度为: ', len(dev))
# print(dev[0])

开发集长度为:  100


3. 预处理

3.1 去除符号 - Word2Vec

In [53]:
all_ = train_rumour + train_nonrumour + dev + test
print('长度:', len(all_))
puncs = ['“', '”', '‘', '’', '–', '—', '...', '‐', '\u200b', '.\u2009.\u2009.', '\uf0b7', '\uf020', '\u200e', '\u2066', 
               '\u2069', '..', '. .', ',', '.', '"', '\u2018', '\u00a0', '\u2019']
for index_ in range(len(all_)):
    for punc in puncs:
        all_[index_] = all_[index_].replace(punc,'')

label = train_rumour_label + train_nonrumour_label + dev_label
print('符号去除完成！')

长度: 5464
符号去除完成！


3.2 去除 stopword + 去除非文字内容 - Word2Vec

In [54]:
import copy
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from collections import defaultdict

print("处理中 ...")
tt = TweetTokenizer()
stopwords = set(stopwords.words('english')) 

data_w2v = []
for line in all_:
    line_split = tt.tokenize(line)
    line_tmp = copy.deepcopy(line_split)
    for word in line_split:
        if re.match('\w+', str(word), flags=0) is None:
            try:
                line_tmp.remove(word)
            except ValueError:
                print("Not Found ", word)
        for stopword in stopwords:
            if word == stopword:
                line_tmp.remove(word)
    data_w2v.append(line_tmp)
# print(data_w2v[0])
print('Done!')

处理中 ...
['houston', 'flooding', 'isnt', 'sign', 'climate', 'change', 'distinguished', 'us', 'climate', 'scientist', 'dr', 'roy', 'spencer', 'writes', 'context', 'climate', 'change', 'seeing', 'houston', 'new', 'level', 'disaster', 'becoming', 'common', 'flood', 'disaster', 'unfolding', 'houston', 'certainly', 'unusual', 'natural', 'weather', 'disasters', 'always', 'occurred', 'always', 'occurmajor', 'floods', 'difficult', 'compare', 'throughout', 'history', 'ways', 'alter', 'landscape', 'example', 'cities', 'like', 'houston', 'expand', 'years', 'soil', 'covered', 'roads', 'parking', 'lots', 'buidings', 'water', 'rapidly', 'draining', 'rather', 'soaking', 'soil', 'population', 'houston', 'ten', 'times', '1920s', 'houston', 'metroplex', 'expanded', 'greatly', 'water', 'drainage', 'basically', 'direction', 'downtown', 'houston']
Done!


3.3 预处理以及计算词频 - Word2Vec忽略这步

In [7]:
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemma_process(word_token):
    word_token_pos_tag = nltk.pos_tag([word_token])
    for word_token, pos_tag in word_token_pos_tag:
        word_token_lower = word_token.lower()
        wordnet_pos = get_wordnet_pos(pos_tag)
        if wordnet_pos == None:
            word_token_lemma = lemmatizer.lemmatize(word_token_lower, wordnet.NOUN)
        else:
            word_token_lemma = lemmatizer.lemmatize(word_token_lower, wordnet_pos)
    return word_token_lemma

def calculate_frequency(data):
    frequency_list = []
    for value in data:
        text = value.replace('\n', ' ')
        frequency_dict = dict()
        stopword_removal = []
        for word_token in tt.tokenize(text.lower()):
            word_token_lemma = lemma_process(word_token)
            if (word_token_lemma not in punc) and (word_token_lemma not in chinesepunc) and (word_token_lemma not in 
                stopword_list) and (word_token_lemma not in letter):
                stopword_removal.append(word_token_lemma)
        for stopword in stopword_removal:
            if stopword in frequency_dict:
                frequency_dict[stopword] += 1
            else:
                frequency_dict[stopword] = 1
        frequency_list.append(frequency_dict)
    return frequency_list # , word_list
    
tt = TweetTokenizer()

stopword_list = set(stopwords.words('english'))
punc = string.punctuation
chinesepunc = ['“', '”', '‘', '’', '–', '—', '...', '‐', '\u200b', '.\u2009.\u2009.', '\uf0b7', '\uf020', '\u200e', '\u2066', 
               '\u2069', '..', '. .']
letter = ['a', 'b', 'c', 'd', "e", "f", 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 
          'x', 'y', 'z'] 

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

preprocessed_rumour = calculate_frequency(train_rumour)
print('preprocessed_rumour_list! well done!')
preprocessed_nonrumour = calculate_frequency(train_nonrumour)
print('preprocessed_nonrumour_list! well done!')
preprocessed_dev = calculate_frequency(dev)
print('preprocessed_dev_list! well done!')
preprocessed_test = calculate_frequency(test)
print('preprocessed_test_list! well done!')

preprocessed_rumour_list! well done!
preprocessed_nonrumour_list! well done!
preprocessed_dev_list! well done!
preprocessed_test_list! well done!


3.4 向量化 - Word2Vec忽略这步

In [9]:
# get vector for train_data, development_data, train_label

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def get_data_vector(rumour_data, nonrumour_data, test_data):
    vectorizer = DictVectorizer()
    transformer = TfidfTransformer(smooth_idf=False, norm=None)
    total_train = []
    train_label = []
    for rumour in rumour_data:
        total_train.append(rumour)
        train_label.append(1)
    for nonrumour in nonrumour_data:
        total_train.append(nonrumour)
        train_label.append(0)
    train_metrix = vectorizer.fit_transform(total_train)
    test_metrix = vectorizer.transform(test_data)
    train_vector = transformer.fit_transform(train_metrix)
    test_vector = transformer.transform(test_metrix)
    return train_vector, test_vector, train_label

train, test, train_label = get_data_vector(preprocessed_rumour, preprocessed_nonrumour, preprocessed_test)
print(train.shape, test.shape, len(train_label))
print('DONE!')

(3954, 76993) (1410, 76993) 3954
DONE!


4 Word2Vec

4.1 Word2Vec 处理

In [32]:
from  gensim.models import Word2Vec

Word2Vec(sentences=None,  #sentences可以是分词列表，也可以是大语料
        size=200,#特征向量的维度
        alpha=0.025,#学习率
        window=5,#一个句子内，当前词和预测词之间的最大距离
        min_count=3,#最低词频
        max_vocab_size=None,#
        sample=0.001, #随机下采样的阈值
        seed=1,#随机数种子
        workers=4,#进程数
        min_alpha=0.0001,#学习率下降的最小值
        sg=0, #训练算法的选择，sg=1，采用skip-gram，sg=0，采用CBOW
        hs=0,# hs=1,采用hierarchica·softmax，hs=10,采用negative sampling
        negative=5,#这个值大于0，使用negative sampling去掉'noise words'的个数（通常设置5-20）；为0，不使用negative sampling
        cbow_mean=1,#为0，使用词向量的和，为1，使用均值；只适用于cbow的情况
        iter = 5,#迭代次数
        null_word = 0,
        trim_rule = None, #裁剪词汇规则，使用None（会使用最小min_count）
        sorted_vocab = 1,#对词汇降序排序
        batch_words = 10000,#训练时，每一批次的单词数量
        compute_loss = False,
        callbacks = ())

<gensim.models.word2vec.Word2Vec at 0x12ccc4130>

In [43]:
import logging
#打印日志
# logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',level=logging.INFO)
#设置参数
num_features = 500 #Word vector dimensionality
min_word_count = 1 #
num_workers = 4 #number of threads to run in parallel
context = 10
downsampling = 1e-3 #Downsample setting for frequent words
 
#初始化和训练模型
from gensim.models import word2vec
print('Training model...')
model = word2vec.Word2Vec(data_w2v,workers=num_workers,size=num_features,min_count= min_word_count,
                          window = context,sample = downsampling)
model.init_sims(replace=True)
model_name = '300features_40minwords_10context'
#保存模型，以便下次使用或者继续训练
model.save(model_name)
print(model)
print("DONE!")

Training model...
Word2Vec(vocab=133641, size=500, alpha=0.025)
DONE!


4.2 叠加特征

In [44]:
import numpy as np

print('Working ...')
result = []

for row in all_:
    tmp_vector = np.array([0.0]*num_features)
    for word in row:
        try:
            tmp_vector += np.array(model[word])
        except KeyError:
            pass
    result.append(tmp_vector)
print(result[0])
print('DONE!')

Working ...


  tmp_vector += np.array(model[word])


[-1.48810518e+01  1.58107160e+01 -3.65508498e+01  4.33990656e+00
  7.34604349e-01 -1.39037077e+01  1.88867703e+00  1.49692652e+01
  7.78252502e+00  1.24737095e+01 -8.57881704e+00 -3.39123804e+00
 -4.72216342e+00  4.50229942e+00 -2.10437179e+01 -8.88675542e+00
 -1.07999345e+01  1.15242196e+01  3.77096729e+00  1.15474597e+01
 -4.70896056e+00 -1.38330012e+00 -7.57880453e+00  2.93278698e+01
 -6.50483945e+00  1.01466720e+01 -2.12065998e+01  7.38376442e-01
 -2.78989696e+01 -1.51044969e+01 -2.13757751e+01 -2.12411297e+00
 -4.29281308e-01 -1.25552405e+01 -8.29708270e+00  1.62017031e+01
  1.77938296e+01 -9.80730609e+00 -2.20074433e+01  5.46868805e+00
 -8.53115940e+00 -7.51626196e+00 -6.71424147e+00 -6.33084273e+00
  1.17008817e+00  6.36960573e+00  4.02695874e-01 -5.18053748e+00
 -1.54149718e+01 -1.65095117e+01  1.29232609e+01 -3.17786559e-01
  1.30894293e+01 -4.12434935e+00 -4.14448552e+00 -1.18720288e+01
 -3.12328275e+01  2.58260377e+01 -1.39281683e+01 -8.92473867e+00
 -8.58979710e+00  2.09819

4.3 监测数据准确性

In [68]:
from sklearn import svm
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import numpy as np

train = data_w2v[:-(len(test)+100)]
train_label = label[:-100]
print('训练集和label的长度:', len(train), len(train_label))

dev = result[-(len(test)+100):-len(test)]
print('dev 长度:',len(dev))

test = result[-len(test):]
print('test 长度', len(test))

训练集和label的长度: 3954 3954
dev 长度: 100
test 长度 1410


5 训练模型

5.1 Random Forest

In [46]:
print('Random Forest ...')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(train, train_label)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(train, train_label)
Y_pred = clf.predict(dev)
print ('The F-Score is:', f1_score(dev_label, Y_pred, average='macro'))

Random Forest ...
The F-Score is: 0.38694554633970435


5.2 MLP

In [None]:
from sklearn.neural_network import MLPClassifier
import numpy as np

mlp = MLPClassifier(hidden_layer_sizes=[5000],activation='logistic',solver ='adam',random_state=3,max_iter=1000)
mlp.fit(train, train_label)
Y_pred = mlp.predict(test)
print('DONE!')
# print ('The F-Score is:', f1_score(dev_label, Y_pred, average='macro'))

6 输出结果

In [86]:
TEST_LABEL_PREDICTION_PATH = './output/test-output.json'

def save_test_label(prediction_list, path):
    test_total_labels_dict = dict()
    num = 0
    for test_label in prediction_list:
        label_dict = dict()
        label_dict['label'] = test_label
        test_total_labels_dict['test-'+str(num)] = label_dict
        num = num + 1
    json_str = json.dumps(test_total_labels_dict)
    with open(path, 'w') as json_file:
        json_file.write(json_str)
Y_pred = Y_pred.tolist()
save_test_label(Y_pred, TEST_LABEL_PREDICTION_PATH)
print('DONE!')

DONE!
