In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import jieba.posseg as pseg
import os
from tensorflow import keras 
import pandas as pd

In [6]:
if os.path.isdir("../input/fake-news-pair-classification-challenge"):
    TRAIN_CSV_PATH = '../input/fake-news-pair-classification-challenge/train.csv'
    TEST_CSV_PATH = '../input/fake-news-pair-classification-challenge/test.csv'
    TOKENIZED_TRAIN_CSV_PATH = "../input/apply-jieba-tokenizer/tokenized_train.csv"
    TOKENIZED_TEST_CSV_PATH = "../input/apply-jieba-tokenizer/tokenized_test.csv"
else:
    TRAIN_CSV_PATH = 'train.csv'
    TEST_CSV_PATH = '../input/test.csv'
    TOKENIZED_TRAIN_CSV_PATH = None

In [7]:
train = pd.read_csv(TRAIN_CSV_PATH, index_col='id')
train.head(3)

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated


In [8]:
cols = ['title1_zh', 
        'title2_zh', 
        'label']
train = train.loc[:, cols]
train.head(3)

Unnamed: 0_level_0,title1_zh,title2_zh,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,unrelated
3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,unrelated
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,unrelated


In [9]:
text = '我是台中人，但是我在板橋上班'
words = pseg.cut(text)
[word for word in words]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 0.415 seconds.
Prefix dict has been built successfully.


[pair('我', 'r'),
 pair('是', 'v'),
 pair('台中人', 'n'),
 pair('，', 'x'),
 pair('但是', 'c'),
 pair('我', 'r'),
 pair('在', 'p'),
 pair('板橋', 'n'),
 pair('上班', 'v')]

In [10]:
def jieba_tokenizer(text):
    words = pseg.cut(text)
    return ' '.join([
        word for word, flag in words if flag != 'x'])

In [11]:
train.isna().any()

title1_zh    False
title2_zh     True
label        False
dtype: bool

In [12]:
train.title2_zh.fillna('UNKNOWN', inplace=True)
train.isna().any()

title1_zh    False
title2_zh    False
label        False
dtype: bool

In [13]:
def process(data):
    res = data.apply(jieba_tokenizer)
    return res


def check_merge_idx(data, res):
    assert((data.index == res.index).all(), 'Something error when merge data')

def parallelize(data, func):
    from multiprocessing import cpu_count, Pool
    cores = partitions = cpu_count()
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    res = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    check_merge_idx(data, res)
    return res

  assert((data.index == res.index).all(), 'Something error when merge data')


In [14]:
np.all(train.index == train.title1_zh.index)

True

In [None]:
if os.path.exists("../input/apply-jieba-tokenizer/tokenized_train.csv"):
    print("Use prepared tokenized train data")
    train = pd.read_csv(TOKENIZED_TRAIN_CSV_PATH, index_col='id')
else:
    print("start to training")
    train['title1_tokenized'] = parallelize(train.loc[:, 'title1_zh'], process)
    train['title2_tokenized'] = parallelize(train.loc[:, 'title2_zh'], process)
    train.to_csv('tokenized_train.csv',index=True)

start to training


In [None]:
train.loc[:, ["title1_zh", "title1_tokenized"]].head(10)

In [None]:
train.loc[:, ["title2_zh", "title2_tokenized"]].head(10)

In [None]:
train.fillna('UNKNOWN', inplace=True)

In [None]:
MAX_NUM_WORDS = 10000
tokenizer = keras \
    .preprocessing \
    .text \
    .Tokenizer(num_words=MAX_NUM_WORDS)

In [None]:
corpus_x1 = train.title1_tokenized
corpus_x2 = train.title2_tokenized
corpus = pd.concat([
    corpus_x1, corpus_x2])
corpus.shape

In [None]:
pd.DataFrame(corpus.iloc[:5],
             columns=['title'])

In [None]:
corpus.isna().any()

In [None]:
tokenizer.fit_on_texts(corpus)
x1_train = tokenizer \
    .texts_to_sequences(corpus_x1)
x2_train = tokenizer \
    .texts_to_sequences(corpus_x2)

In [None]:
len(x1_train)

In [None]:
x1_train[:1]

In [None]:
for seq in x1_train[:1]:
    print([tokenizer.index_word[idx] for idx in seq])

In [None]:
MAX_SEQUENCE_LENGTH = 20
x1_train = keras \
    .preprocessing \
    .sequence \
    .pad_sequences(x1_train, 
                   maxlen=MAX_SEQUENCE_LENGTH)

x2_train = keras \
    .preprocessing \
    .sequence \
    .pad_sequences(x2_train, 
                   maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
x1_train[0]

In [None]:
for seq in x1_train + x2_train:
    assert len(seq) == 20
    
print("所有新聞標題的序列長度皆為 20 !")

In [None]:
train.label[:5]

In [None]:
import numpy as np 

# 定義每一個分類對應到的索引數字
label_to_index = {
    'unrelated': 0, 
    'agreed': 1, 
    'disagreed': 2
}

# 將分類標籤對應到剛定義的數字
y_train = train.label.apply(
    lambda x: label_to_index[x])

y_train = np.asarray(y_train) \
            .astype('float32')

y_train[:5]

In [3]:
pip install  tensorflow

Collecting h5py~=2.10.0
  Downloading h5py-2.10.0-cp38-cp38-win_amd64.whl (2.5 MB)
Installing collected packages: h5py
  Attempting uninstall: h5py
    Found existing installation: h5py 3.1.0
    Uninstalling h5py-3.1.0:
      Successfully uninstalled h5py-3.1.0
Successfully installed h5py-2.10.0
Note: you may need to restart the kernel to use updated packages.
