# Make predictions

In [1]:
# obtain reproducible results

import numpy as np
import tensorflow as tf
import random as rn

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(42)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(12345)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

# Check GPU device.
print(K.tensorflow_backend._get_available_gpus())

Using TensorFlow backend.


[]


In [2]:
#https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/
#https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#RNN

import os
import pandas as pd
import re
from tqdm import tqdm
from multiprocessing import Pool
from spellchecker import SpellChecker
import string
import math

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from sklearn.model_selection import train_test_split
from nltk import word_tokenize

# For encoding labels.
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [3]:
data = pd.read_pickle('/Users/yizhuoli/Downloads/WebCrawler/RICF_private/output_bin/df_proj.pkl.gz', compression='gzip')
data.head()

Unnamed: 0,proj_name,service_area,proj_desc_zh,proj_desc_en
0,购买导师、学位服,教育,支助武汉纺织大学教育事业发展,Support the development of education in Wuhan ...
1,奖励教师,教育,奖励教师,Reward teacher
2,奖励资助学生,教育,奖励资助学生,Reward funded students
3,教师及学生培训,教育,教师及学生培训费用,Teacher and student training fees
4,资助义务教育学生,教育,资助义务教育阶段,Funding compulsory education


In [4]:
# See the composition by NTEE major groups.
print(data.groupby('service_area')['proj_name'].count()/len(data))

service_area
体育         0.008939
公益事业发展     0.113826
医疗卫生       0.087505
志愿服务       0.013707
扶贫及社区发展    0.093564
政策倡导       0.006555
教育         0.539829
文化艺术       0.038439
法律与公民权力    0.005165
灾害救助       0.009138
生态环境       0.027910
社会服务       0.039035
科学研究       0.016389
Name: proj_name, dtype: float64


In [5]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(list(data.service_area.unique()))

data_y=lb.transform(data['service_area'])

In [6]:
data_y[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [7]:
import pickle
# Save LabelBinarizer class for developing package.
with open('../../output/chinese_lb_broad_cat.pkl', 'wb') as output:
    pickle.dump(lb, output, pickle.HIGHEST_PROTOCOL)

In [8]:
text_token_list=data['proj_desc_zh']

In [66]:
import jieba
jieba.enable_parallel(4) # 开启并行分词模式，参数为并行进程数 

token_sentence_list = []
sentences_for_token = []

for sentence in tqdm(text_token_list.to_list()):
    seg_list = jieba.cut(str(sentence), cut_all=False, HMM=True)
    cur_seg_list = list(seg_list)
    token_sentence_list.append(cur_seg_list)
    sentences_for_token.extend(cur_seg_list)

100%|██████████| 10068/10068 [00:07<00:00, 1364.19it/s]


In [70]:
token_sentence_list[:5]

[['支助', '武汉', '纺织', '大学', '教育', '事业', '发展'],
 ['奖励', '教师'],
 ['奖励', '资助', '学生'],
 ['教师', '及', '学生', '培训', '费用'],
 ['资助', '义务教育', '阶段']]

In [68]:
len(sentences_for_token)

699453

In [69]:
len(token_sentence_list)

10068

In [57]:
# Build word index for train and validation texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_for_token)
print(list(tokenizer.word_index.items())[0:50])

[('的', 1), ('、', 2), ('。', 3), ('项目', 4), ('年', 5), ('和', 6), ('为', 7), ('“', 8), ('”', 9), ('2017', 10), ('资助', 11), ('基金会', 12), ('在', 13), ('了', 14), ('元', 15), ('等', 16), ('学生', 17), ('月', 18), ('万元', 19), ('活动', 20), ('捐赠', 21), ('开展', 22), ('教育', 23), ('与', 24), ('用于', 25), ('发展', 26), ('公益', 27), ('学校', 28), ('及', 29), ('名', 30), ('是', 31), ('对', 32), ('中国', 33), ('社会', 34), ('进行', 35), ('人', 36), ('奖励', 37), ('建设', 38), ('由', 39), ('儿童', 40), ('支持', 41), ('通过', 42), ('文化', 43), ('该', 44), ('基金', 45), ('家庭', 46), ('工作', 47), ('设立', 48), ('日', 49), ('1', 50)]


In [58]:
import pickle
# Save tokenizer class for developing package.
with open('../../output/chinese_tokenizer.pkl', 'wb') as output:
    pickle.dump(tokenizer, output, pickle.HIGHEST_PROTOCOL)

In [71]:
seq_encoding_text = tokenizer.texts_to_sequences(token_sentence_list)

In [73]:
seq_encoding_text[:5]

[[4154, 724, 1092, 79, 23, 113, 26],
 [37, 56],
 [37, 11, 17],
 [56, 29, 17, 81, 203],
 [11, 1128, 769]]

In [74]:
# Pads sequences to the same length (i.e., prepare matrix).
data_sequences = pad_sequences(sequences=seq_encoding_text,
                               # Max length of the sequence.
                               maxlen=max([len(s) for s in seq_encoding_text]),
                               dtype="int32", padding="post", truncating="post",
                               # Zero is used for representing None or Unknown.
                               value=0
                               )

In [76]:
data_sequences[:5]

array([[4154,  724, 1092, ...,    0,    0,    0],
       [  37,   56,    0, ...,    0,    0,    0],
       [  37,   11,   17, ...,    0,    0,    0],
       [  56,   29,   17, ...,    0,    0,    0],
       [  11, 1128,  769, ...,    0,    0,    0]], dtype=int32)

In [91]:
from gensim.models import KeyedVectors
wv_from_text = KeyedVectors.load_word2vec_format('../classification_algorithms/WV/sgns.baidubaike.bigram-char',
                                                 binary=False, encoding="utf8",  unicode_errors='ignore')  # C text format
print("word2vec load succeed")

word2vec load succeed


In [109]:
EMBEDDING_DIM=300

embedding_matrix = np.zeros((len(tokenizer.word_index)+1, EMBEDDING_DIM)) # Plus one: embedding matrix starts from 0, word index starts from 1.

In [111]:
len(embedding_matrix[1])

300

In [112]:
len(wv_from_text.get_vector('的'))

300

In [113]:
for word, index in tokenizer.word_index.items():
    try:
        embedding_matrix[index] = wv_from_text.get_vector(word)
    except:
        pass
        # words not found in embedding index will be all-zeros.

In [114]:
embedding_matrix[2]

array([-2.38736004e-01,  3.99206012e-01,  4.68591988e-01,  2.27039997e-02,
        1.02711998e-01,  4.20415014e-01,  4.57598001e-01,  2.16646999e-01,
        7.90826023e-01,  8.36820006e-02, -1.47080002e-02, -4.44790989e-01,
       -3.42970006e-02,  7.74936020e-01, -4.60329987e-02,  4.66491014e-01,
       -2.89357007e-01,  5.02340019e-01,  3.28007996e-01, -6.75639987e-01,
        5.65692008e-01, -3.43730986e-01, -1.89420003e-02, -7.98878014e-01,
        4.05577004e-01,  1.72290001e-02,  4.81579006e-01,  2.92982012e-01,
       -4.76305008e-01,  5.46975970e-01,  5.50563991e-01, -3.19233000e-01,
        4.52850997e-01, -1.10495001e-01, -2.23923996e-01,  3.98712993e-01,
       -8.58199969e-02,  9.53070000e-02,  5.71292996e-01,  1.10839996e-02,
       -2.97479004e-01, -3.58787000e-01,  6.14437997e-01,  1.67586997e-01,
       -8.35470036e-02,  3.57475996e-01, -2.91913986e-01,  4.54210997e-01,
        6.67499006e-01,  6.94697022e-01, -5.39278984e-01,  4.34902012e-01,
       -3.34470004e-01,  

In [115]:
# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, # Size of vocabulary.
                            input_length=max([len(s) for s in seq_encoding_text]), # Length of input, i.e., length of padded sequence.
                            output_dim=EMBEDDING_DIM, # Size of the vector space in which words will be embedded.
                            weights=[embedding_matrix],
                            trainable=False
                           )

In [126]:
data.service_area.unique()

array(['教育', '医疗卫生', '志愿服务', '扶贫及社区发展', '社会服务', '公益事业发展', '文化艺术', '生态环境',
       '法律与公民权力', '体育', '政策倡导', '科学研究', '灾害救助'], dtype=object)

In [128]:
# token y
y_tk = Tokenizer()
y_tk.fit_on_texts(data.service_area.unique())
index_list = y_tk.texts_to_sequences(data.service_area.to_list())

In [131]:
data.service_area.unique()

array(['教育', '医疗卫生', '志愿服务', '扶贫及社区发展', '社会服务', '公益事业发展', '文化艺术', '生态环境',
       '法律与公民权力', '体育', '政策倡导', '科学研究', '灾害救助'], dtype=object)

In [130]:
len(data.service_area.unique())

13

In [None]:
y = pad_sequences(index_list, maxlen=max([len(s) for s in index_list]),)

In [123]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data_sequences, data.service_area.to_list(), test_size=0.33, random_state=42)

In [124]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, GlobalMaxPool1D, Conv1D

# with tf.device('/gpu:1'): # Specify which GPU to use.
#     # define the model
model = Sequential()
model.add(embedding_layer)
# model.add(Flatten())
model.add(Conv1D(128, 5, activation='softplus'))
model.add(GlobalMaxPool1D())
model.add(Dense(units=32, activation='sigmoid'))
model.add(Dense(units=32, activation='softplus'))
model.add(Dense(units=16, activation='tanh'))
model.add(Dense(units=16, activation='softplus'))
model.add(Dense(units=len(y_train[0]), activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc', 
#                                                                      precision, recall
                                                                    ])
# summarize the model
print(model.summary())

# fit the model
history=model.fit(x_train, y_train, validation_split=0.2, epochs=1, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 919, 300)          11999100  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 915, 128)          192128    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_6 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_7 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_8 (Dense)              (None, 16)                272       
__________

AttributeError: 'str' object has no attribute 'ndim'

In [None]:
# Continue previous work.
df_history=pd.read_csv('../../output/chinese_grid_search_history_broad_cat.tsv', sep='\t', index_col=0)

In [None]:
param_list_done=set(map(tuple, 
                        df_history[['conv_num_filters', 'conv_kernel_size', 'conv_act', 'out_act']].values.tolist()
                       )
                   )

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, GlobalMaxPool1D, Conv1D
from datetime import datetime

# df_history=pd.DataFrame()
for num_filters in [32, 64, 128]:
    for kernel_size in [3,5,7]:
        for conv_act in ['sigmoid', 'softplus', 'tanh', 'softmax']:
            for out_act in ['sigmoid', 'softplus', 'tanh', 'softmax']:
                param=tuple((num_filters, kernel_size, conv_act, out_act))
                if param not in param_list_done:
                    t1=datetime.now()
                    # Run NN on a specified GPU.
                    with tf.device('/device:GPU:0'):
                        # define the model
                        model = Sequential()
                        model.add(embedding_layer)
                        # model.add(Flatten())
                        model.add(Conv1D(num_filters, kernel_size, activation=conv_act))
                        model.add(GlobalMaxPool1D())
                        model.add(Dense(units=32, activation='sigmoid'))
                        model.add(Dense(units=32, activation='softplus'))
                        model.add(Dense(units=16, activation='tanh'))
                        model.add(Dense(units=16, activation='softplus'))
                        model.add(Dense(units=len(y_train[0]), activation=out_act))
                        # compile the model
                        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
                        # F1, precision, and recall removed. https://github.com/keras-team/keras/issues/5794
                        # fit the model
                        history=model.fit(x_train, y_train, validation_split=0.2, epochs=50, verbose=0)
                        y_prob = model.predict(x_val, verbose=0)
                    # Save history.
                    acc = history.history['acc']
                    val_acc = history.history['val_acc']
                    loss = history.history['loss']
                    val_loss = history.history['val_loss']
                    epochs = range(1, len(acc) + 1)
                    # Calculate on validation dataset.
                    y_classes = y_prob.argmax(axis=-1)
                    y_classes_prob=[s.max() for s in y_prob]
                    y_classes_val=y_val.argmax(axis=-1)
                    df_val=pd.DataFrame({'pred':y_classes, 
                                         'true':y_classes_val, 
                                         'prob':y_classes_prob})
                    val_acc_real=len(df_val[df_val.pred==df_val.true])/len(df_val)
                    # Save history to datafame.
                    df_history_temp=pd.DataFrame()
                    df_history_temp['acc']=acc
                    df_history_temp['val_acc']=val_acc
                    df_history_temp['val_acc_real']=[math.nan]*(len(epochs)-1)+[val_acc_real]
                    df_history_temp['loss']=loss
                    df_history_temp['val_loss']=val_loss
                    df_history_temp['epochs']=epochs
                    df_history_temp['conv_num_filters']=[num_filters]*len(epochs)
                    df_history_temp['conv_kernel_size']=[kernel_size]*len(epochs)
                    df_history_temp['conv_act']=[conv_act]*len(epochs)
                    df_history_temp['out_act']=[out_act]*len(epochs)
                    df_history_temp['time_stamp']=[str(t1)]+[math.nan]*(len(epochs)-2)+[str(datetime.now())]
                    df_history=df_history.append(df_history_temp, ignore_index=True)
                    df_history.to_csv('../../output/grid_search_history_broad_cat.tsv', sep='\t')