In [1]:
import tensorflow
print(tensorflow.keras.__version__)
print(tensorflow.__version__)


  from ._conv import register_converters as _register_converters


2.1.6-tf
1.12.0


In [13]:
# !pip install tensorflow==1.12.0
# !pip install gensim
#!pip install pandas sklearn 

In [2]:
import pandas
import numpy as np
import pickle  
import time  
import json
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, Activation, LSTM,Dropout, InputLayer
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from gensim.models import KeyedVectors

In [5]:
df = pandas.read_table('./data/asmsr/unigram_input.utf8', header=None)
label = pandas.read_table('./data/asmsr/unigram_label.txt', header=None)

In [5]:
# label.shape

(12418519, 1)

In [6]:
df['label'] = label[0]
df.columns = ['character', 'label']

In [7]:
df.head()

Unnamed: 0,character,label
0,“,S
1,人,B
2,们,E
3,常,S
4,说,S


In [8]:
### Create sequence

vocabulary_size = 128

tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df['character'])
sequences = tokenizer.texts_to_sequences(df['character'])
data = pad_sequences(sequences, maxlen=50)

data.shape

(12418519, 50)

In [9]:
labels = LabelEncoder().fit_transform(df.label)
labels[:5]

array([3, 0, 1, 3, 3])

In [10]:
labels.shape

(12418519,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)

In [12]:
X_train.shape, y_train.shape

((8320407, 50), (8320407,))

In [14]:
####Load Embedding
embedding_model = KeyedVectors.load_word2vec_format('./data/asmsr/wang.txt')
embedding_model

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1231d6550>

In [15]:
embedding_dim = len(embedding_model[next(iter(embedding_model.vocab))])
embedding_dim

100

In [16]:
embedding_matrix = np.random.rand(256, embedding_dim)
embedding_matrix

array([[0.36236937, 0.13579486, 0.37293746, ..., 0.63957408, 0.64697332,
        0.69093361],
       [0.23546561, 0.46767522, 0.16060438, ..., 0.37154723, 0.6736566 ,
        0.04704969],
       [0.0251205 , 0.32985056, 0.31671518, ..., 0.71157356, 0.54858831,
        0.3097355 ],
       ...,
       [0.98849906, 0.54533457, 0.69901819, ..., 0.66508425, 0.11274037,
        0.02664581],
       [0.3132925 , 0.91735305, 0.72715888, ..., 0.1536537 , 0.06369135,
        0.0883931 ],
       [0.51778437, 0.93261935, 0.20276315, ..., 0.70576082, 0.86636102,
        0.98828809]])

In [17]:
word_index = tokenizer.word_index

for word, i in word_index.items():
    if i < vocabulary_size:
        try:
          embedding_vector = embedding_model.get_vector(word)
          embedding_matrix[i] = embedding_vector
        except:
          pass

In [18]:
def precision(y_true, y_pred):	
    """Precision metric.	
    Only computes a batch-wise average of precision. Computes the precision, a
    metric for multi-label classification of how many selected items are
    relevant.
    """	
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))	
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))	
    precision = true_positives / (predicted_positives + K.epsilon())	
    return precision

In [19]:
CLASSES = 4
max_len = 50
word_size = 100

def build_model(char_size, dropout, lr, optimizer):
    model = Sequential()
    model.add(Embedding(char_size, word_size, input_length=max_len, weights=[embedding_matrix]))
    
    model.add(LSTM(char_size, dropout=dropout, recurrent_dropout=dropout, return_sequences=True,name='backward_lstm')) 
    
    model.add(LSTM(char_size, dropout=dropout, recurrent_dropout=dropout, return_sequences=False, name='forward_lstm')) 
    
    model.add(Dropout(dropout))
    model.add((Dense(CLASSES, activation='softmax')))
    sgd = optimizers.SGD(lr=lr)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=[precision, 'accuracy'])
    return model

In [21]:
from tensorflow.keras.utils import to_categorical

y = to_categorical(y_train, 4)

In [22]:
y.shape

(8320407, 4)

In [None]:
filepath="/Users/oluwayetty1/Downloads/weights-improvement-{epoch:02d}-{precision:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor=precision, verbose=1, save_best_only=True, mode='max')
csv_logger = CSVLogger('/Users/oluwayetty1/Downloads/training.log', separator=',', append=False)
callbacks_list = [checkpoint,csv_logger]

model = build_model(256, 0.15, 0.04, 'adam')

history = model.fit(X_train, y, validation_split=0.33, epochs=1, batch_size=10, callbacks=callbacks_list, verbose=1)

# serialize model to JSON
model_json = model.to_json()
with open("/Users/oluwayetty1/Downloads/asmsr_model_stacking2.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights('/Users/oluwayetty1/Downloads/asmsr_weights_stacking2.h5', overwrite=True)

Train on 5574672 samples, validate on 2745735 samples
Epoch 1/1
  30130/5574672 [..............................] - ETA: 91:24:34 - loss: 1.2400 - precision: 0.2414 - acc: 0.4027

In [25]:
from sklearn.metrics import classification_report

In [95]:
y_true = y_test

In [96]:
tg = np.unique(y_test)

In [28]:
y_pred = model.predict_classes(X_test[:500])

In [29]:
y_pred

array([1, 3, 1, 3, 3, 3, 0, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,
       3, 1, 1, 1, 0, 1, 1, 3, 1, 3, 1, 3, 1, 3, 0, 3, 3, 3, 1, 3, 1, 1,
       3, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 1, 3, 1, 0, 1, 1, 1, 1, 3,
       1, 3, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 3, 3, 1, 1, 0, 1, 3, 1, 1,
       3, 1, 3, 3, 1, 1, 3, 1, 0, 1, 1, 0, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1,
       1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 3, 3, 1, 3, 1, 1, 1, 3, 0, 3, 3, 1, 0, 1, 1, 3,
       3, 1, 1, 1, 3, 1, 0, 1, 0, 3, 1, 3, 1, 1, 3, 3, 0, 1, 1, 1, 0, 1,
       3, 1, 1, 1, 3, 3, 3, 0, 3, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 3, 3, 3, 1, 1, 1, 3, 1, 1, 0, 1, 3, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 3, 0, 1, 1, 1, 1,
       1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 1, 3, 1, 1, 1, 1, 3, 0, 3, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 3, 0, 1, 1, 1, 3, 1, 3, 3, 0,

In [30]:
y_tra[:500]

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [93]:
print(classification_report(y_true, y_pred, target_names=['0', '1', '2', '3']))

              precision    recall  f1-score   support

           0       0.43      0.88      0.57       225
           1       0.60      0.23      0.34       232
           2       0.50      0.02      0.04        43
           3       0.87      0.62      0.72       200

   micro avg       0.54      0.54      0.54       700
   macro avg       0.60      0.44      0.42       700
weighted avg       0.61      0.54      0.50       700

