In [None]:
!nvidia-smi

In [None]:
from IPython.display import Image, display_png
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score, precision_score, recall_score, confusion_matrix
from collections import Counter
from tqdm import tqdm
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Bidirectional , LSTM, GlobalMaxPool1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dropout, Activation, Input, Dense, concatenate, GRU, Conv1D

In [None]:
# Record the TensorFlow version this notebook is running against.
print(tf.__version__)

<img src="https://i.imgur.com/ZdNNZz3.jpg" width="800">

<img src="https://i.imgur.com/BXyi2Bz.png" width="800">

In [None]:
def multi_input_proportional_generator(datasets,
                                       label,
                                       p=(0.1, 0.9),
                                       batch_size=128):
    """Endlessly yield class-rebalanced batches for a two-input model.

    Every sample of a batch is drawn in two steps: first a class id is drawn
    according to ``p``, then one row of that class is drawn uniformly (with
    replacement).

    Args:
        datasets: Pair of index-aligned, indexable collections; row ``i`` of
            ``datasets[0]`` and ``datasets[1]`` belong to the same sample.
        label: Class id of every row (values ``0 .. len(p) - 1``). A 1-D
            array or an ``(n, 1)`` column both work.
        p: Sampling probability of each class; its length defines the number
            of classes.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``([batch_0, batch_1], batch_label)`` as NumPy arrays.

    Raises:
        ValueError: On the first ``next()`` if a class with non-zero
            probability has no rows in ``label``.
    """
    num_classes = len(p)
    label_array = np.asarray(label)
    # The row indices of each class never change between batches, so they are
    # computed once up front instead of on every iteration.
    query_idx = [
        np.where(label_array == class_id)[0] for class_id in range(num_classes)
    ]
    for class_id, prob in enumerate(p):
        if prob > 0 and len(query_idx[class_id]) == 0:
            raise ValueError(
                "class %d has sampling probability %s but no samples in label"
                % (class_id, prob))

    while True:
        sample_id = np.random.choice(num_classes, batch_size, p=p)
        picked = [
            np.random.choice(query_idx[class_id], 1)[0]
            for class_id in sample_id
        ]
        batch_data = [
            np.array([datasets[0][row] for row in picked]),
            np.array([datasets[1][row] for row in picked]),
        ]
        batch_label = np.array([label[row] for row in picked])
        yield batch_data, batch_label
        
def load_fasttext_fast(word_index, max_words, embed_size,file_name = "../input/word-vec-thai/cc.th.300.vec"):
    """Build an embedding matrix from a fastText text-format ``.vec`` file.

    Rows start as Gaussian noise (fixed mean/std below) and are overwritten
    with the pretrained vector for every word that appears in ``word_index``
    with an index below ``max_words``.

    Args:
        word_index: Mapping from word to integer row index.
        max_words: Number of rows of the returned matrix; words whose index is
            ``>= max_words`` are ignored.
        embed_size: Number of columns; only the first ``embed_size`` values of
            each file line are used.
        file_name: Path of the ``.vec`` file (one ``word v1 v2 ...`` per line).

    Returns:
        ``numpy.ndarray`` of shape ``(max_words, embed_size)``.
    """
    emb_mean, emb_std = -0.0033470048, 0.109855264

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(file_name, 'r', encoding="utf8") as f:
        for line in f:
            word, _, vec = line.partition(' ')
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            # Whitespace split drops the trailing newline / trailing space, and
            # slicing happens before the float conversion so extra tokens can
            # never break the parse.
            values = vec.split()
            if len(values) < embed_size:
                # Header line ("<count> <dim>") or a truncated entry.
                continue
            embedding_matrix[i] = np.asarray(values[:embed_size], dtype='float32')
    return embedding_matrix

In [None]:
def recall(y_true, y_pred):
    """Batch-level recall metric for Keras.

    Both tensors are clipped to [0, 1] and rounded, so predictions count as
    positive after rounding. The result is the number of hits divided by the
    number of positive targets, with ``K.epsilon()`` guarding the division.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())


def precision(y_true, y_pred):
    """Batch-level precision metric for Keras.

    Counts rounded, clipped hits (``y_true * y_pred``) and divides them by the
    number of rounded positive predictions, with ``K.epsilon()`` guarding the
    division.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())


def microf1(y_true, y_pred):
    """Batch-level F1 computed from counts pooled over every element.

    Precision and recall are derived from the same rounded/clipped counts as
    the stand-alone ``precision`` and ``recall`` metrics, then combined as the
    harmonic mean (epsilon-guarded).
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))

    prec = hit_count / (predicted_count + K.epsilon())
    rec = hit_count / (actual_count + K.epsilon())
    return 2 * ((prec * rec) / (prec + rec + K.epsilon()))


def macrof1(y_true, y_pred):
    """Batch-level F1 averaged over output columns.

    Predictions are rounded, true-positive / false-positive / false-negative
    counts are summed along axis 0 (one count per column), an F1 score is
    computed per column and the mean of those scores is returned.
    """
    hard_pred = K.round(y_pred)

    true_pos = K.sum(K.cast(y_true * hard_pred, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - hard_pred), 'float'), axis=0)

    col_precision = true_pos / (true_pos + false_pos + K.epsilon())
    col_recall = true_pos / (true_pos + false_neg + K.epsilon())

    col_f1 = 2 * col_precision * col_recall / (col_precision + col_recall + K.epsilon())
    # Replace any NaN score by zero before averaging.
    col_f1 = tf.where(tf.math.is_nan(col_f1), tf.zeros_like(col_f1), col_f1)
    return K.mean(col_f1)

In [None]:
def process_input(num_words, X_train, X_test, X_dev):
    """Tokenize and post-pad three text splits with one shared tokenizer.

    The tokenizer is fitted on ``X_train`` only; all splits are padded to the
    length of the longest training sequence.

    Note the argument order is (train, test, dev), matching the order of the
    returned arrays.

    Returns:
        ``(train_seqs, test_seqs, dev_seqs, max_sequence_len, tokenizer)``.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token='<UNK>')
    tokenizer.fit_on_texts(X_train)

    # Text -> lists of integer word ids.
    train_ids = tokenizer.texts_to_sequences(X_train)
    dev_ids = tokenizer.texts_to_sequences(X_dev)
    test_ids = tokenizer.texts_to_sequences(X_test)

    longest = max(len(seq) for seq in train_ids)

    def pad(seqs):
        return tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=longest, padding="post")

    return pad(train_ids), pad(test_ids), pad(dev_ids), longest, tokenizer

In [None]:
def process_input_combined(num_words, X_train, X_dev, X_test):
    """Tokenize and post-pad paired text splits with one shared vocabulary.

    Each split is a 2-D array whose column 0 and column 1 hold the two texts
    of a pair. The tokenizer is fitted on both columns of ``X_train``; every
    output is padded to the longest training sequence of either column.

    Returns:
        ``(train_1, train_2, dev_1, dev_2, test_1, test_2, max_len, tokenizer)``
        where ``_1`` / ``_2`` are the padded id arrays for column 0 / column 1.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token='<UNK>')
    tokenizer.fit_on_texts(list(X_train[:, 0]) + list(X_train[:, 1]))

    def to_ids(split, column):
        # Text -> lists of integer word ids for one column of a split.
        return tokenizer.texts_to_sequences(split[:, column])

    train_seqs1, train_seqs2 = to_ids(X_train, 0), to_ids(X_train, 1)
    dev_seqs1, dev_seqs2 = to_ids(X_dev, 0), to_ids(X_dev, 1)
    test_seqs1, test_seqs2 = to_ids(X_test, 0), to_ids(X_test, 1)

    max_len = max(len(seq) for seq in train_seqs1 + train_seqs2)

    def pad(seqs):
        return tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=max_len, padding="post")

    return (pad(train_seqs1), pad(train_seqs2),
            pad(dev_seqs1), pad(dev_seqs2),
            pad(test_seqs1), pad(test_seqs2),
            max_len, tokenizer)

In [None]:
# Load the pre-tokenised case table.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code; only load this from a trusted source.
df = pd.read_pickle('../input/siamese-legal/processed_torts20210321.pkl')
# Preview the first rows.
df.head()

In [None]:
# Maps a `legal_encoded` id to the Thai name and section number of the statute
# it stands for; one model is trained per key.
# The names read: Civil Procedure Code (วิธีพิจารณาความแพ่ง), Civil and Commercial
# Code (แพ่งและพาณิชย์) and Criminal Procedure Code (วิธีพิจารณาความอาญา).
idx = {251 : 'ประมวลกฎหมายวิธีพิจารณาความแพ่ง 55',
692 : 'ประมวลกฎหมายแพ่งและพาณิชย์ 425',
704 : 'ประมวลกฎหมายแพ่งและพาณิชย์ 438',
154 : 'ประมวลกฎหมายวิธีพิจารณาความแพ่ง 172',
135 : 'ประมวลกฎหมายวิธีพิจารณาความแพ่ง 142',
715 : 'ประมวลกฎหมายแพ่งและพาณิชย์ 448',
688 : 'ประมวลกฎหมายแพ่งและพาณิชย์ 421',
202 : 'ประมวลกฎหมายวิธีพิจารณาความแพ่ง 249',
100 : 'ประมวลกฎหมายวิธีพิจารณาความอาญา 46',
417 :  'ประมวลกฎหมายแพ่งและพาณิชย์ 1336'}

In [None]:
# Label ids in the dict's insertion order — despite the name, nothing is sorted here.
sorted_idx = list(idx.keys())

In [None]:
# Preview the first ten label ids.
sorted_idx[0:10]

In [None]:
# Build one binary pair-classification dataset per statute label.
# Each sample is [plaintiff tokens, statute tokens]; the target is 1 when the
# row carries this label and 0 otherwise.
Dataset = {}
for label in sorted_idx:
    positive_df = df[df['legal_encoded'] == label]
    # Negatives are rows from cases that never carry this label at all.
    negative_df = df[~df.case_id.isin(positive_df.case_id)]

    # Statute text is taken from the first row that carries the label.
    x2 = positive_df.iloc[0].legal_content_token
    positive_texts = positive_df.plaintiff_token.values
    negative_texts = negative_df.plaintiff_token.values

    pairs = [[text, x2] for text in positive_texts]
    pairs += [[text, x2] for text in negative_texts]
    targets = [[1] for _ in positive_texts] + [[0] for _ in negative_texts]

    X = np.array(pairs, dtype=object)
    Y = np.array(targets)
    print((X.shape, Y.shape))
    Dataset[label] = {'X': X, 'Y': Y}

In [None]:
def bilstm_base_network(input_len: int,
                        num_words: int,
                        embedding_size: int = 300):
    """Build one text-encoder tower: trainable Embedding followed by a BiLSTM.

    Args:
        input_len: Length of the (padded) input id sequence.
        num_words: Vocabulary size, i.e. the Embedding input dimension.
        embedding_size: Width of each embedding vector.

    Returns:
        ``(input_layer, encoded)`` where ``encoded`` is the output tensor of
        the bidirectional LSTM. No dense projection, dropout or flatten is
        applied on top of it.
    """
    input_layer = tf.keras.layers.Input(shape=(input_len,))
    embedding_layer = tf.keras.layers.Embedding(num_words, embedding_size, trainable=True)(input_layer)
    bilstm_1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128, recurrent_dropout=0.3))(embedding_layer)
    # The BiLSTM output is the encoding handed back to the caller; layers that
    # were built on top of it but never returned have been removed.
    return input_layer, bilstm_1


def siamese_bilstm(input_left: int,
                   input_right: int,
                   num_words: int,
                   tokenizer_left,
                   tokenizer_right,
                   embedding_size: int = 300):
    """Assemble a two-tower BiLSTM pair classifier.

    Each side gets its own encoder from ``bilstm_base_network_pretrained_emb``
    (the two towers are built by separate calls, so they do not share layers).
    The element-wise absolute difference of the two encodings feeds a single
    sigmoid unit. The model summary is printed before the model is returned.

    Args:
        input_left: Sequence length of the left input.
        input_right: Sequence length of the right input.
        num_words: Vocabulary size passed to both encoders.
        tokenizer_left: Tokenizer whose ``word_index`` seeds the left embedding.
        tokenizer_right: Tokenizer whose ``word_index`` seeds the right embedding.
        embedding_size: Width of the word vectors.

    Returns:
        An uncompiled ``tf.keras`` model taking ``[left, right]`` inputs.
    """
    left_input, left_encoding = bilstm_base_network_pretrained_emb(
        input_left, num_words, tokenizer_left, embedding_size)
    right_input, right_encoding = bilstm_base_network_pretrained_emb(
        input_right, num_words, tokenizer_right, embedding_size)

    # Custom layer: |left - right| per feature.
    abs_difference = tf.keras.layers.Lambda(
        lambda pair: tf.math.abs(pair[0] - pair[1])
    )([left_encoding, right_encoding])

    prediction = tf.keras.layers.Dense(1, activation='sigmoid')(abs_difference)
    model = tf.keras.models.Model([left_input, right_input], prediction)
    model.summary()
    return model

#Add bilstm pretrained embedding layer
def bilstm_base_network_pretrained_emb(input_len: int,
                        num_words: int,
                        tokenizer,
                        embedding_size: int = 300):
    """Build one encoder tower: frozen pretrained Embedding followed by a BiLSTM.

    The embedding weights come from ``load_fasttext_fast`` using the
    tokenizer's ``word_index``; index 0 is treated as padding (``mask_zero``)
    and the embedding is not trained.

    Args:
        input_len: Length of the (padded) input id sequence.
        num_words: Vocabulary size, i.e. number of embedding rows.
        tokenizer: Fitted tokenizer exposing ``word_index``.
        embedding_size: Width of each embedding vector.

    Returns:
        ``(input_layer, encoded)`` where ``encoded`` is the output tensor of
        the bidirectional LSTM. No dense projection, dropout or flatten is
        applied on top of it.
    """
    embedding_matrix =  load_fasttext_fast(tokenizer.word_index,num_words,embedding_size)
    input_layer = tf.keras.layers.Input(shape=(input_len,))
    embedding_layer = tf.keras.layers.Embedding(num_words, embedding_size,weights=[embedding_matrix],mask_zero = True, trainable=False)(input_layer) #embedding label + article
    bilstm_1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128, recurrent_dropout=0.3))(embedding_layer)
    # The BiLSTM output is the encoding handed back to the caller; layers that
    # were built on top of it but never returned have been removed.
    return input_layer, bilstm_1


In [None]:
#model = siamese_bilstm(max_sequence_len_1, max_sequence_len_2, NUM_WORDS, tokenizer_1,tokenizer_2, 300)
#model.compile(optimizer="Nadam", loss="binary_crossentropy", metrics=['accuracy', tf.keras.metrics.AUC(), recall, precision, microf1, macrof1])

In [None]:
def euclidean_distance(vects):
    """Row-wise Euclidean distance between two batches of vectors.

    Args:
        vects: Pair ``(x, y)`` of tensors with matching shapes.

    Returns:
        Tensor holding one distance per row (the sum runs over axis 1 and the
        axis is kept).
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # The gradient of sqrt is infinite at exactly 0 (identical inputs), which
    # turns the weights into NaN; clamp to epsilon before the root.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def eucl_dist_output_shape(shapes):
    """Output shape of the distance Lambda: the first input's batch size, one column.

    Args:
        shapes: Pair of input shapes; only the first entry's batch dimension
            is used.
    """
    first_shape, _second_shape = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

In [None]:
# Number of sample pairs in every generated batch.
BATCH_SIZE=64
# Training epochs for each per-label model.
EPOCHS = 20
# Vocabulary cap for the tokenizer and number of rows of the embedding matrix.
NUM_WORDS=8000
# Width of each word vector.
embedding_size = 300

In [None]:
# Train and evaluate one siamese CNN per statute label.
for label in sorted_idx[0:10]:
    X = Dataset[label]['X']
    Y = Dataset[label]['Y']

    # 70 % train; the remaining 30 % is halved into test and dev.
    X_train, X_rest, Y_train, Y_rest = train_test_split(X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(X_rest, Y_rest, test_size=0.5, random_state=42)

    # Tokenize + pad both columns of every split with a shared vocabulary.
    train_seqs1, train_seqs2, dev_seqs1, dev_seqs2, test_seqs1, test_seqs2, max_len, tokenizer = process_input_combined(NUM_WORDS, X_train, X_dev, X_test)

    # Pretrained word vectors aligned to this label's tokenizer.
    embedding_matrix = load_fasttext_fast(tokenizer.word_index, NUM_WORDS, embedding_size)

    # Training batches are class-balanced; validation keeps the generator's
    # default class mix.
    train_generator = multi_input_proportional_generator([train_seqs1, train_seqs2], Y_train, p=[0.5, 0.5], batch_size=BATCH_SIZE)
    validation_generator = multi_input_proportional_generator([dev_seqs1, dev_seqs2], Y_dev, batch_size=BATCH_SIZE)

    # --- layers shared by both towers (this is what makes the net siamese) ---
    # Layers are referenced through `tf.keras` so the cell does not depend on
    # the bare `regularizers` / `Conv1D` / ... names being in scope.
    conv_layer = tf.keras.layers.Conv1D(128, 15, activation='relu',
                                        kernel_regularizer=tf.keras.regularizers.l2(0.0001),
                                        bias_regularizer=tf.keras.regularizers.l2(0.0001))
    emb_layer = tf.keras.layers.Embedding(NUM_WORDS, embedding_size, input_length=max_len,
                                          weights=[embedding_matrix], trainable=False, mask_zero=True)
    flatten = tf.keras.layers.Flatten()

    # --- two inputs pushed through the same layers ---
    input_layer = tf.keras.layers.Input(shape=(max_len,))
    flatten1 = flatten(conv_layer(emb_layer(input_layer)))

    input_layer2 = tf.keras.layers.Input(shape=(max_len,))
    flatten2 = flatten(conv_layer(emb_layer(input_layer2)))

    # Distance between the two encodings -> single sigmoid unit.
    merged = tf.keras.layers.Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([flatten1, flatten2])
    preds = tf.keras.layers.Dense(1, activation='sigmoid')(merged)
    model = tf.keras.Model(inputs=[input_layer, input_layer2], outputs=preds)

    model.compile(optimizer='Adam', loss="binary_crossentropy", metrics=['accuracy', recall, precision, microf1, macrof1])
    model.summary()

    # At least one step, so a training set smaller than one batch still trains.
    num_batches = max(1, len(train_seqs1) // BATCH_SIZE)

    history = model.fit(train_generator, epochs=EPOCHS, steps_per_epoch=num_batches, validation_data=validation_generator, validation_steps=num_batches, verbose=1)

    print("Evaluation")
    print(f'<<<<<<label {label}>>>>>>>>' )

    predictions = model.predict([test_seqs1, test_seqs2], verbose=1)
    # Threshold the sigmoid output at 0.5.
    y_pred = [1 if lst[0] > 0.5 else 0 for lst in predictions]
    print('f1 : ')
    print(f1_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print('precision : ')
    print(precision_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print('recall : ')
    print(recall_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print(classification_report(Y_test, y_pred))
    print('------------------------------------------------------------ ')
    print(confusion_matrix(Y_test, y_pred))
    print('**************************************************************')

In [None]:
# Render the architecture diagram to model.png and show it inline.
# `model` is whatever the training loop assigned last, i.e. the network built
# for the final label it processed.
tf.keras.utils.plot_model(model, show_shapes=True,to_file='model.png')
display_png(Image('model.png'))