# FUNCTIONS

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding 
from keras.layers import LSTM, Bidirectional 
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D 

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping


from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math, codecs

from prettytable import PrettyTable

MAX_NB_WORDS = 30000

Using TensorFlow backend.


## Pretrained Embeddings

### Indonesian Word Embeddings

In [2]:
def load_indonesian_word_embeddings():
    print('loading indonesian word embeddings...')
    indonesian_embeddings_index = {}
    fasttext_indo = codecs.open(
        '/Users/andikawilliam/Code/clickbait_models/input/embedding/cc.id.300.vec', encoding='utf-8'
    )
    for line in tqdm(fasttext_indo):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        indonesian_embeddings_index[word] = coefs

    fasttext_indo.close()
    print('found %s word vectors' % len(indonesian_embeddings_index))
    
    return indonesian_embeddings_index

### English Word Embeddings

In [3]:
def load_english_word_embeddings():
    print('loading english word embeddings...')
    english_embeddings_index = {}
    fasttext_english = codecs.open(
        '/Users/andikawilliam/Code/clickbait_models/input/embedding/cc.en.300.vec', encoding='utf-8'
    )

    for line in tqdm(fasttext_english):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        english_embeddings_index[word] = coefs
    fasttext_english.close()
    print('found %s word vectors' % len(english_embeddings_index))
    
    return english_embeddings_index

## Input Data (Headlines)

### Input file paths

In [2]:
file_paths = {
    'main_with_symbol' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/1b_with_symbols/main_with_symbol.csv',
    'fa_with_symbol' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/1b_with_symbols/fa_with_symbol.csv',
    
    'main_no_symbol' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/2b_no_symbols/main_no_symbol.csv',
    'fa_no_symbol' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/2b_no_symbols/fa_no_symbol.csv',
    
    'main_stemmed_words' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/3b_stemmed_words/main_stemmed_words.csv',
    'fa_stemmed_words' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/3b_stemmed_words/fa_stemmed_words.csv',
    
    'chakraborty' : '/Users/andikawilliam/Code/clickbait_models/input/clickbait/chakraborty/chakraborty_data.csv',
}

### Choose Dataset Version

In [3]:
def show_dataset_versions():
    print("Dataset Versions")
    print("-"*30)
    for index, dataset in enumerate(file_paths):
        print("%s. %s" % (index+1, dataset))

def choose_dataset_version():
    chosen_index = int(input("\nWhich dataset version would you like to use? "))
    chosen_dataset = list(file_paths.items())[chosen_index-1]
    print("Chosen Dataset : %s" % chosen_dataset[0])
    
    return chosen_dataset

### Load Chosen Dataset

In [4]:
def load_chosen_dataset_input(input_file_path):
    df = pd.read_csv(input_file_path, sep=',', header=0)

    label_names=['label_score']
    headline = df['title']
    label = df[label_names].values

    X_train, X_test, y_train, y_test = train_test_split(
            headline, label, stratify=label, test_size=0.2)

    print("total: ", df['title'].shape[0])
    print("train: ", X_train.shape[0])
    print("test: ", X_test.shape[0])
    
    return X_train, X_test, y_train, y_test

## Pre-process Dataset

In [5]:
def preprocess_dataset(X_train, X_test):
    raw_docs_train = X_train
    raw_docs_test = X_test
    tokenizer = RegexpTokenizer(r'\w+')
    max_seq_len = 20
    num_classes = 1

    print("pre-processing train data...")
    processed_docs_train = []
    for doc in tqdm(raw_docs_train):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens]
        processed_docs_train.append(" ".join(filtered))

    processed_docs_test = []
    for doc in tqdm(raw_docs_test):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens]
        processed_docs_test.append(" ".join(filtered))
        
    return processed_docs_train, processed_docs_test


### Create Word Index Dictionary

In [6]:
def create_word_index_dict(all_processed_data):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(all_processed_data)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))
    
    return tokenizer

In [7]:
print(word_index)

NameError: name 'word_index' is not defined

In [8]:
# print(len(word_index))
# print(word_index)

# import json
# print(json.dumps(word_index, indent=4))

### Tokenizing Input

In [11]:
def tokenize_input(train_data_processed, test_data_processed, tokenizer):
    print("Tokenizing Input Data")
    print("-"*30)
    print("Fit all existing words (assigning index)...")

    print("\nTransform input data into a sequence of integers(index)..")
    train_data_sequenced = tokenizer.texts_to_sequences(train_data_processed)
    test_data_sequenced = tokenizer.texts_to_sequences(test_data_processed)
    
    return train_data_sequenced, test_data_sequenced
    

### Pad Sequences

In [12]:
def pad_sequences(train_data_sequenced, test_data_sequenced, max_seq_len):
    print("padding sequences to have same length...")
    train_data_padded = sequence.pad_sequences(train_data_sequenced, maxlen=max_seq_len)
    test_data_padded = sequence.pad_sequences(test_data_sequenced, maxlen=max_seq_len)
    
    return train_data_padded, test_data_padded

### Plot Embedding Matrix

In [13]:
def plot_embedding_matrix(word_index, embeddings_index):
    embed_dim = 300
    words_not_found = []
    
    print('preparing embedding matrix...')

    nb_words = min(MAX_NB_WORDS, len(word_index)+1)
    
    embedding_matrix = np.zeros((nb_words, embed_dim))

    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    print("sample words not found: ", np.random.choice(words_not_found, 10))
    
    return embedding_matrix, nb_words

In [66]:
print(embedding_matrix[17917])

[ 0.0111 -0.0403 -0.0555 -0.0055  0.0195 -0.0477  0.0017  0.0145  0.0015
 -0.0478 -0.0162  0.0196  0.0094  0.01   -0.0049  0.0136  0.0176 -0.0066
  0.0068  0.0307 -0.036  -0.0094  0.0117  0.0329  0.0031 -0.0088  0.0141
  0.024   0.0313 -0.027   0.0403 -0.0038 -0.0118  0.0308  0.0038  0.0213
 -0.0248 -0.0032  0.0363  0.0082  0.0259 -0.0162 -0.033  -0.007   0.015
 -0.013   0.0025 -0.006   0.006   0.0248 -0.0064 -0.0144 -0.0364 -0.0409
 -0.0054  0.0024  0.0478  0.005  -0.0333  0.0125  0.0215 -0.0305 -0.0196
  0.043  -0.025  -0.0079  0.0243 -0.0153  0.0043 -0.0206  0.06   -0.0125
 -0.008   0.0033  0.0064 -0.0153 -0.0063 -0.0403 -0.0238 -0.0223  0.0211
 -0.0034  0.0065  0.0153 -0.0118 -0.0055 -0.09   -0.0036  0.0149 -0.0277
  0.0092 -0.0087  0.0022  0.0248  0.0387 -0.0339 -0.0305 -0.0062  0.0036
  0.0241  0.001  -0.0085 -0.016  -0.004   0.0251 -0.015  -0.0162 -0.0213
 -0.0103  0.0182  0.0054  0.0263  0.0043  0.0114 -0.018  -0.006   0.034
  0.0096  0.0132 -0.0087 -0.0104 -0.0218  0.0151 -0.0

In [65]:
print(len(embedding_matrix))
print(embedding_matrix[0])
print(nb_words)

print(word_index.items())

17918
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
17918


### Define Evaluation Metrics

In [16]:
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Model Training

### BiLSTM Model

In [17]:
def get_model_bilstm(metrics_input):
    model_bilstm = Sequential()
    model_bilstm.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
#     model_bilstm.add(Embedding(max_features, 128))
    model_bilstm.add(Bidirectional(LSTM(128)))
    model_bilstm.add(Dropout(0.5))
    model_bilstm.add(Dense(1, activation='sigmoid'))

    model_bilstm.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics_input)
    return model_bilstm

### CNN Model

In [18]:
def get_model_cnn(metrics_input):
    model_cnn = Sequential()
    model_cnn.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
    
    model_cnn.add(Conv1D(num_filters, 7, activation='relu'))
    model_cnn.add(MaxPooling1D(2))
    model_cnn.add(Conv1D(num_filters, 7, activation='relu'))
    model_cnn.add(GlobalMaxPooling1D())
    model_cnn.add(Dropout(0.5))
    
    model_cnn.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model_cnn.add(Dense(1, activation='sigmoid'))

    model_cnn.compile(loss='binary_crossentropy', optimizer=adam, metrics=metrics_input)
    
    return model_cnn

### Choose and Fit model

In [19]:
metrics = ['accuracy', f1_m, precision_m, recall_m]

def choose_model_architecture():    
    print("Models: \n 1. CNN \n 2. Bi-LSTM \n")
    chosen_model = input("chosen model: ")
    if chosen_model == '1':
        model = get_model_cnn(metrics)
    elif chosen_model == '2':
        model = get_model_bilstm(metrics)
    model.summary()
    
    return model

### Model Test

In [20]:
def print_evaluation_results(loss, accuracy, precision, recall, f1_score):
    table = PrettyTable()
    table.add_column('Loss', [loss])
    table.add_column('Accuracy', [accuracy])
    table.add_column('Precision', [precision])
    table.add_column('Recall', [recall])
    table.add_column('F1 Score', [f1_score])

    print(table)

In [16]:
# plt.figure()
# epoch_count = range(1, len(model_train.history['loss']) + 1)
# plt.plot(epoch_count,model_train.history['loss'], lw=2.0, color='b', label='loss')
# plt.title('Model Sentiment')
# plt.xlabel('Epochs')
# plt.ylabel('Cross-Entropy Loss')
# plt.legend(loc='upper right')
# plt.show()

# plt.figure()
# epoch_count = range(1, len(model_train.history['accuracy']) + 1)
# plt.plot(epoch_count,model_train.history['accuracy'], lw=2.0, color='r', label='accuracy')
# plt.title('Moel Sentiment')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(loc='upper right')
# plt.show()

# Run Program

### Setup Load embeddings ( one time )

In [21]:
# english_embedding_index = load_english_word_embeddings()
indonesian_embedding_index = load_indonesian_word_embeddings()

991it [00:00, 9907.13it/s]

loading indonesian word embeddings...


2000001it [03:12, 10385.08it/s]

found 2000000 word vectors





In [64]:
print(indonesian_embedding_index.get("jinhyuk"))

None


### Procedure

In [22]:
max_seq_len = 20

In [36]:
show_dataset_versions()
chosen_dataset = choose_dataset_version()

X_train, X_test, y_train, y_test = load_chosen_dataset_input(chosen_dataset[1])

if chosen_dataset[0] == 'chakraborty':
    embedding_index = english_embedding_index
else:
    embedding_index = indonesian_embedding_index

train_data_processed, test_data_processed = preprocess_dataset(X_train, X_test)

tokenizer = create_word_index_dict(train_data_processed+test_data_processed)

word_index = tokenizer.word_index

embedding_matrix, nb_words = plot_embedding_matrix(word_index, embedding_index)

train_data_sequenced, test_data_sequenced = tokenize_input(train_data_processed, test_data_processed, tokenizer)

train_data, test_data = pad_sequences(train_data_sequenced, test_data_sequenced, max_seq_len)

Dataset Versions
------------------------------
1. main_with_symbol
2. fa_with_symbol
3. main_no_symbol
4. fa_no_symbol
5. main_stemmed_words
6. fa_stemmed_words
7. chakraborty

Which dataset version would you like to use? 1


100%|██████████| 12000/12000 [00:00<00:00, 182652.23it/s]
100%|██████████| 3000/3000 [00:00<00:00, 173758.73it/s]

Chosen Dataset : main_with_symbol
total:  15000
train:  12000
test:  3000
pre-processing train data...





dictionary size:  17917
preparing embedding matrix...
number of null word embeddings: 1739
sample words not found:  ['brigjend' 'jinhyuk' 'victon' 'ylki' 'ipw' 'bouttier' 'racunalo'
 'lacazette' 'mediokeritas' 'konecki']
Tokenizing Input Data
------------------------------
Fit all existing words (assigning index)...

Transform input data into a sequence of integers(index)..
padding sequences to have same length...


In [74]:
print(train_data_processed[9])
print(train_data_sequenced[9])

tahun depan ada kebijakan cukai baru apa saja isinya
[31, 112, 28, 1147, 206, 25, 117, 372, 3422]


In [59]:
print(train_data_sequenced[9])

[31, 112, 28, 1147, 206, 25, 117, 372, 3422]


In [75]:
print(train_data_sequenced[9])
print(train_data[9])

[31, 112, 28, 1147, 206, 25, 117, 372, 3422]
[   0    0    0    0    0    0    0    0    0    0    0   31  112   28
 1147  206   25  117  372 3422]


In [73]:
test = ["aku sedang ingin mansnothot"]
y_test = "0" 
after_test = tokenizer.texts_to_sequences(test)

print(after_test)

[[1308, 1108, 66]]


### Parameters

In [76]:
batch_size = 256
num_epochs = 7

num_filters = 64
max_features = nb_words +1
embed_dim = 300
weight_decay = 1e-4

adam = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
adadelta = optimizers.Adadelta(learning_rate=1.0, rho=0.95)

In [77]:
print(train_data_processed[0])
print(train_data[0])
print(y_train[0])

print(train_data_processed[3])
print(train_data[3])

jadi menteri susi pudjiastuti sebut kembalikan duit negara 94 t
[   0    0    0    0    0    0    0    0    0    0    7   95  399 2012
   46 1586 2320  160 4993 1587]
[0]
baju sulli tersingkap hingga ekspos bagian tubuh karena tak pakai bra saat siaran live
[    0     0     0     0     0     0   622  7405 10226    23  7406  2778
  1006    63    10    93  2527    17   917   219]


In [1]:
model = choose_model_architecture()
model_train = model.fit(train_data, y_train, 
                        batch_size=batch_size, epochs=num_epochs, verbose=2)

NameError: name 'choose_model_architecture' is not defined

In [None]:
loss, accuracy, f1_score, precision, recall = model.evaluate(test_data, y_test, verbose=1)

In [None]:
print_evaluation_results(loss, accuracy, f1_score, precision, recall)

In [None]:
model.save('/Users/andikawilliam/Documents/Kuliah/Skripsi/final_models/model_cnn.h5')
del model

In [None]:
from keras.models import load_model
model = load_model('/Users/andikawilliam/Documents/Kuliah/Skripsi/final_models/model_cnn.h5',
                   custom_objects={'f1_m':f1_m, 'precision_m':precision_m, 'recall_m':recall_m, 'f1_m':f1_m})
loss, accuracy, f1_score, precision, recall = model.evaluate(test_data, y_test, verbose=1)

In [None]:
print_evaluation_results(loss, accuracy, f1_score, precision, recall)