### Tensorflow CNN with my own pre-trained vectors (by CBOW)
- CBOW embeddings come from Tensorflow generated embeddings
- Tensorflow
- Modeled from: https://github.com/huseinzol05/Text-Classification-Comparison/blob/master/Deep-learning/cnn-vector.ipynb

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, metrics, decomposition, pipeline, dummy
from nltk.tokenize import TweetTokenizer
import os

import tensorflow as tf
os.environ['CUDA_VISIBLE_DEVICES'] = ''


import helpers.preprocessing as prep
import helpers.regprep as regprep
# to perform evaluations (new one - mines)
import helpers.evaluate as ev
evaluator = ev.Evaluate()
import matplotlib.pyplot as plt
%matplotlib inline
import helpers.pickle_helpers as ph
import gensim
from gensim.models import KeyedVectors
import time
import math
from sklearn.cross_validation import train_test_split
import re
import helpers.husein as H



### Data Preparation

In [3]:
# Load the merged emotion training set through the project's pickle helper.
# Later cells access `.text` and `.emotions` on the result and call `.sample`,
# so it is used as a pandas-DataFrame-like object.
# NOTE(review): if this helper wraps pickle.load, only point it at trusted files.
train_data = ph.load_from_pickle(directory="data/husein_emotion/emotion-english/merged_training.pkl")

In [4]:
# Run every raw text entry through the project's string cleaner and store the
# cleaned value back in the same column.
train_data.text = train_data.text.apply(H.clearstring)

In [5]:
# Split the cleaned data into three parts, returned in the order
# train / test / validation.
# NOTE(review): the meaning of the second argument (20) is defined inside
# prep.split_original, which is not visible here - presumably a hold-out
# percentage; confirm against the helper.
train_data, test_data, val_data = prep.split_original(train_data, 20)

In [6]:
### Sampling
# Down-sample each split to 10,000 rows. The fixed random_state makes the
# selection repeatable between runs; .copy() detaches the sample from its
# source frame so later column assignments do not touch the original.
train_data, test_data, val_data = (
    split.sample(n=10000, random_state=10).copy()
    for split in (train_data, test_data, val_data)
)

### Pre-trained word embeddings

In [7]:
### load word embeddings and accompanying vocabulary
# `wv` is the embedding table and `vocab` the token lookup. Elsewhere in this
# notebook a token's vector is read as wv[vocab[token]], i.e. `vocab` maps a
# token to the key/row used to index `wv`, and `token in vocab` decides whether
# a token is kept.
wv = ph.load_from_pickle("data/husein_emotion/tf_embeddings/tf_cbow_embeddings.p")
vocab = ph.load_from_pickle("data/husein_emotion/tf_embeddings/tf_cbow_dictionary.p")

### Tokenization and Label Binarization

In [8]:
def remove_unknown_words(tokens):
    """Return the tokens that have an entry in the embedding vocabulary.

    Order and duplicates are preserved. Membership is tested with ``in``
    against the module-level ``vocab`` at call time.
    """
    return list(filter(lambda token: token in vocab, tokens))

def check_size(c, size):
    """Return True when ``c`` holds strictly more than ``size`` items.

    A collection whose length is equal to or below ``size`` yields False.
    """
    return not (len(c) <= size)
    
### tokens and tokensize
# Each split receives the same two derived columns:
#   tokens    - the whitespace-split text with out-of-vocabulary words removed
#   tokensize - the number of tokens that survived
for frame in (train_data, test_data, val_data):
    frame["tokens"] = frame.text.apply(lambda t: remove_unknown_words(t.split()))
    frame["tokensize"] = frame.tokens.apply(len)


### filter by tokensize
# Keep only rows whose token list has more than 7 entries (check_size is a
# strict "longer than" test); .copy() gives each split its own frame again.
train_data = train_data[train_data["tokens"].apply(lambda d: check_size(d, 7)).astype(bool)].copy()
test_data = test_data[test_data["tokens"].apply(lambda d: check_size(d, 7)).astype(bool)].copy()
val_data = val_data[val_data["tokens"].apply(lambda d: check_size(d, 7)).astype(bool)].copy()

### sorting by tokensize (not needed for CNN because bucketing is not needed)
#train_data.sort_values(by="tokensize", ascending=True, inplace=True)
#test_data.sort_values(by="tokensize", ascending=True, inplace=True)
#val_data.sort_values(by="tokensize", ascending=True, inplace=True)

### resetting index
# After filtering, renumber every split 0..len-1 in place and discard the old
# index. The batching code below digitizes the index values, so it relies on
# this contiguous numbering.
for frame in (train_data, test_data, val_data):
    frame.reset_index(drop=True, inplace=True)

### Binarization
# The label inventory is defined by the (filtered) training split only.
emotions = list(set(train_data.emotions.unique()))
num_emotions = len(emotions)
_known_emotions = set(emotions)

# binarizer
mlb = preprocessing.MultiLabelBinarizer()

# Each row of frame[['emotions']].values is a one-element array holding that
# row's label; intersecting with the training inventory drops labels the model
# has no output unit for (such rows become all-zero vectors).
train_data_labels = [set(emos) & _known_emotions for emos in train_data[['emotions']].values]
test_data_labels = [set(emos) & _known_emotions for emos in test_data[['emotions']].values]
val_data_labels = [set(emos) & _known_emotions for emos in val_data[['emotions']].values]

# Fit the label -> column mapping ONCE, on the training labels, and reuse it
# for the other splits. Re-fitting on test/validation (fit_transform) would
# produce a narrower or shifted matrix whenever a class is absent from that
# split, which no longer matches the model's num_emotions outputs.
y_bin_emotions = mlb.fit_transform(train_data_labels)
test_y_bin_emotions = mlb.transform(test_data_labels)
val_y_bin_emotions = mlb.transform(val_data_labels)

# Store the one-hot rows as plain lists so they can be pulled out per batch.
train_data['bin_emotions'] = y_bin_emotions.tolist()
test_data['bin_emotions'] = test_y_bin_emotions.tolist()
val_data['bin_emotions'] = val_y_bin_emotions.tolist()

### Batching by Bucketing approach

In [9]:
### TODO: put this into data helpers
### renders embeddings with paddings; zeros where missing tokens
def generate_embeds_with_pads(tokens, max_size):
    """Turn a token list into exactly ``max_size`` embedding rows.

    The first ``max_size`` tokens are looked up as ``wv[vocab[token]]``;
    tokens beyond that are ignored. If there are fewer tokens than
    ``max_size``, the remainder is filled with zero vectors of length
    ``EMBEDDING_DIM``. Every row is returned as a plain Python list.
    """
    # zip() against range(max_size) stops at whichever runs out first, which
    # gives the truncation for free.
    rows = [list(wv[vocab[token]]) for _, token in zip(range(max_size), tokens)]

    # Right-pad with fresh zero rows up to the requested length.
    missing = max_size - len(rows)
    rows.extend(list(np.zeros(EMBEDDING_DIM)) for _ in range(missing))
    return rows

### generate the actual batches
def generate_batches(data, batch_size):
    """Partition ``data`` into consecutive groups of about ``batch_size`` rows.

    The index values of ``data`` are digitized into
    ``ceil(len(data) / batch_size)`` equal-width bins over ``[0, len(data))``,
    so the frame is expected to carry a 0..len-1 index (e.g. after
    ``reset_index(drop=True)``). When the length is not a multiple of
    ``batch_size`` the groups are near-equal in size rather than exactly
    ``batch_size``.

    Args:
        data: DataFrame with a ``tokensize`` column.
        batch_size: target number of rows per group.

    Returns:
        A pair ``(indices, maxes)``: ``indices`` maps each group label
        (1-based) to the positional row indices of that group, and ``maxes``
        is a Series giving the largest ``tokensize`` per group label.
    """
    actual_batches = math.ceil(len(data) / batch_size)
    # actual_batches + 1 edges -> actual_batches bins covering [0, len(data)).
    bins = np.linspace(0, len(data), actual_batches + 1)
    groups = data.groupby(np.digitize(data.index, bins))

    # Aggregate only the column that is needed; taking the max of every column
    # would also try to order text and list-valued columns.
    groups_maxes = groups["tokensize"].max()

    return groups.indices, groups_maxes

### CNN Model

In [10]:
# Fixed number of token positions per example: passed to the CNN as
# sequence_length and used as the pad/truncate length when batches are built.
MAX_LEN = 20 # needed to ensure equal size of input
# Working directory at the time this cell runs (not referenced by the code
# visible in this section).
location = os.getcwd()
# Length of one embedding vector; also the length of the zero padding rows and
# the CNN's dimension_input.
EMBEDDING_DIM = 128
# Not referenced by the CNN defined below.
NUM_LAYERS = 3
# Passed to the CNN as out_dimension, i.e. the number of filters per filter size.
size_layer = 256
# Step size handed to the Adam optimizer.
learning_rate = 0.0001
# Target number of rows per batch for generate_batches.
BATCH_SIZE = 100
# Heights (in tokens) of the convolution filters; one conv branch per entry.
filter_sizes = [2,3,4,5]

In [22]:
class CNN:
    """Text classifier with parallel convolution branches (TF1 graph mode).

    The input is a batch of already-embedded sequences shaped
    ``[N, sequence_length, dimension_input, 1]``. For every entry ``k`` of
    ``filter_sizes`` a convolution with filters of shape
    ``[k, dimension_input]`` slides over the token axis, is passed through
    ReLU and max-pooled over all positions. The pooled features of all
    branches are concatenated, passed through dropout and projected to
    ``dimension_output`` logits.

    Attributes:
        X: float32 input placeholder, ``[N, sequence_length, dimension_input, 1]``.
        Y: float32 target placeholder, ``[N, dimension_output]``.
        keep_prob: scalar dropout keep probability. It defaults to
            ``dropout_keep_prob`` and may be overridden per ``sess.run`` via
            ``feed_dict`` (e.g. ``{model.keep_prob: 1.0}`` to disable dropout
            when evaluating).
        logits: unnormalised class scores, ``[N, dimension_output]``.
        cost: mean softmax cross-entropy plus L2 penalty.
        optimizer: Adam training op minimising ``cost``.
        correct_pred / accuracy: arg-max agreement between logits and targets.
    """

    def __init__(self, sequence_length, dimension_input, dimension_output,
                 learning_rate, filter_sizes, out_dimension, dropout_keep_prob=0.1):
        """Build the graph.

        Args:
            sequence_length: number of token positions per example.
            dimension_input: embedding width.
            dimension_output: number of classes.
            learning_rate: Adam learning rate.
            filter_sizes: iterable of filter heights (in tokens).
            out_dimension: number of filters per filter size.
            dropout_keep_prob: default probability of KEEPING a unit in the
                dropout layer. The default of 0.1 reproduces the previously
                hard-coded value, which drops 90% of the pooled features.
        """
        self.X = tf.placeholder(tf.float32, shape=[None, sequence_length, dimension_input, 1])
        self.Y = tf.placeholder(tf.float32, shape=[None, dimension_output])

        pooled_outputs = []

        for i in filter_sizes:
            w = tf.Variable(tf.truncated_normal([i, dimension_input, 1, out_dimension], stddev=0.1))
            b = tf.Variable(tf.truncated_normal([out_dimension], stddev=0.01))
            # VALID padding with a full-width filter of height i:
            # [N, sequence_length - i + 1, 1, out_dim]
            conv = tf.nn.relu(tf.nn.conv2d(self.X, w, strides=[1, 1, 1, 1], padding="VALID") + b)
            # Pool over every remaining position: [N, 1, 1, out_dim]
            pooled = tf.nn.max_pool(conv, ksize=[1, sequence_length - i + 1, 1, 1],
                                    strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

        h_pool = tf.concat(pooled_outputs, 3)  # [N, 1, 1, out_dim*len(filter_sizes)]
        h_pool = tf.reshape(h_pool, [-1, out_dimension * len(filter_sizes)])  # [N, out_dim*len(filter_sizes)]

        # Dropout is part of the graph for every run. Exposing the keep
        # probability as a placeholder-with-default keeps the old behaviour
        # when nothing is fed, while letting callers feed 1.0 for evaluation.
        self.keep_prob = tf.placeholder_with_default(
            tf.constant(dropout_keep_prob, dtype=tf.float32), shape=[])
        h_pool_flat = tf.nn.dropout(h_pool, self.keep_prob)

        w = tf.Variable(tf.truncated_normal([out_dimension * len(filter_sizes), dimension_output], stddev=0.1))
        b = tf.Variable(tf.truncated_normal([dimension_output], stddev=0.01))
        self.logits = tf.matmul(h_pool_flat, w) + b  # fully connected output layer

        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.Y))
        # L2 penalty over every trainable variable currently in the default
        # graph (weights and biases alike).
        l2 = sum(0.0005 * tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        self.cost += l2

        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        self.correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

### Training

In [23]:
### define model
# Start from an empty graph so re-running this cell does not stack a second
# copy of the model on top of the previous one.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = CNN(MAX_LEN, EMBEDDING_DIM, num_emotions, learning_rate, filter_sizes, size_layer)
sess.run(tf.global_variables_initializer())
dimension = EMBEDDING_DIM
saver = tf.train.Saver(tf.global_variables())

# Early-stopping state:
#   EARLY_STOPPING     - epochs without validation improvement before stopping
#   CURRENT_CHECKPOINT - epochs elapsed since the last improvement
#   CURRENT_ACC        - best validation accuracy seen so far
#   EPOCH              - number of completed epochs
EARLY_STOPPING = 10
CURRENT_CHECKPOINT = 0
CURRENT_ACC = 0
EPOCH = 0

### defining batch generation
train_groups_indices, train_groups_maxes = generate_batches(train_data, BATCH_SIZE)
test_groups_indices, test_groups_maxes = generate_batches(test_data, BATCH_SIZE)
val_groups_indices, val_groups_maxes = generate_batches(val_data, BATCH_SIZE)

# Take the batch counts from the groups that were actually generated.
# generate_batches rounds the number of groups UP, so `len(data) // BATCH_SIZE`
# undercounts by one whenever the size is not a multiple of BATCH_SIZE and the
# last group would never be visited. Group labels run 1..n, matching the
# `[b+1]` lookups in the training loop.
n_train = len(train_groups_indices)
n_test = len(test_groups_indices)
n_val = len(val_groups_indices)

In [13]:
### training
def _make_batch(frame, positions):
    """Build one (inputs, labels) pair from the rows at ``positions``.

    ``positions`` are positional row numbers (as produced by
    ``groupby(...).indices``), so both inputs and labels are selected with
    ``.iloc``. Inputs are padded/truncated to MAX_LEN embedding rows and get a
    trailing channel axis: [batch, MAX_LEN, EMBEDDING_DIM, 1].
    """
    rows = frame.iloc[positions]
    inputs = rows.tokens.apply(lambda d: generate_embeds_with_pads(d, MAX_LEN)).values.tolist()
    labels = rows.bin_emotions.values.tolist()
    return np.expand_dims(inputs, axis=-1), labels


if n_train == 0 or n_val == 0:
    # The per-epoch averages below divide by these counts.
    raise ValueError("need at least one training batch and one validation batch")

checkpoint_path = os.getcwd() + "/model/husein_emonet/model-cnn-vector.ckpt"
# Make sure the checkpoint directory exists before the first save.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

while True:
    lasttime = time.time()
    ### early stopping to avoid overfitting
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:', EPOCH)
        break
    train_acc, train_loss, val_acc, val_loss = 0, 0, 0, 0

    for b in range(n_train):
        batch_x, batch_y = _make_batch(train_data, train_groups_indices[b+1])

        # Fetch loss and accuracy in the same run as the update step instead
        # of paying for a second forward pass per batch.
        loss, acc, _ = sess.run([model.cost, model.accuracy, model.optimizer],
                                feed_dict = {model.X : batch_x, model.Y : batch_y})
        train_loss += loss
        train_acc += acc

    # NOTE(review): the model applies dropout unconditionally unless a keep
    # probability is fed, so these validation metrics are measured with
    # dropout active.
    for b in range(n_val):
        batch_x, batch_y = _make_batch(val_data, val_groups_indices[b+1])

        loss, acc = sess.run([model.cost, model.accuracy], feed_dict = {model.X : batch_x, model.Y : batch_y})
        val_loss += loss
        val_acc += acc

    # Per-epoch means over batches.
    train_loss /= n_train
    train_acc /= n_train
    val_loss /= n_val
    val_acc /= n_val

    if val_acc > CURRENT_ACC:
        # New best validation accuracy: checkpoint and reset the patience counter.
        print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', val_acc)
        CURRENT_ACC = val_acc
        CURRENT_CHECKPOINT = 0
        saver.save(sess, checkpoint_path)
    else:
        CURRENT_CHECKPOINT += 1
    EPOCH += 1
    print('time taken:', time.time()-lasttime)
    print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', val_loss, ', valid acc:', val_acc)

epoch: 0 , pass acc: 0 , current acc: 0.38328013924712484
time taken: 111.49427223205566
epoch: 1 , training loss: 2.7404070660606035 , training acc: 0.31944340576511715 , valid loss: 2.3440612626798227 , valid acc: 0.38328013924712484
epoch: 1 , pass acc: 0.38328013924712484 , current acc: 0.47112449089234526
time taken: 99.59676480293274
epoch: 2 , training loss: 2.1953228446105713 , training acc: 0.43311172963769895 , valid loss: 2.057679238192963 , valid acc: 0.47112449089234526
epoch: 2 , pass acc: 0.47112449089234526 , current acc: 0.5456565468374527
time taken: 99.25273585319519
epoch: 3 , training loss: 1.9006530315262737 , training acc: 0.5105206305553766 , valid loss: 1.759758419159687 , valid acc: 0.5456565468374527
epoch: 3 , pass acc: 0.5456565468374527 , current acc: 0.6105387018937053
time taken: 98.94453477859497
epoch: 4 , training loss: 1.6265683773850332 , training acc: 0.5837434305223115 , valid loss: 1.5164523237582408 , valid acc: 0.6105387018937053
epoch: 4 , pas

KeyboardInterrupt: 