In [2]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import glob
from tqdm import tqdm
%matplotlib inline

In [3]:
class classifer():
    """Fully connected softmax classifier built with the TensorFlow 1.x graph API.

    The network maps a batch of fixed-width input vectors through four
    512-unit ReLU layers to `n_classes` logits. Constructing an instance
    builds the graph in the current default graph, opens a session,
    initialises all variables and creates a `tf.train.Saver`.

    Layer names ('Dense_1' .. 'Dense_4', 'logits') and the order in which the
    graph is built determine the variable names stored in checkpoints, so they
    must not change if previously saved checkpoints are to remain loadable.
    """

    def __init__(self, lr=5 * 1e-4, n_classes=13, input_dim=4096):
        """Build the graph and start a session.

        Args:
            lr: learning rate handed to the Adam optimizer.
            n_classes: number of output classes (width of the logits layer
                and of the one-hot label placeholder).
            input_dim: width of each input vector.
        """
        self.lr = lr
        self.n_classes = n_classes
        self.input_dim = input_dim

        self.make_placeholders()
        self.make_nn()
        self.make_loss()
        self.make_train_op()

        self.sess = tf.Session()
        self.sess.run(tf.initializers.global_variables())

        # Created last so that it also tracks the optimizer's variables.
        self.saver = tf.train.Saver()

    def save(self, name='nn-classifier-v2'):
        """Write the session's variables to the checkpoint prefix `name`.

        The default keeps the file name the notebook has always used, so
        existing `save()` calls behave exactly as before.
        """
        self.saver.save(self.sess, name)

    def load(self, name):
        """Restore the session's variables from the checkpoint prefix `name`."""
        self.saver.restore(self.sess, name)

    def make_placeholders(self):
        """Create the input and one-hot label placeholders."""
        self.input = tf.placeholder(tf.float32, shape=[None, self.input_dim], name='X')
        self.label = tf.placeholder(tf.int32, shape=[None, self.n_classes], name='label')

    def make_nn(self):
        """Create the hidden layers, the logits layer and the softmax output."""
        hidden = self.input
        for layer_idx in range(1, 5):
            # Names resolve to 'Dense_1' .. 'Dense_4', as in saved checkpoints.
            hidden = tf.layers.dense(hidden, 512, activation=tf.nn.relu,
                                     kernel_initializer=tf.keras.initializers.glorot_normal(),
                                     name='Dense_%d' % layer_idx)
        self.logit = tf.layers.dense(hidden, self.n_classes, activation=None,
                                     kernel_initializer=tf.keras.initializers.glorot_normal(),
                                     name='logits')

        self.prediction = tf.nn.softmax(self.logit)

    def make_loss(self):
        """Define the mean softmax cross-entropy between labels and logits."""
        # Keyword arguments: some TF 1.x releases reject positional use of this
        # op, and keywords rule out swapping labels and logits by accident.
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.label, logits=self.logit))

    def make_train_op(self):
        """Create the Adam optimizer and the op that minimises the loss."""
        self.optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = self.optimizer.minimize(self.loss)

    def train(self, X, Y):
        """Run one optimisation step on the batch (X, Y) and return its loss.

        Args:
            X: batch fed to the input placeholder.
            Y: batch fed to the label placeholder (one row per example,
                `n_classes` columns).
        """
        loss, _ = self.sess.run([self.loss, self.train_op],
                                feed_dict={self.input: X, self.label: Y})
        return loss

    def predict(self, X):
        """Return the softmax output for the batch X.

        The result is a one-element list wrapping the probability array
        (the fetch is passed to `sess.run` as a list), so callers index
        it with `[0]`.
        """
        prediction = self.sess.run([self.prediction], feed_dict={self.input: X})
        return prediction

In [4]:
# Clear the default graph first so that re-running this cell does not try to
# create the classifier's named layers a second time in the same graph, then
# build the network and open its session.
tf.reset_default_graph()
NN = classifer()

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [5]:
# Restore the trained weights from the checkpoint prefix that classifer.save() writes.
NN.load('nn-classifier-v2')

INFO:tensorflow:Restoring parameters from nn-classifier-v2


In [6]:
# import stuff
%load_ext autoreload
%autoreload 2

from random import randint

import numpy as np
import torch

# Load model
from InferSent.models import InferSent
model_version = 1
MODEL_PATH = "InferSent/encoder/infersent%s.pkl" % model_version
# NOTE(review): with enc_lstm_dim=2048 the encoder presumably emits 4096-d
# sentence vectors, matching the classifier's input width -- confirm.
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
# Always deserialise the weights onto the CPU so the checkpoint loads on
# machines without a GPU regardless of the device it was saved from; the
# model is moved to the GPU below only when requested.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

# Keep it on CPU or put it on GPU
use_cuda = False
if use_cuda:
    model = model.cuda()

# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = 'InferSent/GloVe/glove.840B.300d.txt' if model_version == 1 else 'InferSent/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [7]:
import pickle
# Topic names in classifier-output order: position i names class i, so the
# array can be indexed directly with argsort results over the predictions.
Ticks = np.array([
    'Other',
    'Politics',
    'Media',
    'Fashion',
    'Foreign Policy',
    'Immigration',
    'Economy',
    'Health',
    'Art',
    'Gender',
    'Sport',
    'Violence',
    'Climate',
])

In [8]:
import nltk.data
# English Punkt sentence tokenizer loaded from NLTK's data directory; the
# processing cell calls tokenizer.tokenize() to split each text into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
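# One-time setup: uncomment and run to create (or reset) the empty visit list
# for the CNN channel that the processing cell below reads.
#pickle.dump([], open('CNN_visit.p', 'wb'))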

In [23]:
channel = 'CNN'

# A sentence is classified only if its whitespace-separated word count lies
# strictly between these two bounds.
MIN_WORDS = 30
MAX_WORDS = 50
# Number of highest-scoring topics stored per key.
TOP_K = 2


def _select_sentences(records, sentence_tokenizer, min_words=MIN_WORDS, max_words=MAX_WORDS):
    """Pick the sentences to classify from one loaded file.

    Args:
        records: mapping key -> list whose first element is a dict with a
            'text' entry.
        sentence_tokenizer: object with a `tokenize(text)` method returning a
            list of sentence strings.
        min_words, max_words: exclusive bounds on a sentence's word count.

    Returns:
        (sentences, keys): two parallel lists; keys[i] is the record key that
        sentences[i] came from. Sentences of one key are contiguous.
    """
    sentences, keys = [], []
    prev_prefix = ""
    for key, entries in records.items():
        text = entries[0]['text']  # only the first entry of each record is used
        if len(text) < 10:
            continue

        # Drop consecutive duplicates: a record whose first 10 characters equal
        # those of the previously accepted record is skipped.
        prefix = text[:10]
        if prefix == prev_prefix:
            continue
        prev_prefix = prefix

        parts = sentence_tokenizer.tokenize(text)
        if len(parts) <= 2:
            continue

        # Drop the first sentence, then keep only mid-length sentences.
        kept = [s for s in parts[1:] if min_words < len(s.split()) < max_words]
        sentences.extend(kept)
        keys.extend([key] * len(kept))
    return sentences, keys


def _top_topics_by_key(keys, predictions, labels, top_k=TOP_K):
    """Sum per-sentence probabilities per key and rank the topics.

    Args:
        keys: record key of each prediction row.
        predictions: 2-D array, one row of class probabilities per sentence.
        labels: numpy array of class names indexed by class position.
        top_k: how many topics to keep per key.

    Returns:
        dict key -> (topics, probs) in first-seen key order, where `topics`
        lists the `top_k` labels with the largest summed probability
        (descending) and `probs` the matching sums multiplied by 100.
        These are sums over the key's sentences, not means, so a value can
        exceed 100 when a key contributes several sentences.
    """
    totals = {}
    for key, row in zip(keys, predictions):
        if key in totals:
            totals[key] = totals[key] + row
        else:
            totals[key] = np.array(row, dtype=float)

    ranked = {}
    for key, total in totals.items():
        order = np.argsort(total)[::-1][:top_k]
        ranked[key] = (list(labels[order]), list(total[order] * 100))
    return ranked


visit_path = '%s_visit.p' % (channel)
all_files = glob.glob('../files/CableNews/%s/*.p' % channel)

# NOTE: pickle.load can execute arbitrary code; only point this cell at
# files produced by this project.
try:
    with open(visit_path, 'rb') as fh:
        read_files = pickle.load(fh)
except FileNotFoundError:
    # First run for this channel: nothing has been processed yet.
    read_files = []

seen_files = set(read_files)  # constant-time membership test for the skip check
# Output files are numbered by position in the visit list, so output
# '<counter>.p' corresponds to read_files[counter].
counter = len(read_files)

for file in tqdm(all_files):
    if file in seen_files:
        continue

    with open(file, 'rb') as fh:
        res = pickle.load(fh)

    all_text, all_keys = _select_sentences(res, tokenizer)

    results = {}
    if all_text:  # do not call the encoder with nothing to encode
        all_embed = model.encode(all_text, bsize=128, tokenize=True, verbose=False)
        all_predictions = NN.predict(all_embed)[0]
        # Each key receives the ranking computed from its OWN sentences,
        # including the last key of the file.
        ranked = _top_topics_by_key(all_keys, all_predictions, Ticks, TOP_K)
        for current_key, (Topics, Probs) in ranked.items():
            meta_data = res[current_key][0]
            results[current_key] = {'Topics': Topics, 'Probs': Probs,
                                    'gender': meta_data['gender'],
                                    'persons': meta_data['persons'],
                                    'locations': meta_data['locations']}

    with open('processed_data/%s/%d.p' % (channel, counter), 'wb') as fh:
        pickle.dump(results, fh)
    counter += 1

    # Mark the file as visited only after its output is on disk, and persist
    # the list every time so an interrupted run resumes exactly where it stopped.
    read_files.append(file)
    seen_files.add(file)
    with open(visit_path, 'wb') as fh:
        pickle.dump(read_files, fh)





  0%|          | 0/16320 [00:00<?, ?it/s][A[A[A[A



  3%|▎         | 471/16320 [00:01<00:48, 326.01it/s][A[A[A[A

470
471
472
473
474
475
476
477
478
479






  3%|▎         | 471/16320 [00:13<00:48, 326.01it/s][A[A[A[A



  3%|▎         | 481/16320 [00:14<1:45:49,  2.49it/s][A[A[A[A

480


KeyboardInterrupt: 

In [17]:
# Debug: show the key most recently visited by the processing loop.
current_key

95

In [18]:
# Debug: show the raw record stored under that key in the last loaded file.
res[current_key]

[{'text': ". has a loyal following and what ' s been fascinate for me is how much he has a loyal following amongst young people and the money that he can raise . so he ' s a threat obviously on a certain level . but at the same time , you can ' t beat somebody with nobody . and at the moment , i don ' t see that somebody yet that the ",
  'gender': ['m'],
  'persons': None,
  'locations': [[0.38, 0.11, 0.62, 0.57]]}]