# Data Ingestion

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

def get_raw_data():
    # Get file from http://archive.ics.uci.edu/ml/datasets/News+Aggregator
    df = pd.read_csv(
        "./../data/demo_data/NewsAggregatorDataset/newsCorpora.csv", 
        sep='\t', 
        names=['id', 'headline', 'url', 'publisher', 'category', 'story', 'hostname', 'timestamp']
    )
    
    # Category: b = business, t = science and technology, e = entertainment, m = health
    return df[['category', 'headline']]

df = get_raw_data()
df.head()

train_df, test_df = train_test_split(df, test_size=0.2)

x_train = train_df['headline'].values
y_train = train_df['category'].values
x_test = test_df['headline'].values
y_test = test_df['category'].values

In [2]:
x_train = [sentence[:300] for sentence in x_train]
x_test = [sentence[:300] for sentence in x_test]

In [3]:
def build_y_encoder(y):
    label2idx = {}
    idx2label = {}
    for label in list(set(y)):
        label2idx[label] = len(label2idx)
        idx2label[label2idx[label]] = label
        
    print(label2idx)
    print(idx2label)
    
    return label2idx, idx2label
    
label2idx, idx2label = build_y_encoder(y_train)

{'e': 0, 't': 1, 'm': 2, 'b': 3}
{0: 'e', 1: 't', 2: 'm', 3: 'b'}


# Skip-Thoughts Embeddings

In [4]:
# Copyright 2018 Edward Ma. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import numpy as np
import os, datetime
from torch import LongTensor
from torch.autograd import Variable
from skipthoughts import UniSkip, BiSkip

class SkipThoughtsEmbeddings:
    DICTIONARY_URL = "http://www.cs.toronto.edu/~rkiros/models/dictionary.txt"
    UNISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/utable.npy"
    BISKIP_URL = "http://www.cs.toronto.edu/~rkiros/models/btable.npy"
    UNISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz"
    BISKIPS_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz"
    UNISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl"
    BISKIPS_PKL_URL = "http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl"
    
    def __init__(self, model_dir, algorithm='uniskip', tokenizer=None, verbose=0):
        self.verbose = verbose
        
        self.model_dir = model_dir
        self.algorithm = algorithm
        self.vocab = {}
        self.vocabs = []
        if tokenizer is None:
            self.tokenizer = self._tokenizer_space
        else:
            self.tokenizer = tokenizer
        self.max_sentence_len = -1
        
    def _tokenizer_space(self, sentence):
        return sentence.split(' ')
        
    def _download(self, src, dest_dir, dest_file, unzip):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
    
        if dest_file is None:
            dest_file = os.path.basename(src)
            
        if not os.path.exists(dest_dir + dest_file):
            print('Downloading from %s' % (src))
            file = urllib.request.urlopen(url)
            with open(dest_dir + dest_file,'wb') as output:
                output.write(file.read())
        else:
            print('Found %s in %s' % (dest_file, dest_dir))
        
    def downloads(self, dest_dir, sources=None):
        if sources is None:
            sources = [self.DICTIONARY_URL, self.UNISKIP_URL, self.BISKIP_URL, 
                       self.UNISKIPS_URL, self.BISKIPS_URL, self.UNISKIPS_PKL_URL, 
                       self.BISKIPS_PKL_URL]
        
        for src in sources:
            self._download(src=src, dest_dir=dest_dir, dest_file=None, unzip=False)
        
    def build_vocab(self, sentences, clear_vocab=True, max_sentence_len=-1):
        if clear_vocab:
            self.vocab = {}
            
        self.max_sentence_len = max_sentence_len
        
        for sentence in sentences:
            words = self.tokenizer(sentence)
            if max_sentence_len == -1:
                self.max_sentence_len = max(self.max_sentence_len, len(words))

            for word in words:
                if word not in self.vocab:
                    self.vocabs.append(word)
                    # Reserve the first one for padding
                    self.vocab[word] = len(self.vocab) + 1

    def process(self, sentences):
        word_id_sentences = []
        for sentence in sentences:
            word_ids = [self.vocab[w] for w in self.tokenizer(sentence) if w in self.vocab]
            
            if self.max_sentence_len > len(word_ids):
                for i in range(0, self.max_sentence_len-len(word_ids)):
                    word_ids.append(0)
            elif self.max_sentence_len < len(word_ids):
                word_ids = word_ids[:self.max_sentence_len]
                    
            word_id_sentences.append(word_ids)
            
        return word_id_sentences
    
    def get_algorithm(self, words, model_dir=None):
        if model_dir is None:
            model_dir = self.model_dir
            
        if self.algorithm == 'uniskip':
            return UniSkip(model_dir, words)
        else:
            return BiSkip(model_dir, words)
        
    def _convert_to_numpy_layer(self, layer):
        return layer.detach().numpy()        

    def predict(self, sentences, output_format='torch'):
        transformed_sentences = self.process(sentences)
        
        algo = self.get_algorithm(self.vocabs)
        inputs = Variable(LongTensor(transformed_sentences))
        outpus = algo(inputs, lengths=[len(words) for words in transformed_sentences])
        
        if output_format == 'np':
            return self._convert_to_numpy_layer(outpus)
        elif output_format == 'torch':
            return outpus
        
    def predict_batch(self, sentences, output_format='torch', batch_size=1000):
        batches = [sentences[i * batch_size:(i + 1) * batch_size] for i in range((len(sentences) + batch_size-1) // batch_size)]

        results = []
        for batch in batches:
            results.append(skip_thoughts_emb.predict(sentences=batch, output_format=output_format))

        if output_format == 'np':
            return np.concatenate(results, axis=0)
        elif output_format == 'torch':
            return torch.cat(results, 0)

dest_dir = './skip_thoughts/'
skip_thoughts_emb = SkipThoughtsEmbeddings(model_dir=dest_dir)
skip_thoughts_emb.downloads(dest_dir=dest_dir)
print(datetime.datetime.now(), 'build')
skip_thoughts_emb.build_vocab(x_train)

Found dictionary.txt in ./skip_thoughts/
Found utable.npy in ./skip_thoughts/
Found btable.npy in ./skip_thoughts/
Found uni_skip.npz in ./skip_thoughts/
Found bi_skip.npz in ./skip_thoughts/
Found uni_skip.npz.pkl in ./skip_thoughts/
Found bi_skip.npz.pkl in ./skip_thoughts/
2018-09-29 15:52:09.704205 build


# Modeling

In [5]:
print(datetime.datetime.now(), 'get')
x_train_t = skip_thoughts_emb.predict_batch(x_train, output_format='np')
x_test_t = skip_thoughts_emb.predict_batch(x_test, output_format='np')
print(datetime.datetime.now(), 'done')

2018-09-29 15:52:11.043806 get


  "num_layers={}".format(dropout, num_layers))








2018-09-29 17:14:50.400833 done


In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
model = LogisticRegression(solver='newton-cg')
model.fit(x_train_t, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
y_pred = model.predict(x_test_t)

In [9]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.80643672174612946