In [1]:
import os
import sys
import gzip
import numpy as np
from six.moves.urllib.request import urlretrieve
import tensorflow as tf
import keras

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

np.random.seed(1337)

Using TensorFlow backend.


# Data Prep
### download data

In [2]:
# Download location of each gzipped headline corpus, together with the exact
# byte size the downloaded file is expected to have.
data_sources = dict(
    bait=dict(
        url='https://github.com/bhargaviparanjape/clickbait/blob/master/dataset/clickbait_data.gz?raw=true',
        size=345283,
    ),
    legit=dict(
        url='https://github.com/bhargaviparanjape/clickbait/blob/master/dataset/non_clickbait_data.gz?raw=true',
        size=383115,
    ),
)

In [3]:
# Percentage most recently written by download_progress_hook (None before the first report).
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """Report download progress on stdout; intended as a ``urlretrieve`` reporthook.

    Mostly useful on slow connections. The percentage is written each time it
    reaches a new multiple of 5, a single "." is written for every other
    change in percentage, and nothing is written while the percentage has not
    moved since the previous call.

    Args:
        count: Number of blocks transferred so far.
        blockSize: Size of one block in bytes.
        totalSize: Total size of the download in bytes. A value <= 0 means the
            size is unknown, so no percentage can be computed and nothing is
            reported.
    """
    global last_percent_reported

    if totalSize <= 0:
        return

    # The last block is usually only partly filled, so count * blockSize can
    # overshoot totalSize; cap the figure at 100.
    percent = min(count * blockSize * 100 // totalSize, 100)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()
        last_percent_reported = percent
        
def maybe_download(url, filename, expected_bytes, force=False):
    """Fetch ``url`` into ``filename`` unless it already exists, then verify its size.

    Args:
        url: Location to download from.
        filename: Local path to store the file at (and to check for an existing copy).
        expected_bytes: Exact size in bytes the local file must have.
        force: Download even when ``filename`` already exists.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the local file's size differs from ``expected_bytes``.
    """
    needs_fetch = force or not os.path.exists(filename)
    if needs_fetch:
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

# Fetch both corpora; the download itself is skipped for files already on disk.
bait_filename = maybe_download(
    data_sources['bait']['url'], 'bait.gz', data_sources['bait']['size'])
legit_filename = maybe_download(
    data_sources['legit']['url'], 'legit.gz', data_sources['legit']['size'])

Found and verified bait.gz
Found and verified legit.gz


### extract data to arrays of strings, one per line

In [4]:
def get_data(filename, force=False):
    """Read a gzip-compressed text file and return its non-empty lines.

    Each line is decoded as UTF-8 and stripped of surrounding whitespace;
    lines that are empty after stripping are dropped.

    Args:
        filename: Path of the gzip archive to read.
        force: Accepted for backward compatibility and ignored. The data is
            read straight from the archive on every call, so there is no
            extracted copy whose presence could be used to skip the work.

    Returns:
        list of str: The non-empty, stripped lines in file order.
    """
    # Archive name without its extension(s); only used for the progress message.
    root = os.path.splitext(os.path.splitext(filename)[0])[0]
    print('Extracting data for %s. This may take a while. Please wait.' % root)

    with gzip.open(filename, 'rb') as f:
        stripped = (raw_line.decode('utf8').strip() for raw_line in f)
        lines = [line for line in stripped if line]

    print(filename + ': ' + str(len(lines)))
    return lines
        
# Read both archives, legit first and clickbait second.
legit_raw_data, bait_raw_data = map(get_data, (legit_filename, bait_filename))

Extracting data for legit. This may take a while. Please wait.
legit.gz: 16001
Extracting data for bait. This may take a while. Please wait.
bait.gz: 15999


### concat clickbait and legit data

In [5]:
# Legit headlines come first, clickbait second; labels follow the same order
# with 1.0 for legit and 0.0 for clickbait.
data = np.array(legit_raw_data + bait_raw_data)
labels = np.repeat([1.0, 0.0], [len(legit_raw_data), len(bait_raw_data)])

print('labels: ', len(labels))
print('data: ', len(data))

labels:  32000
data:  32000


### tokenize data
This turns our arrays of strings (e.g. "ten ways to lose weight fast") into arrays of integers (e.g. [134, 26, 13, 219, 1065, 740]), where each number represents a particular word. The numbers also correspond to each word's frequency rank in the data set, so very common words such as "the" get numbers close to 1 and rare words get larger numbers.

In [6]:
# Learn the vocabulary from every headline, then encode each headline as a
# list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
encoded_headlines = tokenizer.texts_to_sequences(data)

# Headlines differ in length, so the encoded result is ragged. Recent NumPy
# versions refuse to build an array from ragged nested lists implicitly, so
# fill a 1-D object array one list at a time (this also keeps the array 1-D
# should every headline happen to have the same length).
data = np.empty(len(encoded_headlines), dtype=object)
for row, token_ids in enumerate(encoded_headlines):
    data[row] = token_ids

In [7]:
# Invert the tokenizer's word -> index mapping so that indices can be turned
# back into words.
word_lookup = {index: word for word, index in tokenizer.word_index.items()}

def seq_to_words(arr):
    """Decode a sequence of word indices back into a list of words.

    Index 0 is skipped: it is the filler value that padding writes into
    sequences and stands for no word, so padded sequences decode to their
    real words only instead of raising ``KeyError``.

    Args:
        arr: Iterable of integer word indices.

    Returns:
        list of str: The words for every non-zero index, in order.

    Raises:
        KeyError: If a non-zero index is missing from ``word_lookup``.
    """
    return [word_lookup[index] for index in arr if index != 0]

# Number of entries in word_lookup, i.e. how many distinct word indices the tokenizer assigned.
vocabulary_size = len(word_lookup)

### shuffle data

In [8]:
def unison_shuffled_copies(a, b):
    """Return shuffled copies of ``a`` and ``b`` that share one random ordering.

    Both inputs must have the same length and support NumPy-style indexing
    with an index array. The ordering is drawn from NumPy's global random
    state, so element ``i`` of both results comes from the same source position.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    size = len(a)
    assert size == len(b)
    order = np.random.permutation(size)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

In [9]:
# Shuffle samples and labels with one shared permutation so pairs stay aligned.
shuffled_data, shuffled_labels = unison_shuffled_copies(a=data, b=labels)

print('shuffled_data: ', len(shuffled_data))
print('shuffled_labels: ', len(shuffled_labels))

def show_data(index):
    """Print one shuffled sample: its encoded form, its decoded words and its label.

    Args:
        index: Position of the sample in ``shuffled_data`` / ``shuffled_labels``.
    """
    encoded = shuffled_data[index]
    print('data: ', encoded)
    print('decoded data: ', seq_to_words(encoded))

    label = shuffled_labels[index]
    if label == 1.0:
        label_name = ' legit'
    else:
        label_name = ' clickbait'
    print('label: ', label, label_name)

# Spot-check ten randomly chosen samples (an index may repeat).
for i in range(10):
    rand_num = np.random.randint(len(shuffled_data))
    # Blank line, then the index on its own line.
    print('', rand_num, sep='\n')
    show_data(rand_num)

shuffled_data:  32000
shuffled_labels:  32000

21081
data:  [3, 547, 4, 1283, 533, 801, 1, 278, 607]
decoded data:  ['the', 'names', 'of', 'popular', 'celebrities', 'according', 'to', 'my', 'mom']
label:  0.0  clickbait

11227
data:  [27, 8, 413, 1, 115, 19, 1304, 907, 1959]
decoded data:  ['us', 'and', 'russia', 'to', 'sign', 'new', 'arms', 'control', 'treaty']
label:  1.0  legit

23364
data:  [26, 2175, 597, 128, 11, 10, 1855, 44, 9, 10, 97]
decoded data:  ['which', 'breaking', 'bad', 'character', 'is', 'your', 'soulmate', 'based', 'on', 'your', 'zodiac']
label:  0.0  clickbait

13796
data:  [23, 12, 5070, 98, 1898, 11, 95, 247, 30, 3563, 5275, 9, 51, 2877]
decoded data:  ['people', 'are', 'proving', 'no', 'dick', 'is', 'too', 'big', 'by', 'dropping', 'condoms', 'on', 'their', 'heads']
label:  0.0  clickbait

13915
data:  [6, 1690, 1, 82, 1751]
decoded data:  ['a', 'chance', 'to', 'get', 'outside']
label:  1.0  legit

5093
data:  [3178, 13603, 2, 2532, 3203]
decoded data:  ['republic

### pad data to the same length

In [10]:
# The longest encoded headline determines the common length of all samples.
max_length_input = max(shuffled_data, key=len)
print(max_length_input, len(max_length_input), sep='\n')

# Fill shorter samples at the end until every sample has that length.
shuffled_data = sequence.pad_sequences(
    shuffled_data,
    maxlen=len(max_length_input),
    padding="post",
    truncating="post",
)
print(shuffled_data[0])


[14, 134, 4, 6, 217, 2516, 13050, 1, 6, 232, 23227, 696, 209, 11, 278, 105, 192, 8, 75, 173, 49, 1, 34, 2657, 105, 192]
26
[ 3657  2512 12896     8   142    21     3   371   193    57     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


### divide data into train and test portions

In [11]:
# Keep the last tenth (rounded down) of the shuffled samples as the holdout
# set; everything before it is used for training.
sample_count = len(shuffled_data)
train_length = sample_count - sample_count // 10

train_data, test_data = shuffled_data[:train_length], shuffled_data[train_length:]
train_labels, test_labels = shuffled_labels[:train_length], shuffled_labels[train_length:]

print('train_data: ', len(train_data))
print('train_labels: ', len(train_labels))
print('test_data: ', len(test_data))
print('test_labels: ', len(test_labels))

train_data:  28800
train_labels:  28800
test_data:  3200
test_labels:  3200


# create and fit model

In [15]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# hyperparameters
batch_size = 50
embedding_size = 128
lstm_size = 128
dropout_percent = 0.2

# model: word embedding -> single LSTM layer -> one sigmoid unit
# (labels are 1.0 for legit and 0.0 for clickbait, so the output scores "legit").
# NOTE(review): word indices appear to run from 0 (padding) to vocabulary_size,
# which would need only vocabulary_size + 1 embedding rows - confirm before changing.
model = Sequential([
    Embedding(vocabulary_size + 2, embedding_size),
    LSTM(lstm_size, dropout=dropout_percent, recurrent_dropout=dropout_percent),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# A fifth of the training portion is set aside for validation during fitting.
model.fit(
    train_data,
    train_labels,
    batch_size=batch_size,
    epochs=2,
    validation_split=0.2,
)

# Final check on the holdout set, which was not passed to fit().
score, acc = model.evaluate(test_data, test_labels, batch_size=batch_size)
print('\nHoldout Set Accuracy: ', acc * 100, '%')

Train on 23040 samples, validate on 5760 samples
Epoch 1/2
Epoch 2/2

Holdout Set Accuracy:  97.9687503539 %


# demo predictions

In [13]:
def get_predictions(input_strings):
    """Encode and pad raw headline strings like the training data, then score them.

    Args:
        input_strings: Sequence of headline strings.

    Returns:
        Whatever ``model.predict`` returns for the padded inputs (run with a
        batch size of 1).
    """
    target_length = len(max_length_input)
    token_ids = tokenizer.texts_to_sequences(input_strings)
    model_input = sequence.pad_sequences(
        token_ids, maxlen=target_length, padding="post", truncating="post")
    return model.predict(model_input, batch_size=1)

def is_it_bait(input_strings):
    """Print a score and a plain-language verdict for each headline.

    Scores come from ``get_predictions``. A score of at least 0.5 is reported
    as legit, anything lower as clickbait; scores above 0.8 or below 0.2 are
    labelled 'very' likely, everything in between 'somewhat' likely.

    Args:
        input_strings: Sequence of headline strings to classify.
    """
    predictions = get_predictions(input_strings)
    for headline, prediction in zip(input_strings, predictions):
        # Each prediction holds one value; take it out once so every
        # comparison below works on the same scalar.
        score = prediction[0]
        if score >= .5:
            confidence = 'very' if score > .8 else 'somewhat'
            verdict = 'likely to be legit'
        else:
            confidence = 'very' if score < .2 else 'somewhat'
            verdict = 'likely to be clickbait'

        print()
        print(headline)
        print('score: ', score)
        print(confidence, verdict)

In [14]:
# Two hand-written probes, followed by actual headlines pulled from
# realclearpolitics.com (author and outlet noted next to each).
input_strings = [
    '10 ways to lose weight fast',
    'nutrition and exercice key to weight, not fad diets',
    "The New Party of 'No'",  # Megan McArdle, Bloomberg
    "The Real Reasons Trump Can't Work with Democrats",  # Matthew Yglesias, Vox
    "Susan Rice's White House Unmasking: An Explosive Scandal",  # Andrew McCarthy, NRO
    "Trump's Russia Distractions Are Getting Desperate",  # Fred Kaplan, Slate
    "Democrats' Crusade to End the Filibuster Finally Pays Off",  # David Harsanyi, Federalist
    "CNN Had a Problem. Donald Trump Solved It.",  # Jonathan Mahler, NY Times Magazine
    "Nazis on the Roof of the World",  # Matthias Schulz, Der Spiegel
]

is_it_bait(input_strings)


10 ways to lose weight fast
score:  0.112391
very likely to be clickbait

nutrition and exercice key to weight, not fad diets
score:  0.999138
very likely to be legit

The New Party of 'No'
score:  0.999143
very likely to be legit

The Real Reasons Trump Can't Work with Democrats
score:  0.00676517
very likely to be clickbait

Susan Rice's White House Unmasking: An Explosive Scandal
score:  0.999245
very likely to be legit

Trump's Russia Distractions Are Getting Desperate
score:  0.945979
very likely to be legit

Democrats' Crusade to End the Filibuster Finally Pays Off
score:  0.245497
somewhat likely to be clickbait

CNN Had a Problem. Donald Trump Solved It.
score:  0.011701
very likely to be clickbait

Nazis on the Roof of the World
score:  0.994046
very likely to be legit
