In [49]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Reshape, Masking
from keras.preprocessing import sequence
from keras import optimizers
from keras.applications import imagenet_utils

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os.path
import json
from glob import glob

import spacy

In [50]:
nlp = spacy.load('en', vectors='en_glove_cc_300_1m')

In [51]:
random_state = 42

In [52]:
DATA_DIR = '../../funding_monitor/funding_monitor/output/'

In [53]:
MODEL_DIR = '../model'

# Prepare data

### Read the web scraping news

In [54]:
def read_json(file_name):
    with open(file_name, 'rb') as fp:
        json_lines = [json.loads(line) for line in fp]
        return pd.DataFrame.from_dict(json_lines)

def data_input_fn(df, num_epochs, shuffle):
    return tf.estimator.inputs.pandas_input_fn(
        x = pd.DataFrame({k: df[k].values for k in FEATURES}),
        y = pd.Series(df[LABEL].values),
        num_epochs=num_epochs,
        shuffle=shuffle)

In [55]:
df = pd.concat([read_json(file_name=file_name) for file_name in glob(os.path.join(DATA_DIR, '*.json'))])

In [56]:
df

Unnamed: 0,article_id,article_title,author,content,story_url,time
0,16158401,Meet the $50 million bedding startup that want...,Jason Del Rey,"[In some entrepreneurial circles, the amount o...",https://www.recode.net/2017/10/3/16394360/boll...,"Oct 3, 2017, 9:59am EDT"
1,16115695,Venture firm IVP has raised its biggest fund y...,Theodore Schleifer,[With valuations of private companies climbing...,https://www.recode.net/2017/9/26/16351654/ivp-...,"Sep 26, 2017, 7:00am EDT"
2,16046233,Whole Foods gives Amazon hundreds of return ce...,Jason Del Rey,"[When Amazon acquired Whole Foods last month, ...",https://www.recode.net/2017/9/13/16282192/happ...,"Sep 13, 2017, 6:00am EDT"
3,16023229,Podcast network Gimlet Media has raised anothe...,Peter Kafka,[Yet another vote of confidence in podcasting ...,https://www.recode.net/2017/9/6/16259188/wpp-g...,"Sep 6, 2017, 9:12am EDT"
4,16108309,Hyperloop One raised an $85 million round at a...,Johana Bhuiyan,"[Hyperloop One, the company built on Elon Musk...",https://www.recode.net/2017/9/21/16344268/hype...,"Sep 21, 2017, 8:48am EDT"
5,16096653,"Patreon, one of the most interesting media sta...",Peter Kafka,"[Patreon, which helps fans fund their favorite...",https://www.recode.net/2017/9/19/16332612/patr...,"Sep 19, 2017, 11:00am EDT"
6,15999337,Mark Zuckerberg says Facebook has raised $10 m...,Meghann Farnsworth,[Hurricane Harvey continues to pummel Houston ...,https://www.recode.net/2017/8/31/16235296/mark...,"Aug 31, 2017, 3:56pm EDT"
7,16098565,Comcast’s top government guy says Trump won’t ...,Tony Romm,"[President Donald Trump has previously , threa...",https://www.recode.net/2017/9/20/16334524/comc...,"Sep 20, 2017, 6:00am EDT"
8,16007493,"Juicero, the $700 juicer startup, is looking f...",Jason Del Rey,"[Juicero, the Silicon Valley startup that rais...",https://www.recode.net/2017/9/1/16243452/juice...,"Sep 1, 2017, 6:05pm EDT"
9,15987251,Many of the FCC’s record-breaking 21 million n...,Tony Romm,"[As the Trump administration prepares to , scr...",https://www.recode.net/2017/8/30/16223210/net-...,"Aug 30, 2017, 9:00am EDT"


### Add Label

#### Use regex to label the news with funding news (XXX raise $YYY) as a starting point

In [57]:
df_regex_raise_money = df['article_title'].str.extract(r'\b([A-Z\d]\w+)?.{1,5}([Rr]aised?|[rR]aising).{1,10}(\$\s*[\d.,]+).{1,10}?([MmbB]illion|[tT]housand)?', expand=True).dropna(how='all')
df_regex_raise_money

Unnamed: 0,0,1,2,3
3,Media,raised,$5,million
4,Hyperloop,raised,$85,million
5,,raised,$60,million
6,Facebook,raised,$10,million
10,Reddit,raised,$200,million
11,Pinterest,raised,$150,million
12,Media,raised,$15,million
16,,raised,$10,million
19,,raised,$2.5,million
20,,raised,$15,million


In [58]:
df.loc[df_regex_raise_money.index.tolist(), 'has_funding_news'] = 1
df.loc[list(set(df.index.tolist()) - set(df_regex_raise_money.index.tolist())), 'has_funding_news'] = 0

In [59]:
df

Unnamed: 0,article_id,article_title,author,content,story_url,time,has_funding_news
0,16158401,Meet the $50 million bedding startup that want...,Jason Del Rey,"[In some entrepreneurial circles, the amount o...",https://www.recode.net/2017/10/3/16394360/boll...,"Oct 3, 2017, 9:59am EDT",0.0
1,16115695,Venture firm IVP has raised its biggest fund y...,Theodore Schleifer,[With valuations of private companies climbing...,https://www.recode.net/2017/9/26/16351654/ivp-...,"Sep 26, 2017, 7:00am EDT",0.0
2,16046233,Whole Foods gives Amazon hundreds of return ce...,Jason Del Rey,"[When Amazon acquired Whole Foods last month, ...",https://www.recode.net/2017/9/13/16282192/happ...,"Sep 13, 2017, 6:00am EDT",1.0
3,16023229,Podcast network Gimlet Media has raised anothe...,Peter Kafka,[Yet another vote of confidence in podcasting ...,https://www.recode.net/2017/9/6/16259188/wpp-g...,"Sep 6, 2017, 9:12am EDT",1.0
4,16108309,Hyperloop One raised an $85 million round at a...,Johana Bhuiyan,"[Hyperloop One, the company built on Elon Musk...",https://www.recode.net/2017/9/21/16344268/hype...,"Sep 21, 2017, 8:48am EDT",1.0
5,16096653,"Patreon, one of the most interesting media sta...",Peter Kafka,"[Patreon, which helps fans fund their favorite...",https://www.recode.net/2017/9/19/16332612/patr...,"Sep 19, 2017, 11:00am EDT",1.0
6,15999337,Mark Zuckerberg says Facebook has raised $10 m...,Meghann Farnsworth,[Hurricane Harvey continues to pummel Houston ...,https://www.recode.net/2017/8/31/16235296/mark...,"Aug 31, 2017, 3:56pm EDT",1.0
7,16098565,Comcast’s top government guy says Trump won’t ...,Tony Romm,"[President Donald Trump has previously , threa...",https://www.recode.net/2017/9/20/16334524/comc...,"Sep 20, 2017, 6:00am EDT",0.0
8,16007493,"Juicero, the $700 juicer startup, is looking f...",Jason Del Rey,"[Juicero, the Silicon Valley startup that rais...",https://www.recode.net/2017/9/1/16243452/juice...,"Sep 1, 2017, 6:05pm EDT",1.0
9,15987251,Many of the FCC’s record-breaking 21 million n...,Tony Romm,"[As the Trump administration prepares to , scr...",https://www.recode.net/2017/8/30/16223210/net-...,"Aug 30, 2017, 9:00am EDT",0.0


#### Turn article_title to word embedding

In [60]:
def clean_word_vector(word_vector, upper_bound=1.0e+3, lower_bound=-1.0e+3):
#     remove nan
#     print(word_vector)
    word_vector[np.isnan(word_vector)] = 0.0
#     word_vector[word_vector > upper_bound] = 0.0
#     word_vector[word_vector < lower_bound] = 0.0
    return np.clip(word_vector, a_min=lower_bound, a_max=upper_bound)
#     remove extreme values

    

In [61]:
article_title_embedding = df['article_title'].apply(lambda x: [clean_word_vector(word.vector) for word in nlp(x)]).tolist()

In [62]:
max_sent_len = max([len(word_em_list) for word_em_list in article_title_embedding])

mean_sent_len = np.mean([len(word_em_list) for word_em_list in article_title_embedding])

steps_len = int(max_sent_len)

In [47]:
article_title_embedding.shape

NameError: name 'article_title_embedding' is not defined

In [15]:
max_sent_len, mean_sent_len, steps_len

(33, 12.584678933533608, 33)

In [16]:
# padding zero to the front of the word embedding
article_title_embeddin_padded = sequence.pad_sequences(article_title_embedding, 
                                                                          maxlen=steps_len,
#                                                       maxlen=int(mean_sent_len),
#                                                                          padding='post',
                                                                         dtype='float32',
#                                                                          truncating='post'
                                                     )

In [17]:
article_title_embeddin_padded[0]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.32789001, -0.20737   ,  0.25799   , ..., -0.39886999,
        -0.20615   , -0.38857999],
       [-0.10648   , -0.016295  , -0.22755   , ..., -0.31343001,
         0.087424  , -0.1661    ],
       [-0.24898   ,  0.087833  , -0.39399999, ..., -0.46252999,
         0.15523   ,  0.33537999]], dtype=float32)

In [18]:
features = np.array(article_title_embeddin_padded).astype("float32")

# features = np.reshape(features, [-1, 300, int(mean_sent_len)])

features[np.isnan(features)] = 0.0

np.sort(features)

df_feature_arr_max = pd.DataFrame([np.sort(np.amax(features, axis=(1, 2))), np.argsort(np.amax(features, axis=(1, 2)))]).T

df_feature_arr_min = pd.DataFrame([np.sort(np.amin(features, axis=(1, 2))), np.argsort(np.amin(features, axis=(1, 2)))]).T

df_very_big_word_em = df_feature_arr_max[df_feature_arr_max[df_feature_arr_max.columns[0]] > 1.0e+10].sort_values(df_feature_arr_max.columns[0], ascending=False)

df_very_small_word_em = df_feature_arr_min[df_feature_arr_min[df_feature_arr_min.columns[0]] < -1.0e+10] 

df_very_small_word_em

exclude_indices = df_very_small_word_em[df_very_small_word_em.columns[1]].astype(int).tolist() + df_very_big_word_em[df_very_big_word_em.columns[1]].astype(int).tolist()
exclude_indices

[]

In [19]:
np.amax(features)

4.1901002

In [20]:
np.amin(features)

-4.1444998

In [21]:
len(article_title_embeddin_padded[0])

33

#### Features and labels are ready to go!

In [22]:
features.shape

(5326, 33, 300)

In [23]:
labels = df['has_funding_news'].astype("float32")

In [24]:
labels.shape

(5326,)

#### train_test_split

In [25]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=random_state)

In [30]:
features_train[0]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [-0.60711998,  0.42544001,  0.5104    , ..., -0.18588001,
         0.025965  , -0.48231   ],
       [-0.62684   ,  0.72842997,  0.47222   , ..., -0.16216999,
         0.16379   ,  0.025549  ],
       [-0.15081   ,  0.34948   , -0.088026  , ..., -0.52019   ,
        -0.31755999,  0.056673  ]], dtype=float32)

# Build the Model

In [31]:
# create the model
embedding_vecor_length = 300
# steps_len = int(mean_sent_len)
learning_rate = 1.0e-3
dropout_rate = .2
batch_size = 256
epochs = 10

model = Sequential()
model.add(LSTM(100, input_shape=(steps_len, embedding_vecor_length), return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))
model.add(LSTM(100, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))
model.add(LSTM(100, return_sequences=False, dropout=dropout_rate, recurrent_dropout=dropout_rate))
model.add(Dense(1, activation='sigmoid'))

# optmzr = optimizers.Adagrad(lr=learning_rate, epsilon = 1.0e-15, clipvalue=1.0e-1)
# optmzr = optimizers.Adam(lr=learning_rate, clipnorm=1.0e+1)
optmzr = optimizers.Adam(lr=learning_rate)
# optmzr = optimizers.rmsprop(lr=learning_rate, clipnorm=1.0e-1)
model.compile(loss='binary_crossentropy', optimizer=optmzr, metrics=['accuracy'])
print(model.summary())
model.fit(features_train, labels_train, validation_data=(features_test, labels_test), epochs=epochs, batch_size=batch_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 33, 100)           160400    
_________________________________________________________________
lstm_5 (LSTM)                (None, 33, 100)           80400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 321,301
Trainable params: 321,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 4260 samples, validate on 1066 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15cb80b00>

In [32]:
scores = model.evaluate(features_test, labels_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 77.49%


# Save the Model

In [33]:
model.save(os.path.join(MODEL_DIR, 'fundingNewsClassifier.h5'))

In [None]:
import dill
from collections import defaultdict

with open('../data/headline_corpus/headlines_by_link_june2015_august2016.dill', 'rb') as f:
    headlines = dill.load(f)

with open('../data/headline_corpus/timestamps_by_headline_june2015_august2016.dill', 'rb') as f:
    timestamps = dill.load(f)

all_timestamps = [{'source': source, 'link': link, 'timestamp': timestamp} for source, val in timestamps.items() for link, _timestamps in val.items() for timestamp in _timestamps]

all_headlines = [{'source': source, 'link': link, 'headline': headline} for source, val in headlines.items() for link, _headlines in val.items() for headline in _headlines]

df_all_headlines = pd.DataFrame.from_dict(all_headlines)

df_all_headlines