In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from tensorflow.keras.utils import to_categorical

In [2]:
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

TensorFlow Version: 2.1.0
Hub version:  0.7.0


In [3]:
def get_hdf_from_file(filename, key):
    store = pd.HDFStore(filename)
    data_pdf = store[key]
    store.close()
    return data_pdf

In [4]:
domain_lookup = get_hdf_from_file('domain_lookup.h5', 'domain_lookup')

In [5]:
data_array = []
for i in range(0,10):
    print(f"Loading dataset {i}")
    df = get_hdf_from_file(f'tokenized_{i}.h5', 'clean_data')
    df['ids'] = df['ids'].map(lambda x: np.asarray(x, dtype=np.int32))
    df['masks'] = df['masks'].map(lambda x: np.asarray(x, dtype=np.int32))
    df['segments'] = df['segments'].map(lambda x: np.asarray(x, dtype=np.int32))
    data_array.append(df)


Loading dataset 0
Loading dataset 1
Loading dataset 2
Loading dataset 3
Loading dataset 4
Loading dataset 5
Loading dataset 6
Loading dataset 7
Loading dataset 8
Loading dataset 9


In [6]:
all_data_pdf = pd.concat([data_array[i] for i in range(0, 10)])

In [7]:
del data_array

In [8]:
counts = all_data_pdf.groupby('source_index').source_domain.count().reset_index()
print(counts)

    source_index  source_domain
0              0          15673
1              1          67486
2              2         110735
3              3          43842
4              4           5438
5              5          18184
6              6          20603
7              7           9970
8              8          39968
9              9          53953
10            10          40974
11            11           6826
12            12          15391
13            13           5701
14            14           6326
15            15          22750
16            16          12175
17            17           4422
18            18          28463
19            19          68977
20            20          21510
21            21            426
22            22            942
23            23          59722
24            24          35215
25            25          64080
26            26          21924
27            27          31774
28            28          62495
29            29          29985
30      

In [9]:
source_index = all_data_pdf['source_index'].values

In [10]:
all_y_array = to_categorical(source_index)

In [11]:
NUM_CLASSES = len(domain_lookup)
print("Number of classes: ", NUM_CLASSES)

Number of classes:  47


In [12]:
all_data_pdf.url.count()

source_domain     1523208
text_or_desc      1523208
clean_text        1523208
published_date    1523208
year              1523208
title             1523208
url               1523208
weeks             1523208
tokens            1523208
masks             1523208
segments          1523208
ids               1523208
source_index      1523208
dtype: int64

In [13]:
len(all_y_array)

1523208

In [14]:
X_train, X_test,  y_train, y_test, index_train, index_test = train_test_split(all_data_pdf, all_y_array,  source_index,
      test_size=0.2, shuffle=True, stratify=source_index)

In [15]:
ids = X_train['ids'].values
masks = X_train['masks'].values
segments = X_train['segments'].values

In [16]:
inputs = [np.vstack(ids), np.vstack(masks), np.vstack(segments)]

In [17]:
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)

In [18]:
MAX_SEQ_LEN = 256

In [19]:
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                    name="segment_ids")

In [20]:
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [21]:
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(1024, activation="relu")(x)
out = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax", name="dense_output")(x)

model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=out)


model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])


In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [23]:
history = model.fit(inputs, y_train, epochs=1, batch_size=500, validation_split=0.2, shuffle=True)

Train on 974852 samples, validate on 243714 samples


In [25]:
model.save('model5a')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: model5a/assets


INFO:tensorflow:Assets written to: model5a/assets


In [26]:
history.history

{'loss': [1.4596867563409386],
 'accuracy': [0.58781946],
 'val_loss': [1.1931939908461242],
 'val_accuracy': [0.6586491]}

In [27]:
history2 = model.fit(inputs, y_train, epochs=1, batch_size=500, validation_split=0.2, shuffle=True)

Train on 974852 samples, validate on 243714 samples


In [28]:
model.save('model5b')

INFO:tensorflow:Assets written to: model5b/assets


INFO:tensorflow:Assets written to: model5b/assets


In [29]:
history2.history

{'loss': [1.2297965598921385],
 'accuracy': [0.6460981],
 'val_loss': [1.0817883255891478],
 'val_accuracy': [0.69136775]}

In [30]:
history3 = model.fit(inputs, y_train, epochs=1, batch_size=500, validation_split=0.2, shuffle=True)

Train on 974852 samples, validate on 243714 samples


In [31]:
model.save('model5c')

INFO:tensorflow:Assets written to: model5c/assets


INFO:tensorflow:Assets written to: model5c/assets


In [32]:
ids_test = X_test['ids'].values
masks_test = X_test['masks'].values
segments_test = X_test['segments'].values

In [34]:
inputs_test = [np.vstack(ids_test), np.vstack(masks_test), np.vstack(segments_test)]

In [35]:
y_preds = model.predict(inputs_test)

In [37]:
y_top_preds = np.argmax(y_preds, axis=1)

In [39]:
y_top_preds.shape

(304642,)

In [41]:
index_test.shape

(304642,)

In [42]:
y_comparison = pd.DataFrame(y_top_preds, index_test).reset_index()
y_comparison.columns = ['prediction', 'actual']

In [43]:
y_comparison['correct'] = (y_comparison.actual == y_comparison.prediction)

In [44]:
y_comparison.correct.mean()

0.6982392447528575

In [45]:
pub_averages = y_comparison.groupby('actual').correct.mean()

In [46]:
pub_averages

actual
0     0.729595
1     0.706350
2     0.915888
3     0.814475
4     0.987654
5     0.761108
6     0.485577
7     0.476543
8     0.645165
9     0.999907
10    0.954767
11    0.797546
12    0.620924
13    0.663278
14    0.589474
15    0.731622
16    0.567450
17    0.709150
18    0.536300
19    0.540493
20    0.867082
22    0.430108
23    0.855584
24    0.439782
25    0.689167
26    0.643535
27    0.694782
28    0.790786
29    0.526674
30    0.994275
31    0.506118
32    0.596006
33    0.853653
34    0.780442
35    0.683043
36    0.459562
37    0.491178
38    0.529154
39    0.641924
40    0.796569
41    0.754348
42    0.410714
43    0.539665
44    0.780181
45    0.571429
46    0.657078
Name: correct, dtype: float64

In [47]:
domain_lookup

ajc.com                    0
americanthinker.com        1
apnews.com                 2
axios.com                  3
bbc.com                    4
boston.com                 5
breitbart.com              6
cbsnews.com                7
chicago.suntimes.com       8
chicagotribune.com         9
chron.com                 10
cnbc.com                  11
dailykos.com              12
dallasnews.com            13
denverpost.com            14
economist.com             15
fivethirtyeight.com       16
forbes.com                17
foreignpolicy.com         18
foxnews.com               19
ft.com                    20
latimes.com               21
msnbc.com                 22
nbcnews.com               23
newrepublic.com           24
newsday.com               25
newsmax.com               26
npr.org                   27
nydailynews.com           28
nypost.com                29
nytimes.com               30
prospect.org              31
reason.com                32
reuters.com               33
rt.com        

In [56]:
type(index_test)

numpy.ndarray

In [59]:
X_test.to_pickle('X_test.pickle')

In [62]:
pd.DataFrame(y_test).to_pickle('y_test.pickle')

In [63]:
pd.Series(index_test).to_pickle('index_test.pickle')

In [None]:
X_train.to_parquet('X_train.parquet')

In [None]:
pd.DataFrame(y_train).to_pickle('y_tr')