In [204]:
import sklearn
assert sklearn.__version__ >= "0.20"
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras import layers
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [6]:
# Batch Size -- we set to 1 to just read it in
BATCH_SIZE = 1
DATASET_SIZE = 10000
train_size = 8000
test_size =2000

In [166]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'text/',
    batch_size=32,
    subset='training',
    validation_split=0.2, 
    seed=49
)

Found 10000 files belonging to 2 classes.
Using 8000 files for training.


In [167]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
   'text/',
    batch_size=32,
    validation_split=0.2, 
    subset='validation', 
    seed=49)

Found 10000 files belonging to 2 classes.
Using 2000 files for validation.


In [88]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to clickbait
Label 1 corresponds to normal


In [89]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return lowercase

In [191]:
max_features = 10000
sequence_length = 20

vectorize_layer = TextVectorization(
#     standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [192]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [193]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [194]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

In [195]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [177]:
embedding_dim = 10

In [206]:
keras.backend.clear_session()
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  LSTM(32, return_sequences=True),
  layers.GlobalAveragePooling1D(),
  layers.Dense(units=32, activation='relu'),
  layers.BatchNormalization(),
  layers.Dense(1,activation='sigmoid')])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          100010    
_________________________________________________________________
lstm (LSTM)                  (None, None, 32)          5504      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 106,731
Trainable params: 106,667
Non-trainable params: 64
_________________________________________________

In [207]:
#[tf.metrics.BinaryAccuracy(threshold=0.5]
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

In [208]:
epochs = 100
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

KeyboardInterrupt: 

In [202]:
examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]

model.predict(examples)

UnimplementedError:  Cast string to float is not supported
	 [[node sequential/Cast (defined at <ipython-input-202-9b4beb687ef0>:7) ]] [Op:__inference_predict_function_614409]

Function call stack:
predict_function


In [82]:
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=100000)
encoder.adapt(raw_train_ds.map(lambda text, label: text))

In [83]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [84]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [85]:
history = model.fit(raw_train_ds, epochs=10,
                    validation_data=raw_val_ds, 
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [33]:
for text_batch, label_batch in full_ds.take(1):
    for i in range(3):
        print("Review", text_batch.numpy()[i])
        print("Label", label_batch.numpy()[i])

Label 0


IndexError: index 1 is out of bounds for axis 0 with size 1

In [14]:
# Convert csv to dataset
def csv_to_dataset(csv,batch_size=1,shuffle=False):
    ds = tf.data.experimental.make_csv_dataset(
        csv,
        batch_size=batch_size, # Artificially small to make examples easier to show.
        label_name='isClickbait',
        num_epochs=1,
        shuffle=False,
        #column_defaults=['float64','float64','float64','int64','float64','float64','float64','float64','int64','float64','float64','float64','float64','float64'],
        ignore_errors=True,)
    return ds

In [8]:
# get full dataset
full_ds = csv_to_dataset('tweets.csv',batch_size=BATCH_SIZE)
# #shuffle
tf.random.set_seed(49)
full_ds = full_ds.shuffle(buffer_size=10000,reshuffle_each_iteration=False)
# get train val ds
train_val_ds = full_ds.take(train_size)
# get test ds
test_ds = full_ds.skip(train_size)

# # get train / valid ds
train_ds = train_val_ds.take(int(train_size*.75))
valid_ds = train_val_ds.skip(int(train_size*.75))

In [9]:
# Batch into 32
train_val_ds=train_ds.batch(32)
train_ds=train_ds.batch(32)
valid_ds=valid_ds.batch(32)
test_ds=test_ds.batch(32)

In [10]:
# column labels
labels= ['postMedia', 'postText', 'id', 'targetCaptions', 'targetParagraphs',
       'targetTitle', 'postTimestamp', 'targetKeywords', 'targetDescription',
       'isClickbait']
categorical = ['CHAS','RAD']
numeric = ['CRIM','ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT',]
features = ['postText', 'id', 'targetCaptions', 'targetParagraphs',
       'targetTitle', 'postTimestamp', 'targetKeywords', 'targetDescription',]