#Imports:
Run this cell first: every later cell in the notebook depends on these imports and on the Google Drive mount.

In [None]:
# Import relevant libraries
# Article scraping
!pip install newspaper3k
import newspaper

# Deep learning
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

# Various useful libraries
import numpy as np
import os
import time

# Google drive
from google.colab import drive
drive.mount('/content/drive')

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 5.3 MB/s 
[?25hCollecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 27.8 MB/s 
Collecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
Collecting cssselect>=0.9.2
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
Collecting feedparser>=5.2.1
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.8 MB/s 
[?25hCollecting tldextract>=2.0.1
  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 2.5 MB/s 
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Building wheels for co

#Creating the Dataset:
This section **only needs to be run once**.

It scrapes news articles and writes two text files: one containing the article bodies and one containing the article titles.

In [None]:
# This does not need to be run anymore, as we already have a text document with data in it!
# Scraping is slow, so the articles are downloaded once here and the relevant
# fields are written to text files by the cells below.
# news_pool is newspaper's shared multi-threaded downloader.
from newspaper import news_pool

# Accumulators filled by the parsing cell below (one entry per article)
articles_text = []
articles_authors = []
articles_titles = []

# List of sources: can be extended or shortened as needed
# memoize_articles=False: per the newspaper3k docs this disables the cache of
# already-seen article URLs, so every build returns the full article list.
# Idea to test later: keep_article_html=True would keep each article's HTML,
# which could be used to train on / generate HTML instead of plain text.
cbs_paper = newspaper.build('http://cbs.com', memoize_articles = False)
slate_paper = newspaper.build('http://slate.com', memoize_articles = False)
espn_paper = newspaper.build('http://espn.com', memoize_articles = False)

papers = [cbs_paper, slate_paper, espn_paper]
# Download the articles of all sources, using 2 threads per source
news_pool.set(papers, threads_per_source = 2)
news_pool.join() # Note: this line takes roughly 3 and a half minutes to run

In [None]:
# This does not need to be run anymore, as we already have a text document with data in it!
# Now that we have the articles, collect the information we'll need from each
# one: title, body text and authors.
# Note: this loop takes roughly a minute and a half to run.
# Re-running this cell appends to the same lists again, so re-run the cell
# that creates them first if you want a clean dataset.
skipped_articles = 0
for paper in papers:
  for article in paper.articles:
    try:
      article.parse()
    except newspaper.ArticleException:
      # parse() raises when the article's download failed; skip that article
      # instead of aborting the whole scrape.
      skipped_articles += 1
      continue
    articles_text.append(article.text)
    articles_titles.append(article.title)
    articles_authors.append(article.authors)

if skipped_articles:
  print(f'Skipped {skipped_articles} articles that could not be parsed')

In [None]:
# This does not need to be run anymore, as we already have a text document with data in it!
# Each file receives the printed Python list (brackets, quotes and escapes
# included). The `with` blocks guarantee the data is flushed and the handles
# are closed; utf-8 matches the decoding used when the dataset is read back.
# The files land in the current working directory; the loading cell reads
# from Google Drive, so copy them there (or adjust path_to_file).
with open('articletitles.txt', 'w', encoding='utf-8') as titles_file:
  print(articles_titles, file=titles_file)
with open('articletext.txt', 'w', encoding='utf-8') as text_file:
  print(articles_text, file=text_file)

In [None]:
# Import the dataset created above
path_to_file = '/content/drive/MyDrive/AINewsData/articletext.txt' #change to fit where you put your file
# Read the raw bytes and decode them explicitly as UTF-8; the `with` block
# closes the file handle as soon as the content is in memory.
with open(path_to_file, 'rb') as dataset_file:
  text = dataset_file.read().decode(encoding='utf-8')

# Print out the length of the data, along with the unique characters
print(f'Length of text: {len(text)} characters')

# Sorted list of the distinct characters; this is the model's base vocabulary
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

Length of text: 4777890 characters
240 unique characters


#Build the Model:
Source: https://www.tensorflow.org/text/tutorials/text_generation

#Vectorization:

In [None]:
# Here we want to vectorize the data, in other words we'll be assigning a number value for each letter
# Two toy strings of different lengths to demonstrate the steps
example_texts = ['abcdefg', 'xyz']

# Step 1: split every string into its individual characters. Because the
# strings differ in length the result is a RaggedTensor (see the cell output).
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [None]:
# Step 2: character -> integer ID lookup built from the dataset vocabulary.
# NOTE(review): the layer's own vocabulary appears to hold one extra entry
# (the model later reports 241 outputs for 240 unique characters, and OneStep
# masks an '[UNK]' token) - confirm against the StringLookup docs.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
ids = ids_from_chars(chars)

# Inverse lookup (ID -> character). It reuses the forward layer's vocabulary
# so both directions agree on every ID.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
# Joins the example characters back into strings. This is not the last
# expression of the cell, so its value is computed but never displayed.
tf.strings.reduce_join(chars, axis=-1).numpy()

def text_from_ids(ids):
  """Turn token IDs back into text.

  Looks every ID up with `chars_from_ids` and joins the resulting characters
  along the last axis, returning a string tensor.
  """
  looked_up_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(looked_up_chars, axis=-1)

#Training Examples:

In [None]:
# We want to create examples of text that the RNN can train on
# Convert the whole corpus into one long vector of character IDs
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Stream of single character IDs, one dataset element per character
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Sanity check: print the first 10 characters, one per line. The output starts
# with `[` and quotes because the dataset file holds a printed Python list.
# Note that this loop rebinds the name `ids` used in the previous cell.
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))


[
'
'
,
 
'
S
t
e
p


In [None]:
# Using longer sequences
# Number of characters in each training input
seq_length = 100
# How many full (seq_length+1)-character chunks the corpus contains.
# Informational only: it is not referenced again in the cells shown here.
examples_per_epoch = len(text)//(seq_length+1)
# Group the character stream into chunks of seq_length+1 IDs: one extra
# character so each chunk can later be split into a 100-character input and a
# 100-character target. The final, shorter chunk is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first 5 chunks as text
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b"['', 'Stephen Colbert brings his signature satire and comedy to The Late Show with Stephen Colbert, t"
b'he #1 show in late night, where he talks with an eclectic mix of guests about what is new and relevan'
b't in the worlds of politics, entertainment, business, music, technology, and more. Featuring bandlead'
b'er Jon Batiste with his band Stay Human, the Emmy Award-nominated show is broadcast from the historic'
b' Ed Sullivan Theater. Stephen Colbert, Chris Licht, Tom Purcell, and Jon Stewart are executive produc'


In [None]:
'''
For training you'll need a dataset of (input, label) pairs where input and label are sequences. 
At each time step the input is the current character and the label is the next character.

Here's a function that takes a sequence as input, duplicates, and shifts it to align the input and label for each timestep:
'''
def split_input_target(sequence):
    """Build an (input, target) pair from a single sequence.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is always the element
    that follows input[i].
    """
    return sequence[:-1], sequence[1:]

# Apply the split to every chunk, giving a dataset of (input, target) pairs
dataset = sequences.map(split_input_target)


In [None]:
# Show the first (input, target) pair as text: the target is the input
# shifted one character to the left (compare the two printed lines).
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


Input : b"['', 'Stephen Colbert brings his signature satire and comedy to The Late Show with Stephen Colbert, "
Target: b"'', 'Stephen Colbert brings his signature satire and comedy to The Late Show with Stephen Colbert, t"


#Create Training Batches:

In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, group them into batches of BATCH_SIZE
# (dropping the last incomplete batch, so every batch has shape (64, 100) as
# shown in the output) and prefetch batches with an auto-tuned buffer size.
# This rebinds `dataset`: re-running this cell without re-running the cell
# that creates `dataset` would batch the already-batched data again.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

#Building the RNN Model:

In [None]:
# As we have a large dataset and we want to maintain some level of memory, we'll use a GRU RNN here
# Additionally, as the internal state will need to be maintained, we will be defining the RNN manually rather than just using a sequential model

# Length of the vocabulary in chars
# NOTE(review): the model below is built with
# len(ids_from_chars.get_vocabulary()) instead of this value, so this variable
# is not what determines the model's vocabulary size.
vocab_size = len(vocab)
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024

In [None]:
# Define the model class using TF and Keras
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  For every input position the model outputs one logit per vocabulary entry,
  scoring which character comes next.

  Args:
    vocab_size: Number of distinct token IDs; used for both the embedding
      input size and the number of output logits.
    embedding_dim: Size of each character embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The base initializer takes no positional arguments. The original call
    # passed `self` a second time, a spurious argument that OneStep (defined
    # later in this notebook) does not pass either.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every time step (needed to predict
    # a next character at each position); return_state: also hand back the
    # final hidden state so generation can continue from it.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token-ID sequences.

    Args:
      inputs: Batch of token-ID sequences.
      states: Optional GRU state to start from. When None, the GRU's own
        initial state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [None]:
# Instantiate the model with the hyperparameters chosen above
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Run one batch through the untrained model to check the output shape.
# The batch variables stay bound after the loop and are reused by the loss
# check in the training section.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# Calling the model above is what lets summary() report parameter counts
model.summary()

(64, 100, 241) # (batch_size, sequence_length, vocab_size)
Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  61696     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  247025    
                                                                 
Total params: 4,247,025
Trainable params: 4,247,025
Non-trainable params: 0
_________________________________________________________________


#Train the Model:

In [None]:
# Now that we have the model, we need to actually train it so it can make any actual predictions
# Loss Function
# from_logits=True because the model's Dense layer has no softmax activation
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Sanity check on the example batch from the shape-check cell: an untrained
# model should be about as unsure as a uniform guess, i.e. exp(mean_loss)
# should be close to the vocabulary size (recorded output: 5.486 ~ ln(241)).
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (64, 100, 241)  # (batch_size, sequence_length, vocab_size)
Mean loss:         5.485957


In [None]:
# Compile with the Adam optimizer (default settings) and the loss defined above.
# No extra metrics are configured, so training reports the loss only.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Next, we'll be setting up checkpoints so that we can return to a previous state of the model if something goes wrong
# Directory where the checkpoints will be saved
# (the same Google Drive folder the dataset is read from)
checkpoint_dir = '/content/drive/MyDrive/AINewsData'
# Name of the checkpoint files
# `{epoch}` is a placeholder that Keras fills in with the epoch number
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights, not the full model, so restoring a checkpoint needs
# a MyModel instance built with the same hyperparameters.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [None]:
# Run the training for 20 epochs; checkpoint_callback saves the weights along
# the way and `history` keeps the values reported during training.
history = model.fit(dataset, epochs=20, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#Create Predicted Text:

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to generate text one character at a time.

  Args:
    model: Model called as `model(inputs=..., states=..., return_state=True)`
      and returning `(logits, states)`.
    chars_from_ids: Lookup layer mapping token IDs to characters.
    ids_from_chars: Lookup layer mapping characters to token IDs.
    temperature: The logits are divided by this value before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    # `[:, None]` adds a trailing axis so the IDs can be used as sparse indices.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Dense vector with one entry per vocabulary item; it is added to the
    # logits in generate_one_step.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: Batch of strings to continue.
      states: Model state returned by a previous call, or None to start fresh.

    Returns:
      `(predicted_chars, states)`: one sampled character per input string and
      the model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    # (sampling, not argmax, so repeated runs produce different text)
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
# Wrap the trained model for generation, using the default temperature of 1.0
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
# Generate five texts in one batch, each seeded with the same prompt, and time
# how long it takes.
start = time.time()
states = None
next_char = tf.constant(['Body:', 'Body:', 'Body:', 'Body:', 'Body:'])
result = [next_char]

# Generate 1000 characters per text. After the first step only the newly
# sampled character is fed back in; the context is carried by `states`.
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and all sampled characters, per batch element
result = tf.strings.join(result)
end = time.time()
# Prints the raw string tensor (byte strings with escapes, as in the output)
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

tf.Tensor(
[b'Body: Belling all day he has denyer to bowl from Argentine champions I was take nor here that lifted him during that rap window paymead.\\n\\n"It took a message as well as he explains that the franchise to leave, sometimes of Drey Jones, GT. Wose named Pep Guardiola began, there are also me well was as \xe2\x80\x9cmuch not a genuine injury in some of the field.\\n\\nI\\\'d seen a nodd from the top biggest highest promotion said.\\n\\nBradford: 15-9\\n\\nGamp of that mistaka hints at the WSL (2014 and Curry (c), or Mexico) in them have a great older right customal, too kicked a foot common consistent ingraside part of the top best when an injury against Umark Choights -- UFC strawe from the 2019 Cavaliers were on the program and insists to the 2016-20 season. Aldres settled in the last winner. "If there are not predictable after him her goal on our stories. The free that I just take better everything happen before here being talked and then doing this, it was an vehable to