### Coding with tensorflow framework

## Import Modules

In [1]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
# from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

2022-08-07 08:50:53.999907: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-07 08:50:53.999960: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Folder locations used by the cells below (relative paths).
BASE_DIR = "../data/"            # read for the "Images" folder, captions.txt and features.pkl
WORKING_DIR = "../"              # not referenced by the cells visible in this notebook
MODEL_DIR = "../trained_model/"  # where the trained captioning model is saved

## Extract Image Features

In [3]:
# TODO: try InceptionV3 as the feature extractor next time
# load vgg16 model
# model = InceptionV3()
model = VGG16()

# restructure the model: output the second-to-last layer instead of the final layer
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# summary
model.summary()

# Size passed as target_size when images are loaded for feature extraction.
# NOTE(review): "INPUT_WEIGHT" looks like a typo for "INPUT_WIDTH"; the name is
# kept because the feature-extraction cell refers to it.
INPUT_WEIGHT = 224    # VGG16 -> 224 / 299 (InceptionV3)
INPUT_HEIGHT = 224    # VGG16 -> 224 / 299 (InceptionV3)

2022-08-07 08:51:01.308855: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-07 08:51:01.309292: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-07 08:51:01.309414: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-08-07 08:51:01.309478: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-08-07 08:51:01.309539: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# Run every file in the image folder through the truncated network and keep the
# predicted feature array, keyed by the file name up to its first dot.
directory = os.path.join(BASE_DIR, "Images")
features = {}

for img_name in tqdm(os.listdir(directory)):
  # read the picture at the size the network expects and turn it into an array
  pixels = img_to_array(
      load_img(os.path.join(directory, img_name), target_size=(INPUT_WEIGHT, INPUT_HEIGHT))
  )

  # add a leading batch axis, then apply the network-specific preprocessing
  batch = preprocess_input(np.expand_dims(pixels, axis=0))

  # image ID -> features predicted for that single-image batch
  features[img_name.split(".")[0]] = model.predict(batch, verbose=0)

In [None]:
# store features in pickle
# A context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(os.path.join(BASE_DIR, "features.pkl"), "wb") as f:
  pickle.dump(features, f)

In [7]:
# load features from pickle
# NOTE: unpickling can execute arbitrary code -- only load a features.pkl that
# was written by this notebook (or comes from a source you trust).
with open(os.path.join(BASE_DIR, "features.pkl"), "rb") as f:
  features = pickle.load(f)

In [8]:
len(features)

8091

## Load the captions data

In [9]:
# Read the caption file into one string.
with open(os.path.join(BASE_DIR, "captions.txt"), "r") as f:
  next(f)  # skip the first line (presumably a header row -- confirm against the file)
  captions_doc = f.read()

In [None]:
# create mapping of image to captions: {image_id: [caption, caption, ...]}
mapping = {}

# process lines
for line in tqdm(captions_doc.split("\n")):
  # skip blank/too-short lines and lines that have no "file,caption" separator
  if len(line) < 2 or ',' not in line:
    continue

  # Split on the FIRST comma only: the caption text itself may contain commas,
  # and splitting on every comma would silently cut the caption short.
  file_name, caption = line.split(',', 1)

  # image ID is the file name up to its first dot
  image_id = file_name.split('.')[0]
  caption = caption.replace(" .", "")

  # create the list on first sight of this image, then store the caption
  mapping.setdefault(image_id, []).append(caption)

In [None]:
len(mapping)

## Preprocess Text Data

In [None]:
def clean(mapping):
  """Normalise every caption in ``mapping`` in place.

  Each caption is lower-cased, stripped of every character that is not a
  letter or whitespace, reduced to words longer than one character, and
  wrapped in the ``startseq`` / ``endseq`` markers.

  Args:
    mapping: dict whose values are lists of caption strings. The lists are
      modified in place.

  Returns:
    None.
  """
  import re  # local import so this cell does not depend on an import elsewhere

  for captions in mapping.values():
    for i, caption in enumerate(captions):
      # convert to lowercase
      caption = caption.lower()

      # delete digits, special chars, etc. str.replace() treats its pattern as
      # literal text, so a real regex substitution is required here. Whitespace
      # is kept so that words stay separated.
      caption = re.sub(r'[^a-z\s]', '', caption)

      # split()/join collapses any run of whitespace to a single space and
      # drops one-character tokens
      words = [word for word in caption.split() if len(word) > 1]

      # add start and end tags to the caption
      captions[i] = 'startseq ' + " ".join(words) + " endseq"

In [None]:
# before preprocess of text
mapping['1000268201_693b08cb0e']

In [None]:
# preprocess the text
clean(mapping)
mapping['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into a single list, in mapping order.
all_captions = [caption for captions in mapping.values() for caption in captions]

In [None]:
len(all_captions)

In [None]:
all_captions[:10]

In [None]:
# tokenize the text: fit the word -> integer index on every caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# one more than the number of indexed words
# (presumably to leave room for index 0, which padding uses -- confirm against the Keras docs)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
seq = tokenizer.texts_to_sequences(["two dogs on pavement moving toward each other"])[0]
seq

In [None]:
vocab_size

In [None]:
# get maximum length of the caption available
max_length = max(len(caption.split()) for caption in all_captions)
max_length

In [None]:
len(all_captions)

## Train Test Split

In [None]:
import random

# Fixed seed so a kernel restart reproduces the same train/test partition.
SPLIT_SEED = 42

image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.9)

# 90% of the image ids, drawn without replacement from a dedicated seeded RNG
# (leaves the global `random` state untouched)
train = random.Random(SPLIT_SEED).sample(image_ids, split)

# Set membership keeps building the test list linear instead of scanning the
# whole train list for every id.
train_ids = set(train)
test = [item for item in image_ids if item not in train_ids]

In [None]:
# startseq girl going into wooden building endseq
#    X                                         y
# startseq                                    girl
# startseq girl                               going
# startseq girl going                         into
# ...................
# startseq girl going into wooden building    endseq

# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
  """Yield training batches forever, one batch per ``batch_size`` images.

  For every caption of every image, each prefix of the encoded caption becomes
  one sample: the padded prefix is the text input, the next token (one-hot over
  ``vocab_size``) is the target, and the image's feature vector is the image
  input.

  Yields:
    ``[image_features, padded_prefixes], one_hot_targets`` as numpy arrays.
    Samples collected after the last full batch of a pass over ``data_keys``
    are carried into the next pass.
  """
  image_rows, prefix_rows, target_rows = [], [], []
  images_seen = 0
  while True:
    for key in data_keys:
      images_seen += 1

      for caption in mapping[key]:
        # caption text -> list of word indices
        encoded = tokenizer.texts_to_sequences([caption])[0]

        # one (prefix, next-token) pair per position after the first token
        for cut in range(1, len(encoded)):
          prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
          target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]

          image_rows.append(features[key][0])
          prefix_rows.append(prefix)
          target_rows.append(target)

      if images_seen == batch_size:
        yield [np.array(image_rows), np.array(prefix_rows)], np.array(target_rows)

        # start collecting the next batch
        image_rows, prefix_rows, target_rows = [], [], []
        images_seen = 0

## Model Creation
* High-level model diagram \
<img src="../images/architecture-1.png">

* Detail Model Architecture \
<img src="../images/model.png">

In [None]:
# encoder model
# image feature layers
# Take the feature width from the extracted features themselves, so the input
# layer always matches the feature extractor that produced them. A hard-coded
# 2048 does not fit the VGG16 features, which need 4096.
# Each stored value is the prediction for a one-image batch, so the last axis
# is the feature width.
feature_dim = next(iter(features.values())).shape[-1]
inputs1 = Input(shape=(feature_dim, ))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation="relu")(fe1)

# sequence feature layers
inputs2 = Input(shape=(max_length, ))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model: merge the image and text encodings, then predict the next word
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation="relu")(decoder1)
outputs = Dense(vocab_size, activation="softmax")(decoder2)

# NOTE: this rebinds `model`, replacing the feature-extraction network defined
# earlier; re-run that cell before extracting features again.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss="categorical_crossentropy", optimizer="adam")

# plot the model
plot_model(model, show_shapes=True)

In [None]:
# train the model
epochs = 40
batch_size = 64
steps = len(train) // batch_size

for i in range(epochs):
  # create a fresh data generator so each pass starts from the first training image
  generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)

  # fit for exactly one epoch per loop iteration; the outer loop already
  # controls the total number of epochs (epochs=2 here would silently train
  # twice as long as `epochs` says)
  model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [None]:
# save the model
# Create the target folder first so saving does not fail on a fresh checkout.
os.makedirs(MODEL_DIR, exist_ok=True)
model.save(os.path.join(MODEL_DIR, "best_model.h5"))

## Generate Captions for Image

In [None]:
# convert index (integer) -> real word
def idx_to_word(integer, tokenizer):
  """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

  Args:
    integer: word index to look up.
    tokenizer: object exposing a ``word_index`` dict of ``{word: index}``.

  Returns:
    The matching word, or ``None`` when no word has that index.
  """
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  return None

In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
  """Greedily generate a caption for one image.

  Starting from ``'startseq'``, the text so far is encoded and padded, the
  model is asked for the next word, and the most probable word is appended.
  Generation stops after ``max_length`` steps, when the predicted index has no
  word, or right after ``'endseq'`` has been appended.

  Returns:
    The generated text, including the ``startseq`` marker.
  """
  caption = 'startseq'

  for _ in range(max_length):
    # text so far -> word indices -> fixed-length batch of one
    encoded = tokenizer.texts_to_sequences([caption])[0]
    padded = pad_sequences([encoded], max_length)

    # predict the next word and keep the most probable index
    probabilities = model.predict([image, padded], verbose=0)
    next_word = idx_to_word(np.argmax(probabilities), tokenizer)

    # an index without a word ends generation
    if next_word is None:
      break

    caption += " " + next_word

    # the end tag is kept in the output, then generation stops
    if next_word == 'endseq':
      break

  return caption

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# validate with test data
# actual:    one list of tokenised reference captions per test image
# predicted: one tokenised generated caption per test image
actual, predicted = list(), list()

for key in tqdm(test):
  # get actual caption
  captions = mapping[key]

  # predict the caption for image
  y_pred = predict_caption(model, features[key], tokenizer, max_length)

  # split into words
  actual_captions = [caption.split() for caption in captions]
  y_pred = y_pred.split()

  # append to the list
  actual.append(actual_captions)
  predicted.append(y_pred)

# Calculate BLEU score
print(f"BLEU-1: {corpus_bleu(actual, predicted, weights=(1.0, 0.0, 0.0))}")
print(f"BLEU-2: {corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0.0))}")

## Visualize the Results

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

def plot_generate_caption(image_name):
  """Print the stored and the generated captions for an image, then show it.

  Args:
    image_name: file name inside the "Images" folder,
      e.g. "1001773457_577c3a7d70.jpg".
  """
  image_id = image_name.split('.')[0]
  picture = Image.open(os.path.join(BASE_DIR, "Images", image_name))
  references = mapping[image_id]

  print('---------------------Actual---------------------')
  for reference in references:
    print(reference)

  # generate the caption before printing the second header
  generated = predict_caption(model, features[image_id], tokenizer, max_length)
  print('--------------------Predicted--------------------')
  print(generated)

  plt.imshow(picture)

In [None]:
plot_generate_caption("1030985833_b0902ea560.jpg")

In [None]:
plot_generate_caption("1055753357_4fa3d8d693.jpg")

In [None]:
plot_generate_caption("109738916_236dc456ac.jpg")

In [None]:
# should change <start> -> startseq and <end> -> endseq

## NEXT

# Dataset Information

Develop a Deep Learning program to identify when an article might be fake news

## Attributes
* id: unique id for a new article
* title: the title of a news article
* author: author of the news article
* text: the text of the article; could be incomplete
* label: a label that marks the article as potentially unreliable
  + 1: unreliable
  + 0: reliable


* source ref: https://github.com/aswintechguy/Deep-Learning-Projects/tree/main/Fake%20News%20Detection%20Analysis%20-%20LSTM%20Classification

## Import Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import warnings

%matplotlib inline

warnings.filterwarnings('ignore')

## Loading the Dataset

* URL dataset: https://www.kaggle.com/c/fake-news/data?select=test.csv

In [None]:
df = pd.read_csv("train.csv")
df.head()

In [None]:
df["title"][0]

In [None]:
df["text"][0]

In [None]:
df.info()

## Data Processing

In [None]:
# drop unnecessary columns (only the remaining columns are used from here on)
df = df.drop(columns=['id', 'title', 'author'], axis=1)

In [None]:
# drop null values
df = df.dropna(axis=0)

In [None]:
# lowercase the article text into a new column; special characters and
# punctuation are removed in the following cell
df['clean_news'] = df['text'].str.lower()
df['clean_news']

In [None]:
# Strip everything except letters, digits, whitespace and apostrophes.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# literal text by default, which would turn these steps into no-ops.
df['clean_news'] = df['clean_news'].str.replace(r"[^A-Za-z0-9\s']", "", regex=True)

# Collapse every run of whitespace (newlines included) into one space.
# Newlines are handled here rather than deleted outright, because deleting
# them would glue the last word of one line to the first word of the next.
df['clean_news'] = df['clean_news'].str.replace(r"\s+", " ", regex=True)

In [None]:
# remove stop words
from nltk.corpus import stopwords
# A set gives constant-time membership tests; the check below runs once per
# word of every article.
stop = set(stopwords.words('english'))

df['clean_news'] = df['clean_news'].apply(lambda x: " ".join([word for word in x.split() if word not in stop]))
df.head()

## Exploratory Data Analysis

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud of the most frequent words across all cleaned articles.
all_words = " ".join(df['clean_news'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words)

# draw the cloud without axes
plt.figure(figsize=(15, 9))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the most frequent words in articles labelled 0 (reliable).
all_words = " ".join(df.loc[df["label"] == 0, "clean_news"])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words)

# draw the cloud without axes
plt.figure(figsize=(15, 9))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the most frequent words in articles labelled 1 (unreliable).
all_words = " ".join(df.loc[df["label"] == 1, "clean_news"])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words)

# draw the cloud without axes
plt.figure(figsize=(15, 9))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## Create Word Embeddings

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# tokenize text: fit the word -> integer index on the cleaned articles
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_news'])

word_index = tokenizer.word_index
# number of distinct indexed words; later cells size the embedding with vocab_size+1
vocab_size = len(word_index)
vocab_size

In [None]:
# padding data
# turn each article into a list of word indices
sequences = tokenizer.texts_to_sequences(df["clean_news"])
# force every article to length 700: longer ones are cut at the end,
# shorter ones are padded at the end
padded_seq = pad_sequences(sequences, maxlen=700, padding="post", truncating="post")

In [None]:
# Parse the GloVe text file into {word: float32 vector}.
# On every line the first whitespace-separated token is the word and the
# remaining tokens are its coefficients.
embedding_index = {}

with open('glove.6B.100d.txt', encoding='utf-8') as f:
  for line in f:
    parts = line.split()
    embedding_index[parts[0]] = np.asarray(parts[1:], dtype=np.float32)

In [None]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# with tokenizer index i; rows for words without a pretrained vector stay zero.
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
  vector = embedding_index.get(word)
  if vector is None:
    continue
  embedding_matrix[i] = vector

In [None]:
embedding_matrix[0]

In [None]:
embedding_matrix[1]

## Input Split

In [None]:
padded_seq[0]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_seq, df["label"], test_size=0.2, random_state=42, stratify=df["label"])


## Model Training

In [None]:
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

# Binary classifier: frozen pretrained embeddings -> two stacked LSTMs ->
# dense head with a single sigmoid output.
# NOTE(review): Dense(512) and Dense(256) are created without an activation
# argument -- confirm that leaving them without a non-linearity is intended.
model = Sequential([
    Embedding(vocab_size+1, 100, weights=[embedding_matrix], trainable=False), 
    Dropout(0.2), 
    LSTM(128, return_sequences=True),
    LSTM(128), 
    Dropout(0.2), 
    Dense(512), 
    Dropout(0.2), 
    Dense(256), 
    Dropout(0.2), 
    Dense(1, activation="sigmoid")

])

# create checkpoint: the callback is configured to save weights only, to path_checkpoint
path_checkpoint = "./checkpoint.ckpt"
callback = ModelCheckpoint(filepath=path_checkpoint, save_weights_only=True, verbose=1)

In [None]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [None]:
# train the model
# validation_data is passed as an (inputs, targets) tuple, the form documented
# for Model.fit; a list can be misread as a list of inputs.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=128,
    validation_data=(X_test, y_test),
    callbacks=[callback]
)

In [None]:
# save model
model.save("./best_model.h5")

# load model, option 1:
# 1. create a model instance | 2. load into it
# (create_model is not defined in this notebook; NOTE(review): `model.load` is
# probably meant to be `model.load_weights` -- confirm)
# model = create_model()
# model.load("./best_model.h5")

# load model, option 2:
# new_model = tf.keras.models.load_model("./best_model.h5")

# save only the weights
# model.save_weights("./save_weight.ckpt")

In [None]:
# load model from checkpoint_weight

# create a new model instance
# model.save_weights()
# model = create_model()
# model.load_weights(path_checkpoint)

In [None]:
# visualize the results
# accuracy per epoch: training vs. validation
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(["Train", "Test"])
plt.show()

# loss per epoch: the second curve must be the validation loss; plotting
# "loss" twice would draw the training curve under both legend entries
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(["Train", "Test"])
plt.show()