In [1]:
import sys
sys.path.append('..')  # this appends the parent directory to the sys.path list
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
import tensorflow as tf
from nltk.util import ngrams
import itertools
from Utils import DataProcessing, FeatureCreation, Model
# Notebook-wide hyperparameters.
# Character n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Fixed token-sequence length: used as the vectorizer's output_sequence_length
# and passed as the first argument to Model.create_model.
Sequence_length = 200
# Number of training epochs passed to model.fit.
epochs = 2
# NOTE(review): not referenced by any other cell visible in this notebook.
n_classes = 2

In [None]:
# Resolve the TPU cluster and connect this runtime to it.
# NOTE(review): tpu='' presumably lets the resolver auto-detect the TPU attached
# to this runtime (e.g. Colab) — confirm for other environments.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# TPU system initialization must run at the very beginning, before any other
# TPU work is done in this session.
tf.tpu.experimental.initialize_tpu_system(resolver)
# Sanity check: list the logical TPU devices that are now visible.
print("All devices: ", tf.config.list_logical_devices('TPU'))

In [None]:
# Distribution strategy built from the resolved TPU cluster; later cells read
# strategy.num_replicas_in_sync and build the model inside strategy.scope().
strategy = tf.distribute.TPUStrategy(resolver)

In [2]:
from sklearn.model_selection import train_test_split
# Load the full dataset from the repository-relative CSV. The 'type' column is
# used for stratified splitting in the next cell.
url_dataframe = pd.read_csv("../dataset/df_final.csv")

In [3]:
# Fixed seed so the train/validation/test membership is identical on every
# Restart & Run All; without it the vocabulary adapted on the training split and
# all reported metrics change from run to run.
SPLIT_SEED = 42

# 80/20 split into (train + validation) and test, stratified on the 'type' label.
train_df, test_df = train_test_split(
    url_dataframe,
    test_size=0.20,
    stratify=url_dataframe['type'],
    random_state=SPLIT_SEED,
)
# Second 80/20 split of the remainder into train and validation
# (overall: 64% train, 16% validation, 20% test).
train_df, val_df = train_test_split(
    train_df,
    test_size=0.20,
    stratify=train_df['type'],
    random_state=SPLIT_SEED,
)

In [4]:
# Global batch size: 16 examples per replica, times the number of replicas.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
Batch_Size = BATCH_SIZE  # alias kept because other cells use this spelling
# Not passed to the vectorizer below, so the vocabulary size is left uncapped.
max_tokens = 25000
# Number of elements held in the shuffle buffer.
SHUFFLE_BUFFER_SIZE = 10000

steps_per_epoch = DataProcessing.steps_per_epoch(train_df, Batch_Size)
validation_steps = DataProcessing.steps_per_epoch(val_df, Batch_Size)

# Batched dataset of (x, z) pairs; x[0] is the raw text, and x[1] and z are
# forwarded untouched to DataProcessing.vectorize_text
# (z is presumably the label — confirm against DataProcessing).
train_raw_ds = DataProcessing.process_train_data(train_df, Batch_Size)

# Character-level n-gram vectorizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt to build the vocabulary.
train_text = train_raw_ds.map(lambda x, z: x[0])
Vectorize_Layer.adapt(train_text)

# Training input pipeline.
# cache() is placed BEFORE shuffle()/repeat(): a cache is only finalized once its
# input has been fully iterated, so caching after repeat() (an infinite dataset)
# never completes and keeps growing in memory. Caching the finite, unbatched
# examples is filled during the first pass and reused afterwards, while shuffle
# still produces a fresh order each epoch.
train_ds = (
    train_raw_ds
    .unbatch()
    .cache()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .repeat()
    .batch(Batch_Size)
    .map(lambda x, z: DataProcessing.vectorize_text(x[0], x[1], z, Vectorize_Layer))
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [5]:
# Build the validation and test datasets (in that order) with the same batch
# size and the vectorizer that was adapted on the training text.
val_ds, test_ds = (
    DataProcessing.process_data(split_df, Batch_Size, Vectorize_Layer)
    for split_df in (val_df, test_df)
)

In [6]:
# Replace the earlier placeholder with the actual size of the adapted
# vocabulary; this value is passed to Model.create_model in the next cell.
max_tokens = len(Vectorize_Layer.get_vocabulary())

In [7]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    # NOTE(review): the meaning of the literal 18 is defined by
    # Model.create_model and is not visible here — confirm and name it.
    model = Model.create_model(Sequence_length, max_tokens, 18)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    # Stop on stagnating validation loss and roll back to the best weights.
    # NOTE(review): patience=40 exceeds the configured number of epochs (2), so
    # this callback cannot trigger with the current settings.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        patience=40,
        monitor='val_loss',
        mode='min',
        restore_best_weights=True,
    )
    # steps_per_execution batches 64 training steps into each execution call.
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
        steps_per_execution=64,
    )

In [9]:
# Train on the repeating training dataset. Because that dataset never ends,
# steps_per_epoch defines where each epoch stops; validation runs for
# validation_steps batches after every epoch.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    validation_steps=validation_steps,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[early_stopping],
)

In [10]:
from Utils import Evaluation
# Evaluate the trained model on the held-out test dataset.
# NOTE(review): 0.5 is presumably the decision threshold applied to the model's
# output — confirm against Evaluation.evaluate_model.
result = Evaluation.evaluate_model(model, test_ds, 0.5)