# Implementation of N-Gram

## 3-GRAM

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Experiment configuration for this cell.
# n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Token ids per URL: vectorizer output_sequence_length and Embedding input_length.
Sequence_length = 200
# Samples per tf.data batch.
Batch_Size = 64
# Number of epochs passed to model.fit.
epochs = 10
# Not referenced anywhere else in this cell.
n_classes = 2
# Vocabulary cap of the vectorizer; also used as the Embedding input dimension.
max_tokens = 5000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase, purely alphanumeric string.

    The text is lowercased, every occurrence of "http://", "https://" and
    "www." is removed, and finally every character outside [A-Za-z0-9]
    (punctuation, whitespace, non-ASCII) is dropped.

    Args:
        link: Raw URL text. Non-string values (e.g. the NaN/None of a missing
            CSV cell) are returned unchanged so that a later ``dropna()`` can
            discard them instead of this function raising AttributeError.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    # Convert all letters to lowercase
    link = link.lower()

    # NOTE: these fragments are removed wherever they occur, not only as a prefix.
    for fragment in ("http://", "https://", "www."):
        link = link.replace(fragment, "")

    # Remove punctuation, whitespace and special characters in one pass
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
url_dataframe = pd.read_csv("../dataset/df_final.csv")
# na_action='ignore' leaves missing URLs as NaN instead of passing them to
# preprocess_link (which calls str methods); dropna() below removes them.
url_dataframe['url'] = url_dataframe['url'].map(preprocess_link, na_action='ignore')
url_dataframe = url_dataframe.drop_duplicates(subset=['url'])
url_dataframe = url_dataframe.dropna().reset_index(drop=True)

import tensorflow as tf
# Fixed seed so the 80/20 split, and therefore the reported metrics, are
# reproducible from run to run.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20, random_state=42)

train_ds = tf.data.Dataset.from_tensor_slices((train_df['url'].values, train_df['type'].values))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['url'].values, test_df['type'].values))

train_ds = train_ds.batch(Batch_Size)
test_ds = test_ds.batch(Batch_Size)

# Character-level n-gram tokenizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt
# (the vocabulary is learned from the training split only).
train_text = train_ds.map(lambda x, y: x)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through
    ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(text_column)
    return token_ids, label

# Replace the raw URL strings in both splits with their token-id sequences.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized batches and prefetch them while the model is busy.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    train_ds,
    epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Score a binary classifier on a batched dataset and print the metrics.

    Args:
        model: Object whose ``predict(test_dataset)`` returns one score per
            sample; scores are expected to lie in [0, 1].
        test_dataset: Iterable of ``(features, labels)`` batches. It is
            traversed twice (labels first, predictions second), so it must
            yield its batches in the same order both times.

    Returns:
        ``[confusion_matrix, accuracy, precision, recall, f1]``; precision,
        recall and F1 are computed with ``average='weighted'``.
    """
    # Gather the labels one batch at a time instead of one scalar at a time.
    label_batches = [np.asarray(labels).ravel() for _, labels in test_dataset]
    y_true = np.concatenate(label_batches) if label_batches else np.array([])

    # Threshold at 0.5. For scores in [0, 1] this matches round(), which
    # also sends exactly 0.5 to class 0.
    scores = np.asarray(model.predict(test_dataset)).ravel()
    y_pred = (scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Evaluate the model
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metric list to two pickle files, model first.
artifacts = (
    ("3_gram_200_5k_model.pkl", model),
    ("3_gram_200_5k_result.pkl", result),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

2023-03-04 00:56:48.964391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 00:56:49.080740: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-04 00:56:49.080767: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-04 00:56:49.105188: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-04 00:56:49.630240: W tensorflow/stream_executor/platform/de

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           320000    
                                                                 
 lstm (LSTM)                 (None, 200, 64)           33024     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 390,273
Trainable params: 390,273
Non-trai



INFO:tensorflow:Assets written to: ram://d6004f0e-44c6-41bb-8192-5836045e03cd/assets


INFO:tensorflow:Assets written to: ram://d6004f0e-44c6-41bb-8192-5836045e03cd/assets


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Experiment configuration for this cell.
# n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Token ids per URL: vectorizer output_sequence_length and Embedding input_length.
Sequence_length = 200
# Samples per tf.data batch.
Batch_Size = 64
# Number of epochs passed to model.fit.
epochs = 10
# Not referenced anywhere else in this cell.
n_classes = 2
# Vocabulary cap of the vectorizer; also used as the Embedding input dimension.
max_tokens = 10000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase, purely alphanumeric string.

    The text is lowercased, every occurrence of "http://", "https://" and
    "www." is removed, and finally every character outside [A-Za-z0-9]
    (punctuation, whitespace, non-ASCII) is dropped.

    Args:
        link: Raw URL text. Non-string values (e.g. the NaN/None of a missing
            CSV cell) are returned unchanged so that a later ``dropna()`` can
            discard them instead of this function raising AttributeError.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    # Convert all letters to lowercase
    link = link.lower()

    # NOTE: these fragments are removed wherever they occur, not only as a prefix.
    for fragment in ("http://", "https://", "www."):
        link = link.replace(fragment, "")

    # Remove punctuation, whitespace and special characters in one pass
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
url_dataframe = pd.read_csv("../dataset/df_final.csv")
# na_action='ignore' leaves missing URLs as NaN instead of passing them to
# preprocess_link (which calls str methods); dropna() below removes them.
url_dataframe['url'] = url_dataframe['url'].map(preprocess_link, na_action='ignore')
url_dataframe = url_dataframe.drop_duplicates(subset=['url'])
url_dataframe = url_dataframe.dropna().reset_index(drop=True)

import tensorflow as tf
# Fixed seed so the 80/20 split, and therefore the reported metrics, are
# reproducible from run to run.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20, random_state=42)

train_ds = tf.data.Dataset.from_tensor_slices((train_df['url'].values, train_df['type'].values))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['url'].values, test_df['type'].values))

train_ds = train_ds.batch(Batch_Size)
test_ds = test_ds.batch(Batch_Size)

# Character-level n-gram tokenizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt
# (the vocabulary is learned from the training split only).
train_text = train_ds.map(lambda x, y: x)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through
    ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(text_column)
    return token_ids, label

# Replace the raw URL strings in both splits with their token-id sequences.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized batches and prefetch them while the model is busy.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    train_ds,
    epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Score a binary classifier on a batched dataset and print the metrics.

    Args:
        model: Object whose ``predict(test_dataset)`` returns one score per
            sample; scores are expected to lie in [0, 1].
        test_dataset: Iterable of ``(features, labels)`` batches. It is
            traversed twice (labels first, predictions second), so it must
            yield its batches in the same order both times.

    Returns:
        ``[confusion_matrix, accuracy, precision, recall, f1]``; precision,
        recall and F1 are computed with ``average='weighted'``.
    """
    # Gather the labels one batch at a time instead of one scalar at a time.
    label_batches = [np.asarray(labels).ravel() for _, labels in test_dataset]
    y_true = np.concatenate(label_batches) if label_batches else np.array([])

    # Threshold at 0.5. For scores in [0, 1] this matches round(), which
    # also sends exactly 0.5 to class 0.
    scores = np.asarray(model.predict(test_dataset)).ravel()
    y_pred = (scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Evaluate the model
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metric list to two pickle files, model first.
artifacts = (
    ("3_gram_200_10k_model.pkl", model),
    ("3_gram_200_10k_result.pkl", result),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 64)           640000    
                                                                 
 lstm_2 (LSTM)               (None, 200, 64)           33024     
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 710,273
Trainable params: 710,273
Non-tr



INFO:tensorflow:Assets written to: ram://23c3bf3e-892f-4354-bc0d-873837bde72d/assets


INFO:tensorflow:Assets written to: ram://23c3bf3e-892f-4354-bc0d-873837bde72d/assets


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Experiment configuration for this cell.
# n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Token ids per URL: vectorizer output_sequence_length and Embedding input_length.
Sequence_length = 200
# Samples per tf.data batch.
Batch_Size = 64
# Number of epochs passed to model.fit.
epochs = 10
# Not referenced anywhere else in this cell.
n_classes = 2
# Vocabulary cap of the vectorizer; also used as the Embedding input dimension.
max_tokens = 15000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase, purely alphanumeric string.

    The text is lowercased, every occurrence of "http://", "https://" and
    "www." is removed, and finally every character outside [A-Za-z0-9]
    (punctuation, whitespace, non-ASCII) is dropped.

    Args:
        link: Raw URL text. Non-string values (e.g. the NaN/None of a missing
            CSV cell) are returned unchanged so that a later ``dropna()`` can
            discard them instead of this function raising AttributeError.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    # Convert all letters to lowercase
    link = link.lower()

    # NOTE: these fragments are removed wherever they occur, not only as a prefix.
    for fragment in ("http://", "https://", "www."):
        link = link.replace(fragment, "")

    # Remove punctuation, whitespace and special characters in one pass
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
url_dataframe = pd.read_csv("../dataset/df_final.csv")
# na_action='ignore' leaves missing URLs as NaN instead of passing them to
# preprocess_link (which calls str methods); dropna() below removes them.
url_dataframe['url'] = url_dataframe['url'].map(preprocess_link, na_action='ignore')
url_dataframe = url_dataframe.drop_duplicates(subset=['url'])
url_dataframe = url_dataframe.dropna().reset_index(drop=True)

import tensorflow as tf
# Fixed seed so the 80/20 split, and therefore the reported metrics, are
# reproducible from run to run.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20, random_state=42)

train_ds = tf.data.Dataset.from_tensor_slices((train_df['url'].values, train_df['type'].values))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['url'].values, test_df['type'].values))

train_ds = train_ds.batch(Batch_Size)
test_ds = test_ds.batch(Batch_Size)

# Character-level n-gram tokenizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt
# (the vocabulary is learned from the training split only).
train_text = train_ds.map(lambda x, y: x)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through
    ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(text_column)
    return token_ids, label

# Replace the raw URL strings in both splits with their token-id sequences.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized batches and prefetch them while the model is busy.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    train_ds,
    epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Score a binary classifier on a batched dataset and print the metrics.

    Args:
        model: Object whose ``predict(test_dataset)`` returns one score per
            sample; scores are expected to lie in [0, 1].
        test_dataset: Iterable of ``(features, labels)`` batches. It is
            traversed twice (labels first, predictions second), so it must
            yield its batches in the same order both times.

    Returns:
        ``[confusion_matrix, accuracy, precision, recall, f1]``; precision,
        recall and F1 are computed with ``average='weighted'``.
    """
    # Gather the labels one batch at a time instead of one scalar at a time.
    label_batches = [np.asarray(labels).ravel() for _, labels in test_dataset]
    y_true = np.concatenate(label_batches) if label_batches else np.array([])

    # Threshold at 0.5. For scores in [0, 1] this matches round(), which
    # also sends exactly 0.5 to class 0.
    scores = np.asarray(model.predict(test_dataset)).ravel()
    y_pred = (scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Evaluate the model
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metric list to two pickle files, model first.
artifacts = (
    ("3_gram_200_15k_model.pkl", model),
    ("3_gram_200_15k_result.pkl", result),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 200, 64)           960000    
                                                                 
 lstm_4 (LSTM)               (None, 200, 64)           33024     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,030,273
Trainable params: 1,030,273
No



INFO:tensorflow:Assets written to: ram://807d443d-14c5-4203-87ca-fef92b80eef3/assets


INFO:tensorflow:Assets written to: ram://807d443d-14c5-4203-87ca-fef92b80eef3/assets


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Experiment configuration for this cell.
# n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Token ids per URL: vectorizer output_sequence_length and Embedding input_length.
Sequence_length = 200
# Samples per tf.data batch.
Batch_Size = 64
# Number of epochs passed to model.fit.
epochs = 10
# Not referenced anywhere else in this cell.
n_classes = 2
# Vocabulary cap of the vectorizer; also used as the Embedding input dimension.
max_tokens =20000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase, purely alphanumeric string.

    The text is lowercased, every occurrence of "http://", "https://" and
    "www." is removed, and finally every character outside [A-Za-z0-9]
    (punctuation, whitespace, non-ASCII) is dropped.

    Args:
        link: Raw URL text. Non-string values (e.g. the NaN/None of a missing
            CSV cell) are returned unchanged so that a later ``dropna()`` can
            discard them instead of this function raising AttributeError.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    # Convert all letters to lowercase
    link = link.lower()

    # NOTE: these fragments are removed wherever they occur, not only as a prefix.
    for fragment in ("http://", "https://", "www."):
        link = link.replace(fragment, "")

    # Remove punctuation, whitespace and special characters in one pass
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
url_dataframe = pd.read_csv("../dataset/df_final.csv")
# na_action='ignore' leaves missing URLs as NaN instead of passing them to
# preprocess_link (which calls str methods); dropna() below removes them.
url_dataframe['url'] = url_dataframe['url'].map(preprocess_link, na_action='ignore')
url_dataframe = url_dataframe.drop_duplicates(subset=['url'])
url_dataframe = url_dataframe.dropna().reset_index(drop=True)

import tensorflow as tf
# Fixed seed so the 80/20 split, and therefore the reported metrics, are
# reproducible from run to run.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20, random_state=42)

train_ds = tf.data.Dataset.from_tensor_slices((train_df['url'].values, train_df['type'].values))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['url'].values, test_df['type'].values))

train_ds = train_ds.batch(Batch_Size)
test_ds = test_ds.batch(Batch_Size)

# Character-level n-gram tokenizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt
# (the vocabulary is learned from the training split only).
train_text = train_ds.map(lambda x, y: x)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through
    ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(text_column)
    return token_ids, label

# Replace the raw URL strings in both splits with their token-id sequences.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized batches and prefetch them while the model is busy.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    train_ds,
    epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Score a binary classifier on a batched dataset and print the metrics.

    Args:
        model: Object whose ``predict(test_dataset)`` returns one score per
            sample; scores are expected to lie in [0, 1].
        test_dataset: Iterable of ``(features, labels)`` batches. It is
            traversed twice (labels first, predictions second), so it must
            yield its batches in the same order both times.

    Returns:
        ``[confusion_matrix, accuracy, precision, recall, f1]``; precision,
        recall and F1 are computed with ``average='weighted'``.
    """
    # Gather the labels one batch at a time instead of one scalar at a time.
    label_batches = [np.asarray(labels).ravel() for _, labels in test_dataset]
    y_true = np.concatenate(label_batches) if label_batches else np.array([])

    # Threshold at 0.5. For scores in [0, 1] this matches round(), which
    # also sends exactly 0.5 to class 0.
    scores = np.asarray(model.predict(test_dataset)).ravel()
    y_pred = (scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Evaluate the model
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metric list to two pickle files, model first.
artifacts = (
    ("3_gram_200_20k_model.pkl", model),
    ("3_gram_200_20k_result.pkl", result),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 64)           1280000   
                                                                 
 lstm_6 (LSTM)               (None, 200, 64)           33024     
                                                                 
 lstm_7 (LSTM)               (None, 64)                33024     
                                                                 
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,350,273
Trainable params: 1,350,273
No



INFO:tensorflow:Assets written to: ram://10451035-61d9-4ec9-a8fc-3851c3a19b8b/assets


INFO:tensorflow:Assets written to: ram://10451035-61d9-4ec9-a8fc-3851c3a19b8b/assets


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Experiment configuration for this cell.
# n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Token ids per URL: vectorizer output_sequence_length and Embedding input_length.
Sequence_length = 200
# Samples per tf.data batch.
Batch_Size = 64
# Number of epochs passed to model.fit.
epochs = 10
# Not referenced anywhere else in this cell.
n_classes = 2
# Vocabulary cap of the vectorizer; also used as the Embedding input dimension.
max_tokens = 25000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase, purely alphanumeric string.

    The text is lowercased, every occurrence of "http://", "https://" and
    "www." is removed, and finally every character outside [A-Za-z0-9]
    (punctuation, whitespace, non-ASCII) is dropped.

    Args:
        link: Raw URL text. Non-string values (e.g. the NaN/None of a missing
            CSV cell) are returned unchanged so that a later ``dropna()`` can
            discard them instead of this function raising AttributeError.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    # Convert all letters to lowercase
    link = link.lower()

    # NOTE: these fragments are removed wherever they occur, not only as a prefix.
    for fragment in ("http://", "https://", "www."):
        link = link.replace(fragment, "")

    # Remove punctuation, whitespace and special characters in one pass
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
url_dataframe = pd.read_csv("../dataset/df_final.csv")
# na_action='ignore' leaves missing URLs as NaN instead of passing them to
# preprocess_link (which calls str methods); dropna() below removes them.
url_dataframe['url'] = url_dataframe['url'].map(preprocess_link, na_action='ignore')
url_dataframe = url_dataframe.drop_duplicates(subset=['url'])
url_dataframe = url_dataframe.dropna().reset_index(drop=True)

import tensorflow as tf
# Fixed seed so the 80/20 split, and therefore the reported metrics, are
# reproducible from run to run.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20, random_state=42)

train_ds = tf.data.Dataset.from_tensor_slices((train_df['url'].values, train_df['type'].values))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['url'].values, test_df['type'].values))

train_ds = train_ds.batch(Batch_Size)
test_ds = test_ds.batch(Batch_Size)

# Character-level n-gram tokenizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt
# (the vocabulary is learned from the training split only).
train_text = train_ds.map(lambda x, y: x)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through
    ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(text_column)
    return token_ids, label

# Replace the raw URL strings in both splits with their token-id sequences.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized batches and prefetch them while the model is busy.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    train_ds,
    epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Score a binary classifier on a batched dataset and print the metrics.

    Args:
        model: Object whose ``predict(test_dataset)`` returns one score per
            sample; scores are expected to lie in [0, 1].
        test_dataset: Iterable of ``(features, labels)`` batches. It is
            traversed twice (labels first, predictions second), so it must
            yield its batches in the same order both times.

    Returns:
        ``[confusion_matrix, accuracy, precision, recall, f1]``; precision,
        recall and F1 are computed with ``average='weighted'``.
    """
    # Gather the labels one batch at a time instead of one scalar at a time.
    label_batches = [np.asarray(labels).ravel() for _, labels in test_dataset]
    y_true = np.concatenate(label_batches) if label_batches else np.array([])

    # Threshold at 0.5. For scores in [0, 1] this matches round(), which
    # also sends exactly 0.5 to class 0.
    scores = np.asarray(model.predict(test_dataset)).ravel()
    y_pred = (scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Evaluate the model
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metric list to two pickle files, model first.
artifacts = (
    ("3_gram_200_25k_model.pkl", model),
    ("3_gram_200_25k_result.pkl", result),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 64)           1600000   
                                                                 
 lstm_8 (LSTM)               (None, 200, 64)           33024     
                                                                 
 lstm_9 (LSTM)               (None, 64)                33024     
                                                                 
 dense_8 (Dense)             (None, 64)                4160      
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,670,273
Trainable params: 1,670,273
No



INFO:tensorflow:Assets written to: ram://820649b6-0cc1-4152-93f2-7c55e3d9a9a0/assets


INFO:tensorflow:Assets written to: ram://820649b6-0cc1-4152-93f2-7c55e3d9a9a0/assets


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Experiment configuration for this cell.
# n-gram size handed to the TextVectorization layer (ngrams=(N_for_NGram,)).
N_for_NGram = 3
# Token ids per URL: vectorizer output_sequence_length and Embedding input_length.
Sequence_length = 200
# Samples per tf.data batch.
Batch_Size = 64
# Number of epochs passed to model.fit.
epochs = 10
# Not referenced anywhere else in this cell.
n_classes = 2
# Vocabulary cap of the vectorizer; also used as the Embedding input dimension.
max_tokens = 30000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase, purely alphanumeric string.

    The text is lowercased, every occurrence of "http://", "https://" and
    "www." is removed, and finally every character outside [A-Za-z0-9]
    (punctuation, whitespace, non-ASCII) is dropped.

    Args:
        link: Raw URL text. Non-string values (e.g. the NaN/None of a missing
            CSV cell) are returned unchanged so that a later ``dropna()`` can
            discard them instead of this function raising AttributeError.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    # Convert all letters to lowercase
    link = link.lower()

    # NOTE: these fragments are removed wherever they occur, not only as a prefix.
    for fragment in ("http://", "https://", "www."):
        link = link.replace(fragment, "")

    # Remove punctuation, whitespace and special characters in one pass
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
url_dataframe = pd.read_csv("../dataset/df_final.csv")
# na_action='ignore' leaves missing URLs as NaN instead of passing them to
# preprocess_link (which calls str methods); dropna() below removes them.
url_dataframe['url'] = url_dataframe['url'].map(preprocess_link, na_action='ignore')
url_dataframe = url_dataframe.drop_duplicates(subset=['url'])
url_dataframe = url_dataframe.dropna().reset_index(drop=True)

import tensorflow as tf
# Fixed seed so the 80/20 split, and therefore the reported metrics, are
# reproducible from run to run.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20, random_state=42)

train_ds = tf.data.Dataset.from_tensor_slices((train_df['url'].values, train_df['type'].values))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['url'].values, test_df['type'].values))

train_ds = train_ds.batch(Batch_Size)
test_ds = test_ds.batch(Batch_Size)

# Character-level n-gram tokenizer producing fixed-length integer sequences.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# Make a text-only dataset (without labels), then call adapt
# (the vocabulary is learned from the training split only).
train_text = train_ds.map(lambda x, y: x)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a (text, label) pair to (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through
    ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(text_column)
    return token_ids, label

# Replace the raw URL strings in both splits with their token-id sequences.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Cache the vectorized batches and prefetch them while the model is busy.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    train_ds,
    epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Score a binary classifier on a batched dataset and print the metrics.

    Args:
        model: Object whose ``predict(test_dataset)`` returns one score per
            sample; scores are expected to lie in [0, 1].
        test_dataset: Iterable of ``(features, labels)`` batches. It is
            traversed twice (labels first, predictions second), so it must
            yield its batches in the same order both times.

    Returns:
        ``[confusion_matrix, accuracy, precision, recall, f1]``; precision,
        recall and F1 are computed with ``average='weighted'``.
    """
    # Gather the labels one batch at a time instead of one scalar at a time.
    label_batches = [np.asarray(labels).ravel() for _, labels in test_dataset]
    y_true = np.concatenate(label_batches) if label_batches else np.array([])

    # Threshold at 0.5. For scores in [0, 1] this matches round(), which
    # also sends exactly 0.5 to class 0.
    scores = np.asarray(model.predict(test_dataset)).ravel()
    y_pred = (scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Evaluate the model
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metric list to two pickle files, model first.
artifacts = (
    ("3_gram_200_30k_model.pkl", model),
    ("3_gram_200_30k_result.pkl", result),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 200, 64)           1920000   
                                                                 
 lstm_10 (LSTM)              (None, 200, 64)           33024     
                                                                 
 lstm_11 (LSTM)              (None, 64)                33024     
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,990,273
Trainable params: 1,990,273
No

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Size of the character n-grams produced by the TextVectorization layer.
N_for_NGram = 3
# Fixed length of every vectorized sequence (also the Embedding input_length).
Sequence_length = 200
# Number of examples per tf.data batch.
Batch_Size = 64
# Number of passes over the training data in model.fit.
epochs = 10
# Number of target classes; not referenced by the code visible in this cell.
n_classes = 2
# Vocabulary cap for the vectorizer and input dimension of the Embedding layer.
max_tokens = 35000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase alphanumeric string.

    The URL is lowercased, every occurrence of ``http://``, ``https://`` and
    ``www.`` is removed, and finally every character that is not an ASCII
    letter or digit is stripped.

    Args:
        link: Raw URL text. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for missing cells) are returned unchanged so
            that a later ``dropna()`` can discard them instead of this
            function raising ``AttributeError``.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    link = link.lower()
    for marker in ("http://", "https://", "www."):
        link = link.replace(marker, "")

    # Spaces and surrounding whitespace are non-alphanumeric too, so this
    # single substitution also takes care of them.
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the URL table, clean every URL, then drop repeated URLs and rows with
# missing values.
url_dataframe = pd.read_csv("../dataset/df_final.csv")
url_dataframe['url'] = url_dataframe['url'].apply(preprocess_link)
url_dataframe = (
    url_dataframe
    .drop_duplicates(subset=['url'])
    .dropna()
    .reset_index(drop=True)
)

# 80/20 split into training and test rows.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20)

# Batched (url, type) datasets for each split.
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_df['url'].values, train_df['type'].values)
).batch(Batch_Size)
test_ds = tf.data.Dataset.from_tensor_slices(
    (test_df['url'].values, test_df['type'].values)
).batch(Batch_Size)

# Character n-gram vectorizer with a capped vocabulary and fixed output length.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# The vocabulary is learned from the training URLs only (labels dropped).
train_text = train_ds.map(lambda text, _label: text)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a ``(text, label)`` pair to ``(token ids, label)``.

    A trailing axis is appended to ``text`` before it is passed through the
    module-level ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(expanded)
    return token_ids, label

# Replace the raw URL strings in every batch with their integer n-gram ids.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Both pipelines are cached and prefetched with an auto-tuned buffer.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with one sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_ds, epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Print and return classification metrics for ``model`` on ``test_dataset``.

    Args:
        model: Object whose ``predict(test_dataset)`` returns an array of
            scores, one per example (assumed to be probabilities in [0, 1],
            as produced by a sigmoid output unit).
        test_dataset: Iterable of ``(features, labels)`` batches whose labels
            expose ``.numpy()``. It is traversed twice (once here for the
            labels, once by ``model.predict``), so it must yield the examples
            in the same order on every pass.

    Returns:
        ``[cm, acc, prec, rec, f1]``: the confusion matrix followed by
        accuracy and the weighted-average precision, recall and F1 score.
    """
    # Collect the labels batch by batch instead of one scalar at a time.
    y_true = np.concatenate(
        [labels.numpy().ravel() for _, labels in test_dataset]
    )

    # Threshold the scores in one vectorized step; a score of exactly 0.5
    # maps to class 0.
    y_scores = model.predict(test_dataset).flatten()
    y_pred = (y_scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Compute the metrics for the trained model on test_ds.
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metrics to sibling pickle files.
for path, payload in (
    ("3_gram_200_35k_model.pkl", model),
    ("3_gram_200_35k_result.pkl", result),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Size of the character n-grams produced by the TextVectorization layer.
N_for_NGram = 3
# Fixed length of every vectorized sequence (also the Embedding input_length).
Sequence_length = 200
# Number of examples per tf.data batch.
Batch_Size = 64
# Number of passes over the training data in model.fit.
epochs = 10
# Number of target classes; not referenced by the code visible in this cell.
n_classes = 2
# Vocabulary cap for the vectorizer and input dimension of the Embedding layer.
max_tokens = 40000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase alphanumeric string.

    The URL is lowercased, every occurrence of ``http://``, ``https://`` and
    ``www.`` is removed, and finally every character that is not an ASCII
    letter or digit is stripped.

    Args:
        link: Raw URL text. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for missing cells) are returned unchanged so
            that a later ``dropna()`` can discard them instead of this
            function raising ``AttributeError``.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    link = link.lower()
    for marker in ("http://", "https://", "www."):
        link = link.replace(marker, "")

    # Spaces and surrounding whitespace are non-alphanumeric too, so this
    # single substitution also takes care of them.
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the URL table, clean every URL, then drop repeated URLs and rows with
# missing values.
url_dataframe = pd.read_csv("../dataset/df_final.csv")
url_dataframe['url'] = url_dataframe['url'].apply(preprocess_link)
url_dataframe = (
    url_dataframe
    .drop_duplicates(subset=['url'])
    .dropna()
    .reset_index(drop=True)
)

# 80/20 split into training and test rows.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20)

# Batched (url, type) datasets for each split.
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_df['url'].values, train_df['type'].values)
).batch(Batch_Size)
test_ds = tf.data.Dataset.from_tensor_slices(
    (test_df['url'].values, test_df['type'].values)
).batch(Batch_Size)

# Character n-gram vectorizer with a capped vocabulary and fixed output length.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=Sequence_length,
)

# The vocabulary is learned from the training URLs only (labels dropped).
train_text = train_ds.map(lambda text, _label: text)
Vectorize_Layer.adapt(train_text)


def vectorize_text(text, label):
    """Map a ``(text, label)`` pair to ``(token ids, label)``.

    A trailing axis is appended to ``text`` before it is passed through the
    module-level ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(expanded)
    return token_ids, label

# Replace the raw URL strings in every batch with their integer n-gram ids.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Both pipelines are cached and prefetched with an auto-tuned buffer.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with one sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_ds, epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Print and return classification metrics for ``model`` on ``test_dataset``.

    Args:
        model: Object whose ``predict(test_dataset)`` returns an array of
            scores, one per example (assumed to be probabilities in [0, 1],
            as produced by a sigmoid output unit).
        test_dataset: Iterable of ``(features, labels)`` batches whose labels
            expose ``.numpy()``. It is traversed twice (once here for the
            labels, once by ``model.predict``), so it must yield the examples
            in the same order on every pass.

    Returns:
        ``[cm, acc, prec, rec, f1]``: the confusion matrix followed by
        accuracy and the weighted-average precision, recall and F1 score.
    """
    # Collect the labels batch by batch instead of one scalar at a time.
    y_true = np.concatenate(
        [labels.numpy().ravel() for _, labels in test_dataset]
    )

    # Threshold the scores in one vectorized step; a score of exactly 0.5
    # maps to class 0.
    y_scores = model.predict(test_dataset).flatten()
    y_pred = (y_scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Compute the metrics for the trained model on test_ds.
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metrics to sibling pickle files.
for path, payload in (
    ("3_gram_200_40k_model.pkl", model),
    ("3_gram_200_40k_result.pkl", result),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import re
from nltk.util import ngrams
import itertools
# Size of the character n-grams produced by the TextVectorization layer.
N_for_NGram = 3
# Fixed length of every vectorized sequence (also the Embedding input_length).
Sequence_length = 200
# Number of examples per tf.data batch.
Batch_Size = 64
# Number of passes over the training data in model.fit.
epochs = 10
# Number of target classes; not referenced by the code visible in this cell.
n_classes = 2
# Placeholder only: the vectorizer in this cell is built without a max_tokens
# cap, and this name is reassigned to the adapted vocabulary size before the
# Embedding layer reads it.
max_tokens = 5000
# Define a function to preprocess the link
def preprocess_link(link):
    """Normalise a raw URL into a lowercase alphanumeric string.

    The URL is lowercased, every occurrence of ``http://``, ``https://`` and
    ``www.`` is removed, and finally every character that is not an ASCII
    letter or digit is stripped.

    Args:
        link: Raw URL text. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for missing cells) are returned unchanged so
            that a later ``dropna()`` can discard them instead of this
            function raising ``AttributeError``.

    Returns:
        The cleaned string, or ``link`` itself when it is not a string.
    """
    if not isinstance(link, str):
        return link

    link = link.lower()
    for marker in ("http://", "https://", "www."):
        link = link.replace(marker, "")

    # Spaces and surrounding whitespace are non-alphanumeric too, so this
    # single substitution also takes care of them.
    return re.sub(r'[^A-Za-z0-9]+', '', link)
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the URL table, clean every URL, then drop repeated URLs and rows with
# missing values.
url_dataframe = pd.read_csv("../dataset/df_final.csv")
url_dataframe['url'] = url_dataframe['url'].apply(preprocess_link)
url_dataframe = (
    url_dataframe
    .drop_duplicates(subset=['url'])
    .dropna()
    .reset_index(drop=True)
)

# 80/20 split into training and test rows.
train_df, test_df = train_test_split(url_dataframe, test_size=0.20)

# Batched (url, type) datasets for each split.
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_df['url'].values, train_df['type'].values)
).batch(Batch_Size)
test_ds = tf.data.Dataset.from_tensor_slices(
    (test_df['url'].values, test_df['type'].values)
).batch(Batch_Size)

# Character n-gram vectorizer with no vocabulary cap and fixed output length.
Vectorize_Layer = tf.keras.layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    split="character",
    ngrams=(N_for_NGram,),
    output_mode='int',
    output_sequence_length=Sequence_length,
)

# The vocabulary is learned from the training URLs only (labels dropped).
train_text = train_ds.map(lambda text, _label: text)
Vectorize_Layer.adapt(train_text)

# Size the Embedding layer from the vocabulary that was actually learned.
max_tokens = len(Vectorize_Layer.get_vocabulary())
print(max_tokens)

def vectorize_text(text, label):
    """Map a ``(text, label)`` pair to ``(token ids, label)``.

    A trailing axis is appended to ``text`` before it is passed through the
    module-level ``Vectorize_Layer``; ``label`` is returned untouched.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = Vectorize_Layer(expanded)
    return token_ids, label

# Replace the raw URL strings in every batch with their integer n-gram ids.
train_ds = train_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

# Both pipelines are cached and prefetched with an auto-tuned buffer.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Embedding -> two stacked LSTMs -> dense head with one sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_tokens, 64, input_length=Sequence_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_ds, epochs=epochs)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Define the evaluation function
def evaluate_model(model, test_dataset):
    """Print and return classification metrics for ``model`` on ``test_dataset``.

    Args:
        model: Object whose ``predict(test_dataset)`` returns an array of
            scores, one per example (assumed to be probabilities in [0, 1],
            as produced by a sigmoid output unit).
        test_dataset: Iterable of ``(features, labels)`` batches whose labels
            expose ``.numpy()``. It is traversed twice (once here for the
            labels, once by ``model.predict``), so it must yield the examples
            in the same order on every pass.

    Returns:
        ``[cm, acc, prec, rec, f1]``: the confusion matrix followed by
        accuracy and the weighted-average precision, recall and F1 score.
    """
    # Collect the labels batch by batch instead of one scalar at a time.
    y_true = np.concatenate(
        [labels.numpy().ravel() for _, labels in test_dataset]
    )

    # Threshold the scores in one vectorized step; a score of exactly 0.5
    # maps to class 0.
    y_scores = model.predict(test_dataset).flatten()
    y_pred = (y_scores > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy:", acc)

    # Precision
    prec = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", prec)

    # Recall
    rec = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", rec)

    # F1 Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1 Score:", f1)

    return [cm, acc, prec, rec, f1]
# Compute the metrics for the trained model on test_ds.
result = evaluate_model(model, test_ds)

import pickle

# Write the model and its metrics to sibling pickle files.
for path, payload in (
    ("3_gram_200_all_model.pkl", model),
    ("3_gram_200_all_result.pkl", result),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)