# TensorFlow Classification

This notebook trains a character-level neural network to classify URLs as phishing or legitimate based solely on the domain name.

In [None]:
import os
import re

from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import pandas as pd
import tensorflow as tf

In [None]:
# Record the TensorFlow version this notebook was executed with.
print(f"{tf.__version__}")

2.17.0


## Load the dataset

In [None]:
# Pull configuration from the environment (load_dotenv is called first so a
# local .env file, if present, can supply the variables).
load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail fast with an actionable message instead of letting create_engine
    # choke on None further down.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file."
    )

# The "options" connect arg forces the session time zone to UTC.
# NOTE(review): this option syntax looks PostgreSQL-specific; confirm the driver.
engine = create_engine(DATABASE_URL, connect_args={"options": "-c timezone=utc"})

# Read the whole "url" table, indexed by its "id" column, parsing both
# timestamp columns as datetimes.
df = pd.read_sql_table("url", engine, index_col="id", parse_dates=["created_at", "updated_at"])

In [None]:
# Seeded so the preview is reproducible across "Restart & Run All"
# (same seed as the train/test split further down).
df.sample(10, random_state=42)

id,source_id,url,is_phishing,is_online,created_at,updated_at
41568,1,https://www.chevrolet.com.co,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
406238,2,universityframes.com/schools.html,False,False,2024-11-15 12:45:38.796488,2024-11-15 12:45:38.796488
588446,2,huffingtonpost.com/david-kroodsma/sergeant-sav...,False,False,2024-11-15 12:45:38.796488,2024-11-15 12:45:38.796488
469113,2,semart.ugm.ac.id,True,False,2024-11-15 12:45:38.796488,2024-11-15 12:45:38.796488
33262,1,https://www.janeyolen.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
523758,2,chicago.blockshopper.com/property/150511201700...,False,False,2024-11-15 12:45:38.796488,2024-11-15 12:45:38.796488
100666,1,https://docs.google.com/forms/d/1aegowsjpbld0r...,True,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
482775,2,amazon.com/Bill-Wenningtons-Tales-Bulls-Hardwo...,False,False,2024-11-15 12:45:38.796488,2024-11-15 12:45:38.796488
188363,1,https://www.microwavetelemetry.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
586042,2,bestaviation.net/,False,False,2024-11-15 12:45:38.796488,2024-11-15 12:45:38.796488


## Data cleaning

In [None]:
def extract_domain_with_regex(url):
    """Return the host part of ``url`` without scheme, ``www.`` prefix, port, path, query or fragment.

    Args:
        url: The raw URL string, with or without an ``http://``/``https://``
            scheme (matched case-insensitively).

    Returns:
        The host as it appears in ``url`` (original casing preserved), or
        ``None`` when ``url`` is not a string or no host can be found
        (e.g. ``""`` or ``"https://"``).
    """
    if not isinstance(url, str):
        # Missing values (None / NaN) are not URLs; callers drop None results.
        return None

    # Strip the optional scheme and "www." prefix first. Doing this as a
    # separate step prevents the host match from backtracking into the scheme
    # (which used to turn "https://" into the "domain" "https").
    remainder = re.sub(r'^(?:https?://)?(?:www\.)?', '', url, count=1, flags=re.IGNORECASE)

    # The host ends at the first port (":"), path ("/"), query ("?"),
    # fragment ("#") or newline.
    match = re.match(r'[^:/\n?#]+', remainder)
    return match.group(0) if match else None

In [None]:
# Derive a 'domain' column by running the extractor over every URL
df['domain'] = df['url'].map(extract_domain_with_regex)

In [None]:
# Keep only the rows for which a domain could be extracted
df = df.dropna(axis="index", how="any", subset=["domain"])

In [None]:
# Show the first five rows, including the new 'domain' column
df.head(n=5)

id,source_id,url,is_phishing,is_online,created_at,updated_at,domain
1,1,https://www.southbankmosaics.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064,southbankmosaics.com
2,1,https://www.uni-mainz.de,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064,uni-mainz.de
3,1,https://www.voicefmradio.co.uk,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064,voicefmradio.co.uk
4,1,https://www.sfnmjournal.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064,sfnmjournal.com
5,1,https://www.rewildingargentina.org,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064,rewildingargentina.org


## Data preprocessing

In [None]:
# Model input: the extracted domain strings
urls = df.loc[:, 'domain']
# Target: the is_phishing flag cast to integers
labels = df.loc[:, 'is_phishing'].astype(int)

### Tokenization

In [None]:
# Character-level tokenization: every character of a domain becomes one token id
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,
)
# Learn the character vocabulary, then encode each domain as a list of ids
tokenizer.fit_on_texts(texts=urls)
sequences = tokenizer.texts_to_sequences(texts=urls)

In [None]:
# Bring every encoded domain to the same fixed width so they stack into one array
max_length = 300
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
    padding="pre",      # library default, spelled out
    truncating="pre",   # library default, spelled out
)

### Split the dataset

In [None]:
# Hold out 20% of the samples for the final evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Build the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LSTM, Bidirectional

# Define the model: character embedding -> bidirectional LSTM -> dense head
# that emits a single phishing probability.
model = Sequential([
    # Declare the input shape explicitly so the model is built before
    # summary(); Embedding's `input_length` argument is deprecated in the
    # Keras release bundled with TF 2.17.
    Input(shape=(max_length,)),
    # Vocabulary size + 1: one extra embedding row beyond the fitted
    # vocabulary (pad_sequences pads with 0 by default).
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    # Binary classification: one sigmoid unit giving P(phishing).
    # A softmax over a single unit would always output exactly 1.0.
    Dense(1, activation='sigmoid')
])

# Compile the model with the loss that matches a single sigmoid output and
# integer 0/1 labels.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

TypeError: cannot unpack non-iterable module object

## Train the model

In [None]:
# Fit on the training split, holding back 20% of it for validation;
# the returned History object is kept for later inspection.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

AttributeError: module 'tensorflow.python.distribute.input_lib' has no attribute 'DistributedDatasetInterface'

## Evaluate the model

In [None]:
# Score the trained model on the held-out test split; evaluate() yields the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.2f}")