In [1]:
import pandas as pd
from urllib.parse import urlparse
import re

# Importing Data

In [2]:
# Read the raw URL dataset from "urldata_raw.csv" (path relative to the working
# directory) and preview its first 10 rows.
df = pd.read_csv("urldata_raw.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0
5,5,https://www.reddit.com,benign,0
6,6,https://www.yahoo.com,benign,0
7,7,https://www.google.co.in,benign,0
8,8,https://www.qq.com,benign,0
9,9,https://www.amazon.com,benign,0


In [3]:
# Remove the "Unnamed: 0" column (presumably a row index saved by an earlier
# export -- confirm). Reassigning instead of mutating in place, together with
# errors="ignore", keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Preview the first 10 rows again to check the remaining columns.
df.head(10)

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0
5,https://www.reddit.com,benign,0
6,https://www.yahoo.com,benign,0
7,https://www.google.co.in,benign,0
8,https://www.qq.com,benign,0
9,https://www.amazon.com,benign,0


In [5]:
# Class balance: number of rows for each value of the "label" column.
df["label"].value_counts()

label
benign       345738
malicious    104438
Name: count, dtype: int64

# Feature Extraction

## 1. Lengths
- URL length
- Hostname length
- Path length
- First Directory length

## 2. Feature counts
- Selected non-alphanumeric characters (one column per character)
- 'http'
- 'https'
- 'www'
- Digits
- Letters
- Directory count
- Upper/lower case changes

## 3. Binary features
- Presence of an IP address in the URL

In [6]:
# Length features
# Every length is measured on the URL exactly as stored; the "www." prefix is
# not stripped beforehand.

# Total number of characters in the URL
df["url_length"] = df["url"].apply(len)

# Number of characters in the netloc component reported by urlparse
df["hostname_length"] = df["url"].apply(lambda address: len(urlparse(address).netloc))

# Number of characters in the path component reported by urlparse
df["path_length"] = df["url"].apply(lambda address: len(urlparse(address).path))


# First directory length
def first_directory_length(url: str) -> int:
    """Return the length of the first directory segment of a URL's path.

    The path reported by ``urlparse`` is split on "/" and the second piece
    (the text between the first and second slash) is measured.

    Args:
        url: URL string to inspect.

    Returns:
        Length of the first path segment, or 0 when the path contains no "/"
        (for example ``https://example.com``) or that segment is empty.
    """
    segments = urlparse(url).path.split("/")
    # segments[0] is whatever precedes the first "/"; if there is no second
    # piece the URL has no first directory. Checking the length explicitly
    # avoids a bare ``except`` that would also hide unrelated errors.
    if len(segments) < 2:
        return 0
    return len(segments[1])


# Compute the first-directory length for every URL; the helper is passed to
# apply directly rather than through a wrapper lambda.
df["first_directory_length"] = df["url"].apply(first_directory_length)

In [7]:
# Char Count
# Characters to count: ASCII codes 33-39 (! " # $ % & ') and 58-64
# (: ; < = > ? @). Both ranges are unpacked into one sequence; writing
# `range(33, 40) or ...` would only ever iterate the first range, because
# `or` returns its first truthy operand.
chars_to_count = [
    chr(code)
    for code in (*range(33, 40), *range(58, 65))
    if not chr(code).isalnum()
]

for i in chars_to_count:
    # Bind the current character as a default argument so the lambda does not
    # depend on the loop variable's later value.
    df["count:" + i] = df["url"].apply(lambda url, char=i: url.count(char))

# Substring occurrence counts (str.count, non-overlapping matches).
# Every "https" also contains "http", so it contributes to count:http as well.
df["count:http"] = df["url"].apply(lambda address: address.count("http"))
df["count:https"] = df["url"].apply(lambda address: address.count("https"))
df["count:www"] = df["url"].apply(lambda address: address.count("www"))

# Digit Count: characters for which str.isnumeric() is true
df["count:digits"] = df["url"].apply(
    lambda address: sum(1 for ch in address if ch.isnumeric())
)

# Letter Count: characters for which str.isalpha() is true
df["count:letters"] = df["url"].apply(
    lambda address: sum(1 for ch in address if ch.isalpha())
)

# Directory Count: number of "/" characters in the path component
df["count:directories"] = df["url"].apply(
    lambda address: urlparse(address).path.count("/")
)

In [8]:
# Case Change Count
def count_case_change(input_string: str) -> int:
    """Count how many times the letter case flips along a string.

    Only alphabetic characters are considered; digits, punctuation and other
    non-letters are skipped and do not reset the tracked case. The first
    letter only establishes the starting case and is never counted as a
    change, so an all-upper-case string yields 0. Letters for which
    ``str.isupper()`` is false are treated as lower case.

    Args:
        input_string: Text to scan (a URL in this notebook).

    Returns:
        Number of transitions between upper-case and non-upper-case letters.
    """
    switch_count = 0
    prev_is_upper = None  # None until the first letter has been seen
    for char in input_string:
        if not char.isalpha():
            continue
        is_upper = char.isupper()
        # Compare against an explicit None so the very first letter is not
        # mistaken for a switch away from lower case.
        if prev_is_upper is not None and is_upper != prev_is_upper:
            switch_count += 1
        prev_is_upper = is_upper
    return switch_count


# Case-change count for every URL; the helper is handed to apply directly.
df["count:casechanges"] = df["url"].apply(count_case_change)

In [9]:
# Compiled once at definition time. The raw string keeps the regex escapes
# (\d, \.) intact instead of relying on Python passing unknown string escapes
# through unchanged, which newer interpreters warn about.
_IP_PATTERN = re.compile(
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-fA-F]{1,4}(:[0-9a-fA-F]{1,4}){7}"
)


def ip_present(url: str) -> int:
    """Flag whether a URL contains something shaped like an IP address.

    Two shapes are recognised anywhere in the string:
      * four dot-separated groups of 1-3 digits (IPv4-like; the numeric
        range of each group is not checked), or
      * eight colon-separated groups of 1-4 hex digits (full-form IPv6;
        compressed "::" notation is not matched).

    Args:
        url: URL string to inspect.

    Returns:
        1 if either pattern is found, otherwise 0.
    """
    return 1 if _IP_PATTERN.search(url) else 0


# Binary IP-address flag for every URL; the helper is handed to apply directly.
df["ip_present"] = df["url"].apply(ip_present)

In [10]:
# Re-check the rows per label after the feature columns have been added.
df["label"].value_counts()

label
benign       345738
malicious    104438
Name: count, dtype: int64

In [12]:
# Write the feature table to 'urldata_processed.csv' in the working directory.
# With to_csv's default index=True the row index is written as an extra,
# unnamed first column.
# NOTE(review): consider index=False if downstream readers do not need that
# column -- confirm before changing the file's schema.
df.to_csv('urldata_processed.csv')