In [1]:
import requests
import gzip
import pandas as pd
from io import StringIO

url = "http://data.phishtank.com/data/online-valid.csv.gz"

# Download the compressed file
response = requests.get(url)
compressed_content = response.content

# Decompress the data using gzip
decompressed_data = gzip.decompress(compressed_content)

# Decode bytes to string
csv_string = decompressed_data.decode('utf-8')

# Load the CSV data into a DataFrame using StringIO
phishing_url = pd.read_csv(StringIO(csv_string))

# Optionally shuffle the dataset and reset the index
phishing_url = phishing_url.sample(frac=1).reset_index(drop=True)

# Display the first few rows
print(phishing_url.head())


   phish_id                                          url  \
0   8748799                       https://qrco.de/bfMSpJ   
1   7532885                     https://mbarquitecto.cl/   
2   9009788    http://allegrolokalnie.oferta-8238064.icu   
3   8401021     https://atofredom-105660.weeblysite.com/   
4   6677519  http://monirshouvo.github.io/fb_responsive/   

                                    phish_detail_url  \
0  http://www.phishtank.com/phish_detail.php?phis...   
1  http://www.phishtank.com/phish_detail.php?phis...   
2  http://www.phishtank.com/phish_detail.php?phis...   
3  http://www.phishtank.com/phish_detail.php?phis...   
4  http://www.phishtank.com/phish_detail.php?phis...   

             submission_time verified          verification_time online  \
0  2024-09-07T23:21:18+00:00      yes  2024-09-07T23:22:48+00:00    yes   
1  2022-06-03T08:19:54+00:00      yes  2022-06-03T17:09:56+00:00    yes   
2  2025-03-08T13:04:00+00:00      yes  2025-03-08T13:12:36+00:00    yes   
3 

In [2]:
# Load the CSV file into a DataFrame
benign_url = pd.read_csv("top-1m.csv", header=None, names=['ID', 'domain'], nrows=100000)
print(benign_url.head())

   ID        domain
0   1    google.com
1   2  facebook.com
2   3   youtube.com
3   4     yahoo.com
4   5     baidu.com


In [3]:
# Define a helper function to convert a domain to a full URL
def domain_to_url(domain):
    # If domain already starts with http, return as is, else prepend "http://"
    if domain.startswith("http"):
        return domain
    else:
        return "http://" + domain + "/"

In [4]:
# Rename the URL column to "URL" if necessary and add Label 1 (phishing)
if 'url' in phishing_url.columns:
    phishing_url.rename(columns={'url': 'URL'}, inplace=True)
else:
    raise Exception("Phishing dataset does not contain a 'url' column.")
phishing_url['Label'] = 1



In [5]:
phishing_url.head()

Unnamed: 0,phish_id,URL,phish_detail_url,submission_time,verified,verification_time,online,target,Label
0,8748799,https://qrco.de/bfMSpJ,http://www.phishtank.com/phish_detail.php?phis...,2024-09-07T23:21:18+00:00,yes,2024-09-07T23:22:48+00:00,yes,Other,1
1,7532885,https://mbarquitecto.cl/,http://www.phishtank.com/phish_detail.php?phis...,2022-06-03T08:19:54+00:00,yes,2022-06-03T17:09:56+00:00,yes,Other,1
2,9009788,http://allegrolokalnie.oferta-8238064.icu,http://www.phishtank.com/phish_detail.php?phis...,2025-03-08T13:04:00+00:00,yes,2025-03-08T13:12:36+00:00,yes,Allegro,1
3,8401021,https://atofredom-105660.weeblysite.com/,http://www.phishtank.com/phish_detail.php?phis...,2023-12-23T11:29:58+00:00,yes,2023-12-23T11:33:13+00:00,yes,Other,1
4,6677519,http://monirshouvo.github.io/fb_responsive/,http://www.phishtank.com/phish_detail.php?phis...,2020-07-15T14:02:13+00:00,yes,2020-07-15T14:08:42+00:00,yes,Other,1


In [6]:
# Create a new column "URL" with full URLs and assign Label 0 (benign)
benign_url['URL'] = benign_url['domain'].apply(domain_to_url)
# We only need the URL and Label columns
benign_url = benign_url[['URL']].copy()
benign_url['Label'] = 0
benign_url.head()

Unnamed: 0,URL,Label
0,http://google.com/,0
1,http://facebook.com/,0
2,http://youtube.com/,0
3,http://yahoo.com/,0
4,http://baidu.com/,0


In [7]:
#2016's top most suspicious TLD and words
Suspicious_TLD=['zip','cricket','link','work','party','gq','kim','country','science','tk']
Suspicious_Domain=['luckytime.co.kr','mattfoll.eu.interia.pl','trafficholder.com','dl.baixaki.com.br','bembed.redtube.comr','tags.expo9.exponential.com','deepspacer.com','funad.co.kr','trafficconverter.biz']
#trend micro's top malicious domains

In [8]:
# Method to count number of dots
def countdots(url):
    return url.count('.')

In [9]:
# Method to count number of delimeters
def countdelim(url):
    count = 0
    delim=[';','_','?','=','&']
    for each in url:
        if each in delim:
            count = count + 1

    return count

In [10]:
# Is IP addr present as th hostname, let's validate

import ipaddress as ip #works only in python 3

def isip(uri):
    try:
        if ip.ip_address(uri):
            return 1
    except:
        return 0

In [11]:
#method to check the presence of hyphens

def isPresentHyphen(url):
    return url.count('-')


In [12]:
#method to check the presence of @

def isPresentAt(url):
    return url.count('@')

In [13]:
def isPresentDSlash(url):
    return url.count('//')

In [14]:
def countSubDir(url):
    return url.count('/')

In [15]:
def get_ext(url):
    """Return the filename extension from url, or ''."""

    root, ext = splitext(url)
    return ext

In [16]:
def countSubDomain(subdomain):
    if not subdomain:
        return 0
    else:
        return len(subdomain.split('.'))

In [17]:
def countQueries(query):
    if not query:
        return 0
    else:
        return len(query.split('&'))

In [18]:
'''
featureSet = pd.DataFrame(columns=('url','no of dots','presence of hyphen','len of url','presence of at',\
'presence of double slash','no of subdir','no of subdomain','len of domain','no of queries','is IP','presence of Suspicious_TLD',\
'presence of suspicious domain','create_age(months)','expiry_age(months)','update_age(days)','country','file extension','label'))'''

featureSet = pd.DataFrame(columns=('url','no of dots','presence of hyphen','len of url','presence of at',\
'presence of double slash','no of subdir','no of subdomain','len of domain','no of queries','is IP','presence of Suspicious_TLD',\
'presence of suspicious domain','label'))

In [22]:
!pip install tldextract



In [24]:
pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m7.6 MB/s[0m eta [36m

In [19]:
from urllib.parse import urlparse
import tldextract

def getFeatures(url, label):
    result = []
    url = str(url)

    #add the url to feature set
    result.append(url)

    #parse the URL and extract the domain information
    path = urlparse(url)
    ext = tldextract.extract(url)

    #counting number of dots in subdomain
    result.append(countdots(ext.subdomain))

    #checking hyphen in domain
    result.append(isPresentHyphen(path.netloc))

    #length of URL
    result.append(len(url))

    #checking @ in the url
    result.append(isPresentAt(path.netloc))

    #checking presence of double slash
    result.append(isPresentDSlash(path.path))

    #Count number of subdir
    result.append(countSubDir(path.path))

    #number of sub domain
    result.append(countSubDomain(ext.subdomain))

    #length of domain name
    result.append(len(path.netloc))

    #count number of queries
    result.append(len(path.query))

    #Adding domain information

    #if IP address is being used as a URL
    result.append(isip(ext.domain))

    #presence of Suspicious_TLD
    result.append(1 if ext.suffix in Suspicious_TLD else 0)

    #presence of suspicious domain
    # Use attributes instead of indexing
    result.append(1 if '.'.join([ext.domain, ext.suffix]) in Suspicious_Domain else 0 )

    '''

    #Get domain information by asking whois
    domain = '.'.join(ext[1:]) # This line was likely the problem in your original code
    '''

    #result.append(get_ext(path.path))
    result.append(str(label))
    return result

    #Yay! finally done!

In [20]:
# --- Combine Datasets ---
combined_df = pd.concat([benign_url, phishing_url[['URL', 'Label']]], ignore_index=True, )
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
combined_df.head()

Unnamed: 0,URL,Label
0,https://docs.google.com/presentation/d/e/2PACX...,1
1,https://ezpass.com-gltu.xin/,1
2,http://ladbrokes.com/,0
3,http://ipsimisul.com/,0
4,http://gruks.com/,0


In [29]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [22]:
from tqdm import tqdm
for i in tqdm(range(len(combined_df))):
    features = getFeatures(combined_df["URL"].loc[i], combined_df["Label"].loc[i])
    featureSet.loc[i] = features

  0%|          | 0/160940 [00:00<?, ?it/s]

100%|██████████| 160940/160940 [11:00<00:00, 243.70it/s]


In [35]:
correct_col_names = [
    'url',
    'no of dots',
    'presence of hyphen',
    'len of url',
    'presence of at',
    'presence of double slash',
    'no of subdir',
    'no of subdomain',
    'len of domain',
    'no of queries',
    'is IP',
    'presence of Suspicious_TLD',
    'presence of suspicious domain',
    'label'
]

In [36]:
# Check if the number of columns matches the number of names provided
if len(featureSet.columns) == len(correct_col_names):
    print("\nNumber of columns matches number of names. Assigning new column names...")
    # Assign the list of correct names to the DataFrame's columns attribute
    featureSet.columns = correct_col_names

    print("\n--- Updated featureSet columns ---")
    print(featureSet.columns.tolist()) # Use tolist() for cleaner output

    print("\n--- Updated featureSet head ---")
    print(featureSet.head())


Number of columns matches number of names. Assigning new column names...

--- Updated featureSet columns ---
['url', 'no of dots', 'presence of hyphen', 'len of url', 'presence of at', 'presence of double slash', 'no of subdir', 'no of subdomain', 'len of domain', 'no of queries', 'is IP', 'presence of Suspicious_TLD', 'presence of suspicious domain', 'label']

--- Updated featureSet head ---
                               url  no of dots  presence of hyphen  \
0  http://auxilioplandesocios.com/           0                   0   
1       http://e-solutionsinc.com/           0                   1   
2    http://illustrations.free.fr/           0                   0   
3               http://tomari.org/           0                   0   
4           http://firefox.net.cn/           0                   0   

   len of url  presence of at  presence of double slash  no of subdir  \
0          31               0                         0             1   
1          26               0       

In [23]:

# Now group by 'label' and display the counts
print(featureSet.groupby('label').size())

label
0    100000
1     60940
dtype: int64


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [25]:
X = featureSet.drop(['url','label'],axis=1).values
y = featureSet['label'].values

model = { "LogisticRegression":LogisticRegression() }

X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2)

In [26]:
results = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train,y_train)
    score = clf.score(X_test,y_test)
    print ("%s : %s " %(algo, score))
    results[algo] = score

LogisticRegression : 0.9757362992419535 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
!pip install joblib



In [30]:
import joblib

clf = LogisticRegression()
clf.fit(X_train,y_train)

# Save the model
joblib.dump(clf, 'logistic_model.pkl')  # You can choose any filename

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['logistic_model.pkl']

In [31]:
# Load the model
loaded_model = joblib.load('logistic_model.pkl')
score = clf.score(X_test,y_test)
print ("%s : %s " %(algo, score))

LogisticRegression : 0.9757362992419535 


In [28]:
clf = model[winner]
res = clf.predict(X)
mt = confusion_matrix(y, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

False positive rate : 1.220000 %
False negative rate : 4.522481 %


In [1]:
import re
def extract_urls(text):
    """Extracts potential URLs from a string using regex."""
    # This regex is relatively simple and might need refinement for edge cases
    # It looks for http://, https://, or ftp:// followed by non-space characters
    url_pattern = re.compile(r'(?:https?|ftp)://[^\s/$.?#].[^\s]*', re.IGNORECASE)
    return url_pattern.findall(text)

print(extract_urls(""" {
  "name": "@bahmutov/all-paths",
  "description": "Given an object returns list of all possible paths to its properties",
  "version": "1.0.2",
  "author": "Gleb Bahmutov <gleb.bahmutov@gmail.com>",
  "bugs": "https://github.com/bahmutov/all-paths/issues"""))

['https://github.com/bahmutov/all-paths/issues']
