In [3]:
!pip install pandas scikit-learn requests beautifulsoup4 whois joblib



In [1]:
import pandas as pd

# Toy corpus of (url, label) pairs: 1 = phishing, 0 = safe.
labelled_urls = [
    ("http://secure-login.bank.com", 1),
    ("https://google.com", 0),
    ("http://verify-account.com", 1),
    ("https://facebook.com", 0),
    ("http://paypal-secure-login.com", 1),
    ("https://amazon.com", 0),
    ("http://free-gift-now.com", 1),
    ("https://github.com", 0),
    ("http://reset-password-now.com", 1),
    ("https://wikipedia.org", 0),
]

# Column-oriented view of the same pairs, as expected by the DataFrame constructor.
data = {
    "url": [url for url, _ in labelled_urls],
    "label": [label for _, label in labelled_urls],
}

df = pd.DataFrame(data)

# Persist without the index so the CSV holds only the two data columns.
df.to_csv("phishing_dataset.csv", index=False)

print("✅ Датасет сохранен в phishing_dataset.csv")
df.head()

✅ Датасет сохранен в phishing_dataset.csv


Unnamed: 0,url,label
0,http://secure-login.bank.com,1
1,https://google.com,0
2,http://verify-account.com,1
3,https://facebook.com,0
4,http://paypal-secure-login.com,1


In [8]:
# Reload the dataset from the CSV file written by the dataset-creation cell.
df = pd.read_csv("phishing_dataset.csv")

# Preview the first rows.
df.head()

Unnamed: 0,url,label
0,http://secure-login.bank.com,1
1,https://google.com,0
2,http://verify-account.com,1
3,https://facebook.com,0
4,http://paypal-secure-login.com,1


In [9]:
import re
from urllib.parse import urlparse

def extract_features(url):
    """Compute simple lexical features of a URL.

    Args:
        url: URL string. A scheme is optional: for input such as
            "example.com/login" the host is still located.

    Returns:
        list[int]: ``[url_length, num_dots, num_slashes, contains_suspicious]``
        where ``url_length`` is the length of the whole string, ``num_dots``
        is the number of "." characters in the network-location part,
        ``num_slashes`` is the number of "/" characters in the whole string
        and ``contains_suspicious`` is 1 if any keyword from the list below
        occurs anywhere in the lower-cased URL, else 0.
    """
    netloc = urlparse(url).netloc
    if not netloc:
        # urlparse only fills netloc when the host is introduced by "//", so
        # scheme-less input would otherwise always report zero dots.
        try:
            netloc = urlparse("//" + url).netloc
        except ValueError:
            # Malformed bracketed hosts make urlparse raise; treat as no host.
            netloc = ""

    suspicious_words = ["secure", "account", "login", "bank", "verify", "password", "gift", "reset"]
    # Lower-case once instead of once per keyword.
    lowered_url = url.lower()
    contains_suspicious = any(word in lowered_url for word in suspicious_words)

    return [len(url), netloc.count("."), url.count("/"), int(contains_suspicious)]


# Build one feature row per URL, then attach the target column.
df_features = pd.DataFrame(
    [extract_features(u) for u in df["url"]],
    columns=["url_length", "num_dots", "num_slashes", "contains_suspicious"],
).assign(label=df["label"])

print("Признаки извлечены!")
df_features.head()

Признаки извлечены!


Unnamed: 0,url_length,num_dots,num_slashes,contains_suspicious,label
0,28,2,2,1,1
1,18,1,2,0,0
2,25,1,2,1,1
3,20,1,2,0,0
4,30,1,2,1,1


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Separate the feature matrix from the target column.
X = df_features.drop(columns=["label"])
y = df_features["label"]

# Stratify the split so both classes appear in the train and test parts;
# with only a handful of rows an unstratified 20% hold-out can easily end up
# containing a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# NOTE: the hold-out set is 20% of the rows, so on a tiny dataset this
# accuracy comes from very few samples and is only a smoke test.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Точность модели: {accuracy * 100:.2f}%")

# Persist the fitted model so the prediction cell can load it.
joblib.dump(model, "phishing_model.pkl")
print("Модель сохранена в phishing_model.pkl")

Точность модели: 100.00%
Модель сохранена в phishing_model.pkl


In [4]:
import pandas as pd
import joblib

# Load the classifier file written by the RandomForest training cell.
# joblib.load unpickles its input, so only use it on files from a trusted source.
model = joblib.load("phishing_model.pkl")

def predict_phishing(url):
    """Classify a URL with the module-level ``model``.

    The URL is turned into a single-row DataFrame via ``extract_features``
    using the four lexical column names below, then passed to
    ``model.predict``.

    Args:
        url: URL string to classify.

    Returns:
        str: "Фишинговый сайт!" when the predicted label is 1, otherwise
        "Безопасный сайт.".
    """
    column_names = ["url_length", "num_dots", "num_slashes", "contains_suspicious"]
    single_row = pd.DataFrame([extract_features(url)], columns=column_names)
    predicted_label = model.predict(single_row)[0]
    if predicted_label == 1:
        return "Фишинговый сайт!"
    return "Безопасный сайт."

# Smoke-test the classifier on a few sample URLs, one verdict per line.
for sample_url in (
    "http://secure-login.bank.com",
    "https://google.com",
    "http://free-gift-now.com",
):
    print(predict_phishing(sample_url))

NameError: name 'extract_features' is not defined

In [5]:
# Progress marker printed between notebook sections.
print("Optimization 1")

Optimization 1


In [2]:
import whois
import datetime

def get_domain_age(url):
    """Return the age of the URL's domain in whole days, or -1 if unknown.

    The creation date is read from the ``creation_date`` attribute of the
    record returned by ``whois.whois(url)``. When that attribute is a list,
    the first entry that is a ``datetime`` is used.

    Args:
        url: URL or domain name, passed to ``whois.whois`` unchanged.

    Returns:
        int: days elapsed since the creation date, or -1 when the lookup
        fails or no usable creation date is present.
    """
    try:
        creation_date = whois.whois(url).creation_date
    except Exception:
        # Best-effort feature: any lookup failure maps to the -1 sentinel.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a slow lookup can still be interrupted.
        return -1

    if isinstance(creation_date, (list, tuple)):
        creation_date = next(
            (entry for entry in creation_date if isinstance(entry, datetime.datetime)),
            None,
        )
    if not isinstance(creation_date, datetime.datetime):
        # Missing, empty, or unparsed (e.g. string) creation date.
        return -1

    # Take "now" with the same timezone-awareness as the record: subtracting
    # a naive datetime from an aware one (or vice versa) raises TypeError,
    # which previously turned valid records into -1.
    now = datetime.datetime.now(creation_date.tzinfo)
    return (now - creation_date).days


# Try the lookup on one well-known and one made-up domain.
for sample_url in ("https://google.com", "http://secure-login.bank.com"):
    print(get_domain_age(sample_url))

-1
-1


In [3]:
import requests
from bs4 import BeautifulSoup

def analyze_html(url):
    """Fetch ``url`` and count its ``<form>``, ``<iframe>`` and ``<script>`` tags.

    Performs a live HTTP GET with a 5-second timeout. The HTTP status code is
    not checked, so the body of an error page is counted like any other page.

    Args:
        url: address passed to ``requests.get`` unchanged.

    Returns:
        list[int]: ``[num_forms, num_iframes, num_scripts]``, or
        ``[-1, -1, -1]`` when the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        return [len(soup.find_all(tag)) for tag in ("form", "iframe", "script")]
    except Exception:
        # Best-effort feature: any fetch/parse failure maps to the sentinel.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long crawl can still be interrupted.
        return [-1, -1, -1]


# Try the page analysis on one reachable and one made-up address.
for sample_url in ("https://google.com", "http://secure-login.bank.com"):
    print(analyze_html(sample_url))

[1, 0, 9]
[-1, -1, -1]


In [4]:
def extract_features(url):
    """Compute lexical, WHOIS and HTML features of a URL.

    Args:
        url: URL string. A scheme is optional for the lexical features; the
            string is passed unchanged to ``get_domain_age`` and
            ``analyze_html``.

    Returns:
        list: ``[url_length, num_dots, num_slashes, contains_suspicious,
        domain_age]`` followed by the list returned by ``analyze_html(url)``.
        ``num_dots`` counts "." in the network-location part,
        ``num_slashes`` counts "/" in the whole string and
        ``contains_suspicious`` is 1 if any keyword from the list below occurs
        anywhere in the lower-cased URL, else 0.
    """
    # Imported locally so this cell works on a fresh kernel: the cell itself
    # has no import of urlparse.
    from urllib.parse import urlparse

    netloc = urlparse(url).netloc
    if not netloc:
        # urlparse only fills netloc when the host is introduced by "//", so
        # scheme-less input would otherwise always report zero dots.
        try:
            netloc = urlparse("//" + url).netloc
        except ValueError:
            # Malformed bracketed hosts make urlparse raise; treat as no host.
            netloc = ""

    suspicious_words = ["secure", "account", "login", "bank", "verify", "password", "gift", "reset"]
    # Lower-case once instead of once per keyword.
    lowered_url = url.lower()
    contains_suspicious = any(word in lowered_url for word in suspicious_words)

    domain_age = get_domain_age(url)
    html_features = analyze_html(url)

    return [
        len(url),
        netloc.count("."),
        url.count("/"),
        int(contains_suspicious),
        domain_age,
    ] + html_features

In [5]:
# Progress marker printed between notebook sections.
print("Optimization 2")

Optimization 2


In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4


In [14]:
from urllib.parse import urlparse

def extract_features(url):
    """Compute lexical, WHOIS and HTML features of a URL.

    Args:
        url: URL string. A scheme is optional for the lexical features; the
            string is passed unchanged to ``get_domain_age`` and
            ``analyze_html``.

    Returns:
        list: ``[url_length, num_dots, num_slashes, contains_suspicious,
        domain_age]`` followed by the list returned by ``analyze_html(url)``.
        ``num_dots`` counts "." in the network-location part,
        ``num_slashes`` counts "/" in the whole string and
        ``contains_suspicious`` is 1 if any keyword from the list below occurs
        anywhere in the lower-cased URL, else 0.
    """
    netloc = urlparse(url).netloc
    if not netloc:
        # urlparse only fills netloc when the host is introduced by "//", so
        # scheme-less input would otherwise always report zero dots.
        try:
            netloc = urlparse("//" + url).netloc
        except ValueError:
            # Malformed bracketed hosts make urlparse raise; treat as no host.
            netloc = ""

    suspicious_words = ["secure", "account", "login", "bank", "verify", "password", "gift", "reset"]
    # Lower-case once instead of once per keyword.
    lowered_url = url.lower()
    contains_suspicious = any(word in lowered_url for word in suspicious_words)

    domain_age = get_domain_age(url)
    html_features = analyze_html(url)

    return [
        len(url),
        netloc.count("."),
        url.count("/"),
        int(contains_suspicious),
        domain_age,
    ] + html_features

In [15]:
# Extract one feature list per URL.
# NOTE(review): the following cell repeats this exact call after re-reading
# the CSV, so this cell looks redundant — confirm before removing.
df_features = df["url"].apply(extract_features)

In [16]:
import pandas as pd

# Re-read the dataset so this cell does not depend on an earlier `df`.
df = pd.read_csv("phishing_dataset.csv")

# One row of lexical + WHOIS + HTML features per URL, plus the target column.
df_features = pd.DataFrame(
    [extract_features(u) for u in df["url"]],
    columns=[
        "url_length",
        "num_dots",
        "num_slashes",
        "contains_suspicious",
        "domain_age",
        "num_forms",
        "num_iframes",
        "num_scripts",
    ],
).assign(label=df["label"])

print("df_features успешно создана!")
df_features.head()

df_features успешно создана!


Unnamed: 0,url_length,num_dots,num_slashes,contains_suspicious,domain_age,num_forms,num_iframes,num_scripts,label
0,28,2,2,1,-1,-1,-1,-1,1
1,18,1,2,0,-1,1,0,9,0
2,25,1,2,1,-1,0,0,1,1
3,20,1,2,0,-1,1,0,18,0
4,30,1,2,1,-1,0,0,0,1


In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Separate the feature matrix from the target column.
X = df_features.drop(columns=["label"])
y = df_features["label"]

# Stratify the split so both classes appear in the train and test parts;
# with only a handful of rows an unstratified 20% hold-out can easily end up
# containing a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Seed the booster explicitly, matching the RandomForest cell, so re-runs
# are reproducible.
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# NOTE: the hold-out set is 20% of the rows, so on a tiny dataset this
# accuracy comes from very few samples and is only a smoke test.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Улучшенная точность модели: {accuracy * 100:.2f}%")

# Persist the fitted model; the call's return value is the cell output.
import joblib
joblib.dump(model, "phishing_model_xgb.pkl")

Улучшенная точность модели: 100.00%


['phishing_model_xgb.pkl']