# 테스트 데이터셋으로 피처 판별 결과와 xgboost, mlp 모델의 피싱 판별 확률 및 판별 시간을 데이터셋으로 생성

In [13]:
!pip install python-whois



In [19]:
import asyncio
import pandas as pd
from features import short_url_features, url_based_feature, content_based_features, domainver1
import pickle
import numpy as np
import time

# Feature names, listed in the same order as the feature columns of the
# models' training data. evaluate_url() looks each name up in its results
# dict, so the spellings below (including the unusual ones) are runtime keys
# and must stay exactly as written.
feature_order = [
    'having_IPhaving_IP_Address',
    'URLURL_Length',
    'having_At_Symbol',
    'double_slash_redirecting',
    'Prefix_Suffix',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'port',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Abnormal_URL',
    'Redirect',
    'popUpWidnow',
    'age_of_domain',
    'DNSRecord',
    'Google_Index',
]

# feature_order = [
#     'having_IPhaving_IP_Address', 'URLURL_Length', 'having_At_Symbol', 'double_slash_redirecting',
#     'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon',
#     'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email',
#     'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord',
#     'Google_Index'
# ]

# Unpickled models keyed by file path. evaluate_url() is called once per
# dataset row, so caching avoids re-reading both model files for every URL.
_MODEL_CACHE = {}


def _load_model(path):
    """Return the unpickled model stored at *path*, reading the file at most once.

    NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    only point this at model files from a trusted source.
    """
    if path not in _MODEL_CACHE:
        with open(path, 'rb') as f:
            _MODEL_CACHE[path] = pickle.load(f)
    return _MODEL_CACHE[path]


async def evaluate_url(url):
    """Extract the features of *url* and score it with the MLP and XGBoost models.

    Args:
        url: The URL to evaluate. It is first passed through
            short_url_features.check_url, and the URL returned by that call
            is the one used for every later step.

    Returns:
        A dict with the keys:
            "url":      the URL returned by short_url_features.check_url,
            "mlp":      {"phishing": prediction == 1, "probability": percent},
            "XGBoost":  {"phishing": prediction == 1, "probability": percent},
            "features": mapping of feature name -> extracted value.
        "probability" is predict_proba(...)[0][1] * 100 rounded to 4 decimals.

    Raises:
        Whatever the feature extractors, open()/pickle.load or the models
        raise; nothing is caught here.
    """
    # Restore the original URL if it is a shortened one
    url, is_shortened = await asyncio.to_thread(short_url_features.check_url, url)
    print(f'단축 URL 여부: {is_shortened}')

    # Fetch the response object shared by the content-based features
    response = await asyncio.to_thread(content_based_features.get_request_url, url)

    # One worker-thread job per feature so the extractors run concurrently
    tasks = {
        'URLURL_Length': asyncio.to_thread(url_based_feature.check_url_length, url),
        'port': asyncio.to_thread(url_based_feature.scan_non_standard_ports, url),
        'having_At_Symbol': asyncio.to_thread(url_based_feature.check_at_symbol, url),
        'double_slash_redirecting': asyncio.to_thread(url_based_feature.check_double_slash_redirecting, url),
        'Prefix_Suffix': asyncio.to_thread(url_based_feature.check_prefix_suffix, url),
        'Abnormal_URL': asyncio.to_thread(url_based_feature.check_abnormal_url, url),
        'popUpWidnow': asyncio.to_thread(content_based_features.popup_window_text, response),
        'having_IPhaving_IP_Address': asyncio.to_thread(content_based_features.using_ip, url),
        'Request_URL': asyncio.to_thread(content_based_features.check_request_url, url, response),
        'URL_of_Anchor': asyncio.to_thread(content_based_features.check_url_of_anchor, url, response),
        'Links_in_tags': asyncio.to_thread(content_based_features.has_meta_tags, response),
        'SFH': asyncio.to_thread(content_based_features.check_sfh, url, response),
        'Submitting_to_email': asyncio.to_thread(content_based_features.check_submit_email, url, response),
        'Redirect': asyncio.to_thread(content_based_features.check_redirect_count, response),
        'Google_Index': asyncio.to_thread(domainver1.google_index, url),
        'Domain_registeration_length': asyncio.to_thread(domainver1.domain_registration_period, url),
        'age_of_domain': asyncio.to_thread(domainver1.domain_age, url),
        'DNSRecord': asyncio.to_thread(domainver1.dns_record, url),
        'SSLfinal_State': asyncio.to_thread(domainver1.ssl_certificate_status, url),
        'having_Sub_Domain': asyncio.to_thread(domainver1.having_subdomain, url),
        'HTTPS_token': asyncio.to_thread(domainver1.https_token, url),

        # Disabled features (not present in feature_order):
        # 'RightClick': asyncio.to_thread(content_based_features.use_right_click, response),
        # 'Iframe': asyncio.to_thread(content_based_features.iFrame_redirection, response),
        # 'Favicon': asyncio.to_thread(content_based_features.check_favicon, url, response),
        # 'on_mouseover': asyncio.to_thread(content_based_features.check_onmouseover_change, response),
    }

    # Run all jobs concurrently; gather preserves order, so zip re-pairs
    # each result with its feature name.
    feature_results = await asyncio.gather(*tasks.values())
    features = dict(zip(tasks.keys(), feature_results))

    # Arrange the values in feature_order and shape them as one sample (1, n)
    feature_values = [features[feature] for feature in feature_order]
    features_array = np.array(feature_values).reshape(1, -1)

    # Load the trained models (cached after the first call)
    mlp_model = _load_model('model/mlp_model_column_drop.pkl')
    xgboost_model = _load_model('model/xgboost_model_column_drop.pkl')

    # MLP prediction; index 1 of predict_proba is used as the phishing class
    mlp_prediction = mlp_model.predict(features_array)[0]
    mlp_probability = mlp_model.predict_proba(features_array)[0]
    mlp_phishing_prob = round(mlp_probability[1] * 100, 4)

    # XGBoost prediction, same convention
    xgboost_prediction = xgboost_model.predict(features_array)[0]
    xgboost_probability = xgboost_model.predict_proba(features_array)[0]
    xgboost_phishing_prob = round(xgboost_probability[1] * 100, 4)

    return {
        "url": url,
        "mlp": {
            "phishing": mlp_prediction == 1,
            "probability": mlp_phishing_prob
        },
        "XGBoost": {
            "phishing": xgboost_prediction == 1,
            "probability": xgboost_phishing_prob
        },
        "features": features
    }

In [20]:
# Drive the asynchronous evaluation over the whole test dataset
async def main():
    """Evaluate every URL in test_dataset.csv and write the results to CSV.

    Reads the 'url' and 'status' columns of 'test_dataset.csv', awaits
    evaluate_url() for each URL in turn, and writes one row per URL to
    'url_analysis_results_not_content.csv'. Each row holds the extracted
    features, both models' verdict and probability, the dataset's status
    value, the input URL, and the elapsed evaluation time in seconds.
    """
    df = pd.read_csv('test_dataset.csv')  # path to the dataset
    test_urls = df['url'].tolist()
    statuses = df['status'].tolist()  # values of the status column

    # Output column order. 'url' and 'time' come last, which is where the
    # previous per-row concat placed them.
    columns = feature_order + [
        'MLP_Phishing', 'MLP_Probability',
        'XGBoost_Phishing', 'XGBoost_Probability',
        'status', 'url', 'time',
    ]

    # Collect plain dict rows and build the DataFrame once at the end:
    # concatenating onto an empty frame inside the loop is quadratic and
    # emits a pandas FutureWarning.
    rows = []

    # Evaluate each URL
    for url, status in zip(test_urls, statuses):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments
        start_time = time.perf_counter()
        result = await evaluate_url(url)
        elapsed = time.perf_counter() - start_time

        rows.append({
            **result["features"],
            'url': url,
            'status': status,
            'time': elapsed,
            'MLP_Phishing': result["mlp"]["phishing"],
            'MLP_Probability': result["mlp"]["probability"],
            'XGBoost_Phishing': result["XGBoost"]["phishing"],
            'XGBoost_Probability': result["XGBoost"]["probability"],
        })

        print(f"URL: {url}")
        print(f"분석 시간: {elapsed:.2f} seconds")
        print("=" * 50)

    # Save the results as a CSV file
    results_df = pd.DataFrame(rows, columns=columns)
    results_df.to_csv('url_analysis_results_not_content.csv', index=False)

In [21]:
# Top-level `await` is accepted here because this is an IPython/Jupyter cell
# (see the `In [..]:` prompts); in a plain script use asyncio.run(main()).
await main()

일반 URL입니다.
단축 URL 여부: -1
HTTP 요청 Error: {e}
popUpWidnow HTTP 요청 Error: Response is None
Request_URL HTTP 요청 Error: Response is None
URL_of_Anchor HTTP 요청 Error: Response is None
Links_in_tags HTTP 요청 Error: Response is None
SFH HTTP 요청 Error: Response is None
Submitting_to_email 요청 Error: Response is None
Redirect 요청 Error: Response is None
SSL Certificate Status Error: HTTPSConnectionPool(host='support-appleld.com.secureupdate.duilawyeryork.com', port=443): Max retries exceeded with url: /ap/89e6a3b4b063b8d/?cmd=_update&dispatch=89e6a3b4b063b8d1b&locale=_ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7a2a36642ec0>: Failed to resolve 'support-appleld.com.secureupdate.duilawyeryork.com' ([Errno -2] Name or service not known)"))


  results_df = pd.concat([results_df, row_df], ignore_index=True)


URL: https://support-appleld.com.secureupdate.duilawyeryork.com/ap/89e6a3b4b063b8d/?cmd=_update&dispatch=89e6a3b4b063b8d1b&locale=_
분석 시간: 0.57 seconds
일반 URL입니다.
단축 URL 여부: -1
popUpWidnow HTTP 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htm
Request_URL HTTP 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htm
URL_of_Anchor HTTP 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htmLinks_in_tags HTTP 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htmSFH HTTP 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htm

Submitting_to_email 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htm
Redirect 요청 Error: 404 Client Error: Not Found for url: http://phan.thuchao.free.fr/Hao/YouTube/ba.htm

URL: http://phan.thuchao.free.fr/Hao/YouTube/ba.htm
분석 

2024-08-26 12:15:51,833 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 104] Connection reset by peer
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno 104] Connection reset by peer


URL: http://www.j-net.cn/cms
분석 시간: 1.28 seconds
일반 URL입니다.
단축 URL 여부: -1
popUpWidnow HTTP 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/
Request_URL HTTP 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/URL_of_Anchor HTTP 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/
Links_in_tags HTTP 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/
SFH HTTP 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/
Submitting_to_email 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/Redirect 요청 Error: 404 Client Error: Not Found for url: https://mailserver67187.uc.r.appspot.com/


DNS Record Error: No match for "R.APPSPOT.COM".
>>> Last update of whois database: 2024-08-26T12:15:46Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's

2024-08-26 12:16:47,895 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
2024-08-26 12:16:50,280 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
2024-08-26 12:16:50,289 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
2024-08-26 12:16:50,290 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


URL: https://apshop.vn/
분석 시간: 4.92 seconds
일반 URL입니다.
단축 URL 여부: -1
HTTP 요청 Error: {e}
popUpWidnow HTTP 요청 Error: Response is None
Request_URL HTTP 요청 Error: Response is None
URL_of_Anchor HTTP 요청 Error: Response is None
Links_in_tags HTTP 요청 Error: Response is None
SFH HTTP 요청 Error: Response is None
Submitting_to_email 요청 Error: Response is None
Redirect 요청 Error: Response is None
SSL Certificate Status Error: HTTPSConnectionPool(host='html.house', port=443): Max retries exceeded with url: /l7ceeid6.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))
URL: http://html.house/l7ceeid6.html
분석 시간: 1.29 seconds
일반 URL입니다.
단축 URL 여부: -1
URL: https://storage.cloud.google.com/prprhrhprc.appspot.com/index.htm#oncall-infra@eqiom.com
분석 시간: 3.63 seconds
일반 URL입니다.
단축 URL 여부: -1
HTTP 요청 Error: {e}
popUpWidnow HTTP 요청 Error: Response is NoneRequest_URL HTTP 요청 Error: Response is

2024-08-26 12:18:31,685 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
2024-08-26 12:18:31,792 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
2024-08-26 12:18:31,809 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
2024-08-26 12:18:31,859 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known
ERROR:whois.whois:Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


URL: http://mne.edu.vn/wp-includes/Netflix/LoginID/index.php
분석 시간: 3.91 seconds
일반 URL입니다.
단축 URL 여부: -1
HTTP 요청 Error: {e}
popUpWidnow HTTP 요청 Error: Response is None
Request_URL HTTP 요청 Error: Response is None
URL_of_Anchor HTTP 요청 Error: Response is None
Links_in_tags HTTP 요청 Error: Response is None
SFH HTTP 요청 Error: Response is None
Submitting_to_email 요청 Error: Response is None
Redirect 요청 Error: Response is None
SSL Certificate Status Error: HTTPConnectionPool(host='marketinghelper.com.au', port=80): Max retries exceeded with url: /themes/sports/wp-content/net/555183669058fb273008595a9393d628/ (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7a2a365055a0>: Failed to resolve 'marketinghelper.com.au' ([Errno -2] Name or service not known)"))
URL: http://marketinghelper.com.au/themes/sports/wp-content/net/555183669058fb273008595a9393d628/
분석 시간: 0.80 seconds
일반 URL입니다.
단축 URL 여부: -1
HTTP 요청 Error: {e}
popUpWidnow HTTP 요청 Error: Response is NoneRequest