In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
AVAILABLE_MODELS = {
    'DECISION_TREE': DecisionTreeClassifier(),
    'RANDOM_FOREST': RandomForestClassifier(n_estimators=100),
    'NEURAL_NETWORK': MLPClassifier(max_iter=1000),
    'K_NEIGHBORS': KNeighborsClassifier(),
    'SUPPORT_VECTOR_MACHINE': SVC(),
    #'GAUSSIAN_PROCESS': GaussianProcessClassifier(1.0 * RBF(1.0)), # to trwa tak z 2 lata nauka, wywalam xD
    'ADABOOST': AdaBoostClassifier(learning_rate=0.55),
    'NAIVE_BAYES': GaussianNB(),
    'QUADRATIC_DISCRIMINANT_ANALYSIS': QuadraticDiscriminantAnalysis()
}

In [4]:
def build_model_return_predictions(model, train_samples, train_classes, test_samples):
    classifier = model.fit(train_samples, train_classes)
    return classifier.predict(test_samples), classifier

def calculate_metrics(y_true, y_pred):
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred)
    metrics['recall'] = recall_score(y_true, y_pred)
    metrics['f1_score'] = f1_score(y_true, y_pred)
    return metrics

In [5]:
illegitimate_df = pd.read_csv('../../data/illegitimate_data.csv')
illegitimate_df

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,shop,Registrar of domain names REG.RU,RU,5.0,reg.ru,reg.ru,5.0,R3,21,0,...,0.0,0.0,0.0,0,3.558519,True,2,44.0,17.0,illegitimate
1,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,,,21,0,...,0.0,0.0,0.0,0,3.594466,False,2,0.0,0.0,illegitimate
2,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,7.0,E1,197,154,...,0.0,0.0,0.0,60,4.897371,True,2,0.0,0.0,illegitimate
3,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,GTS CA 1P5,210,154,...,0.0,0.0,0.0,56,4.990006,True,2,0.0,0.0,illegitimate
4,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,E1,206,154,...,0.0,0.0,0.0,64,4.966357,True,2,0.0,0.0,illegitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11108,xyz,"Registrar of Domain Names REG.RU, LLC",PL,1.0,cloudflare.com,reg.ru,,,34,0,...,0.0,0.0,0.0,8,4.079679,True,3,1.0,1.0,illegitimate
11109,xyz,"Registrar of Domain Names REG.RU, LLC",PL,1.0,cloudflare.com,reg.ru,1.0,GTS CA 1P5,43,0,...,0.0,0.0,0.0,10,4.518833,False,3,0.0,0.0,illegitimate
11110,site,"Name.com, Inc",UA,253.0,name.com,name.com,,,32,0,...,0.0,0.0,0.0,0,3.965018,False,3,0.0,8.0,illegitimate
11111,site,"Name.com, Inc",UA,253.0,name.com,name.com,,,33,0,...,0.0,0.0,0.0,0,4.089552,False,3,0.0,8.0,illegitimate


In [6]:
legitimate_df = pd.read_csv('../../data/legitimate_data.csv')
legitimate_df.drop('url', axis=1, inplace=True)
legitimate_df

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,org,DreamHost LLC,US,6334.0,dreamhost.com,,,,27,0,...,0.0,0.0,0.0,0.00000,3.912114,False,1,4.0,11.0,legitimate
1,org,Webcentral Group Limited dba Melbourne IT (Aus...,AU,7978.0,afraid.org,,,,49,0,...,0.0,0.0,0.0,0.00000,4.057961,False,1,0.0,18.0,legitimate
2,org,Webcentral Group Limited dba Melbourne IT (Aus...,AU,7978.0,afraid.org,,,,48,0,...,0.0,0.0,0.0,0.00000,4.130995,False,1,0.0,18.0,legitimate
3,fr,GIP RENATER,,10390.0,polytechnique.fr,,,,81,0,...,0.0,0.0,0.0,0.00000,4.609257,False,2,0.0,0.0,legitimate
4,com,"Network Solutions, LLC",US,8697.0,dns-solutions.net,,,,31,0,...,0.0,0.0,0.0,0.00000,3.760017,False,1,59.0,54.0,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,com,"NameSilo, LLC",US,1421.0,cloudflare.com,namesilo.com,3.0,Google Trust Services LLC,30,0,...,0.0,0.0,0.0,0.00000,4.056565,False,1,43.0,80.0,legitimate
3431,academy,"NameCheap, Inc.",IS,2567.0,registrar-servers.com,withheldforprivacy.com,11.0,Let's Encrypt,79,0,...,0.0,0.0,0.0,0.00000,4.206024,False,1,0.0,0.0,legitimate
3432,com,Tucows Domains Inc.,CA,7487.0,cloudflare.com,tucows.com,101.0,"Cloudflare, Inc.",61,0,...,0.0,0.0,0.0,11.47541,4.587895,False,1,19.0,346.0,legitimate
3433,com,"GoDaddy.com, LLC",US,3503.0,cloudflare.com,godaddy.com,165.0,"Cloudflare, Inc.",81,0,...,0.0,0.0,0.0,0.00000,4.197393,False,1,,,legitimate


In [7]:
all_df = pd.concat([illegitimate_df, legitimate_df])
all_df

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,shop,Registrar of domain names REG.RU,RU,5.0,reg.ru,reg.ru,5.0,R3,21,0,...,0.0,0.0,0.0,0.00000,3.558519,True,2,44.0,17.0,illegitimate
1,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,,,21,0,...,0.0,0.0,0.0,0.00000,3.594466,False,2,0.0,0.0,illegitimate
2,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,7.0,E1,197,154,...,0.0,0.0,0.0,60.00000,4.897371,True,2,0.0,0.0,illegitimate
3,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,GTS CA 1P5,210,154,...,0.0,0.0,0.0,56.00000,4.990006,True,2,0.0,0.0,illegitimate
4,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,E1,206,154,...,0.0,0.0,0.0,64.00000,4.966357,True,2,0.0,0.0,illegitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,com,"NameSilo, LLC",US,1421.0,cloudflare.com,namesilo.com,3.0,Google Trust Services LLC,30,0,...,0.0,0.0,0.0,0.00000,4.056565,False,1,43.0,80.0,legitimate
3431,academy,"NameCheap, Inc.",IS,2567.0,registrar-servers.com,withheldforprivacy.com,11.0,Let's Encrypt,79,0,...,0.0,0.0,0.0,0.00000,4.206024,False,1,0.0,0.0,legitimate
3432,com,Tucows Domains Inc.,CA,7487.0,cloudflare.com,tucows.com,101.0,"Cloudflare, Inc.",61,0,...,0.0,0.0,0.0,11.47541,4.587895,False,1,19.0,346.0,legitimate
3433,com,"GoDaddy.com, LLC",US,3503.0,cloudflare.com,godaddy.com,165.0,"Cloudflare, Inc.",81,0,...,0.0,0.0,0.0,0.00000,4.197393,False,1,,,legitimate


## Zamiana feature'ów tekstowych na liczbowe

In [8]:
all_df_ml_ready = all_df.copy()
#TODO możliwe że tu trzeba inaczej zamienić na liczbowe - nie wiem czy to jest ok, ktoś musi sprawdzić
all_df_ml_ready['label'] = all_df_ml_ready['label'].replace({'illegitimate': 1, 'legitimate': 0})
all_df_ml_ready = all_df_ml_ready.fillna(-1)
for col in all_df_ml_ready.columns:
    if all_df_ml_ready[col].dtype == object:  # Only apply to object (string) columns
        labels, uniques = pd.factorize(all_df_ml_ready[col])
        all_df_ml_ready[col] = labels
all_df_ml_ready

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,0,0,0,5.0,0,0,5.0,0,21,0,...,0.0,0.0,0.0,0.00000,3.558519,0,2,44.0,17.0,1
1,1,1,1,89.0,1,1,-1.0,1,21,0,...,0.0,0.0,0.0,0.00000,3.594466,1,2,0.0,0.0,1
2,1,1,1,89.0,1,1,7.0,2,197,154,...,0.0,0.0,0.0,60.00000,4.897371,0,2,0.0,0.0,1
3,1,1,1,89.0,1,1,6.0,3,210,154,...,0.0,0.0,0.0,56.00000,4.990006,0,2,0.0,0.0,1
4,1,1,1,89.0,1,1,6.0,2,206,154,...,0.0,0.0,0.0,64.00000,4.966357,0,2,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,1,1,1,1421.0,1,1,3.0,55,30,0,...,0.0,0.0,0.0,0.00000,4.056565,1,1,43.0,80.0,0
3431,149,7,2,2567.0,8,7,11.0,63,79,0,...,0.0,0.0,0.0,0.00000,4.206024,1,1,0.0,0.0,0
3432,1,10,12,7487.0,1,8,101.0,61,61,0,...,0.0,0.0,0.0,11.47541,4.587895,1,1,19.0,346.0,0
3433,1,12,1,3503.0,1,10,165.0,61,81,0,...,0.0,0.0,0.0,0.00000,4.197393,1,1,-1.0,-1.0,0


# TODO chłopaki
statystyki z pól df
z danych liczbowych jakieś wykresy/histogramy
z tekstowych policzenie
tabelka z podsumowaniem
tabelka korelacji parametrów
jak parametr zbyt płaski to może x^2

In [13]:
TRAIN_SAMPLES_FRACTION = 0.9
# Separate the features and target variable
features = all_df_ml_ready[all_df_ml_ready.columns.difference(['label'])]
classes = all_df_ml_ready['label']

# Split the data into training and testing datasets
features_train, features_test, classes_train, classes_test = train_test_split(features, classes, test_size=TRAIN_SAMPLES_FRACTION, random_state=42)

In [14]:
for model_name in AVAILABLE_MODELS:
    prediction_model = AVAILABLE_MODELS[model_name]
    predictions, model = build_model_return_predictions(prediction_model, features_train, classes_train, features_test)
    metrics = calculate_metrics(classes_test, predictions)
    print(model_name)
    print(metrics)

DECISION_TREE
{'accuracy': 0.9992362914311899, 'precision': 0.9992007992007992, 'recall': 0.9998000799680128, 'f1_score': 0.9995003497551713}
RANDOM_FOREST
{'accuracy': 0.999694516572476, 'precision': 0.9996003197442046, 'recall': 1.0, 'f1_score': 0.9998001199280432}
NEURAL_NETWORK
{'accuracy': 0.9628073926989461, 'precision': 0.9959353830119854, 'recall': 0.955217912834866, 'f1_score': 0.97515179345885}
K_NEIGHBORS
{'accuracy': 0.9479914464640293, 'precision': 0.9556250610888476, 'recall': 0.9773090763694522, 'f1_score': 0.9663454410674573}
SUPPORT_VECTOR_MACHINE
{'accuracy': 0.8661218878875822, 'precision': 0.881955374502361, 'recall': 0.952219112355058, 'f1_score': 0.9157414083153089}
ADABOOST
{'accuracy': 0.9992362914311899, 'precision': 0.9992007992007992, 'recall': 0.9998000799680128, 'f1_score': 0.9995003497551713}
NAIVE_BAYES
{'accuracy': 0.9896899343210631, 'precision': 0.9966784096628083, 'recall': 0.9898040783686526, 'f1_score': 0.9932293495160239}
QUADRATIC_DISCRIMINANT_ANA



# TODO chłopaki
modele się mega przeuczają, trzeba sprawdzić przez jaki parametr tak jest, może obciąć zbiór wejściowy z jakiś gówno danych. Jak dane będą przefiltrowane i wciąż się będzie przeuczał to dropujecie kolumny sprawdzając czy jak ich nie ma to jest dużo gorzej, powinniście znaleźć jedną/parę po których usunięciu model stanie się gównem - o to chodzi