In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [52]:
AVAILABLE_MODELS = {
    'DECISION_TREE': DecisionTreeClassifier(),
    'RANDOM_FOREST': RandomForestClassifier(n_estimators=100),
    'NEURAL_NETWORK': MLPClassifier(max_iter=1000),
    'K_NEIGHBORS': KNeighborsClassifier(),
    'SUPPORT_VECTOR_MACHINE': SVC(),
    #'GAUSSIAN_PROCESS': GaussianProcessClassifier(1.0 * RBF(1.0)), # to trwa tak z 2 lata nauka, wywalam xD
    'ADABOOST': AdaBoostClassifier(learning_rate=0.55),
    'NAIVE_BAYES': GaussianNB(),
    'QUADRATIC_DISCRIMINANT_ANALYSIS': QuadraticDiscriminantAnalysis()
}

In [53]:
def build_model_return_predictions(model, train_samples, train_classes, test_samples):
    classifier = model.fit(train_samples, train_classes)
    return classifier.predict(test_samples), classifier

def calculate_metrics(y_true, y_pred):
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred)
    metrics['recall'] = recall_score(y_true, y_pred)
    metrics['f1_score'] = f1_score(y_true, y_pred)
    return metrics

In [54]:
illegitimate_df = pd.read_csv('../../data/illegitimate_data.csv')
#illegitimate_df = illegitimate_df.dropna(subset=['tls_age'])
#illegitimate_df = illegitimate_df.dropna(subset=['tls_issuer'])
#illegitimate_df = illegitimate_df.dropna(subset=['is_redirect'])
illegitimate_df

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,shop,Registrar of domain names REG.RU,RU,5.0,reg.ru,reg.ru,5.0,R3,21,0,...,0.0,0.0,0.0,0,3.558519,True,2,44.0,17.0,illegitimate
1,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,,,21,0,...,0.0,0.0,0.0,0,3.594466,False,2,0.0,0.0,illegitimate
2,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,7.0,E1,197,154,...,0.0,0.0,0.0,60,4.897371,True,2,0.0,0.0,illegitimate
3,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,GTS CA 1P5,210,154,...,0.0,0.0,0.0,56,4.990006,True,2,0.0,0.0,illegitimate
4,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,E1,206,154,...,0.0,0.0,0.0,64,4.966357,True,2,0.0,0.0,illegitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11108,xyz,"Registrar of Domain Names REG.RU, LLC",PL,1.0,cloudflare.com,reg.ru,,,34,0,...,0.0,0.0,0.0,8,4.079679,True,3,1.0,1.0,illegitimate
11109,xyz,"Registrar of Domain Names REG.RU, LLC",PL,1.0,cloudflare.com,reg.ru,1.0,GTS CA 1P5,43,0,...,0.0,0.0,0.0,10,4.518833,False,3,0.0,0.0,illegitimate
11110,site,"Name.com, Inc",UA,253.0,name.com,name.com,,,32,0,...,0.0,0.0,0.0,0,3.965018,False,3,0.0,8.0,illegitimate
11111,site,"Name.com, Inc",UA,253.0,name.com,name.com,,,33,0,...,0.0,0.0,0.0,0,4.089552,False,3,0.0,8.0,illegitimate


In [55]:
legitimate_df = pd.read_csv('../../data/legitimate_data.csv')
legitimate_df.drop('url', axis=1, inplace=True)
legitimate_df

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,org,DreamHost LLC,US,6334.0,dreamhost.com,,,,27,0,...,0.0,0.0,0.0,0.00000,3.912114,False,1,4.0,11.0,legitimate
1,org,Webcentral Group Limited dba Melbourne IT (Aus...,AU,7978.0,afraid.org,,,,49,0,...,0.0,0.0,0.0,0.00000,4.057961,False,1,0.0,18.0,legitimate
2,org,Webcentral Group Limited dba Melbourne IT (Aus...,AU,7978.0,afraid.org,,,,48,0,...,0.0,0.0,0.0,0.00000,4.130995,False,1,0.0,18.0,legitimate
3,fr,GIP RENATER,,10390.0,polytechnique.fr,,,,81,0,...,0.0,0.0,0.0,0.00000,4.609257,False,2,0.0,0.0,legitimate
4,com,"Network Solutions, LLC",US,8697.0,dns-solutions.net,,,,31,0,...,0.0,0.0,0.0,0.00000,3.760017,False,1,59.0,54.0,legitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,com,"NameSilo, LLC",US,1421.0,cloudflare.com,namesilo.com,3.0,Google Trust Services LLC,30,0,...,0.0,0.0,0.0,0.00000,4.056565,False,1,43.0,80.0,legitimate
3431,academy,"NameCheap, Inc.",IS,2567.0,registrar-servers.com,withheldforprivacy.com,11.0,Let's Encrypt,79,0,...,0.0,0.0,0.0,0.00000,4.206024,False,1,0.0,0.0,legitimate
3432,com,Tucows Domains Inc.,CA,7487.0,cloudflare.com,tucows.com,101.0,"Cloudflare, Inc.",61,0,...,0.0,0.0,0.0,11.47541,4.587895,False,1,19.0,346.0,legitimate
3433,com,"GoDaddy.com, LLC",US,3503.0,cloudflare.com,godaddy.com,165.0,"Cloudflare, Inc.",81,0,...,0.0,0.0,0.0,0.00000,4.197393,False,1,,,legitimate


In [56]:
all_df = pd.concat([illegitimate_df, legitimate_df])
all_df

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,shop,Registrar of domain names REG.RU,RU,5.0,reg.ru,reg.ru,5.0,R3,21,0,...,0.0,0.0,0.0,0.00000,3.558519,True,2,44.0,17.0,illegitimate
1,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,,,21,0,...,0.0,0.0,0.0,0.00000,3.594466,False,2,0.0,0.0,illegitimate
2,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,7.0,E1,197,154,...,0.0,0.0,0.0,60.00000,4.897371,True,2,0.0,0.0,illegitimate
3,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,GTS CA 1P5,210,154,...,0.0,0.0,0.0,56.00000,4.990006,True,2,0.0,0.0,illegitimate
4,com,"NameSilo, LLC",US,89.0,cloudflare.com,namesilo.com,6.0,E1,206,154,...,0.0,0.0,0.0,64.00000,4.966357,True,2,0.0,0.0,illegitimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,com,"NameSilo, LLC",US,1421.0,cloudflare.com,namesilo.com,3.0,Google Trust Services LLC,30,0,...,0.0,0.0,0.0,0.00000,4.056565,False,1,43.0,80.0,legitimate
3431,academy,"NameCheap, Inc.",IS,2567.0,registrar-servers.com,withheldforprivacy.com,11.0,Let's Encrypt,79,0,...,0.0,0.0,0.0,0.00000,4.206024,False,1,0.0,0.0,legitimate
3432,com,Tucows Domains Inc.,CA,7487.0,cloudflare.com,tucows.com,101.0,"Cloudflare, Inc.",61,0,...,0.0,0.0,0.0,11.47541,4.587895,False,1,19.0,346.0,legitimate
3433,com,"GoDaddy.com, LLC",US,3503.0,cloudflare.com,godaddy.com,165.0,"Cloudflare, Inc.",81,0,...,0.0,0.0,0.0,0.00000,4.197393,False,1,,,legitimate


In [101]:
import json

all_df_ml = all_df.copy()
all_df_ml['label'] = all_df['label'].replace({'illegitimate': 1, 'legitimate': 0})
all_df_ml = all_df.fillna(-1)

dict = {}
for col in all_df_ml.columns:
    if all_df_ml[col].dtype == object:  # Sprawdzenie, czy kolumna jest typu 'object' (string)
        uniq_val = all_df_ml[col].unique()  # Pobranie unikalnych wartości dla danej kolumny
        mapping = {val: index for index, val in enumerate(uniq_val)}  # Tworzenie mapowania
        dict[col] = mapping  # Dodawanie mapowania do słownika

dict = {col: {str(k): int(v) for k, v in mapping.items()} for col, mapping in dict.items()}

with open('dict.json', 'w') as file:
    json.dump(dict, file)


all_df_ml_ready = all_df_ml.replace(dict).astype(int)
all_df_ml_ready

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,0,0,0,5,0,0,5,0,21,0,...,0,0,0,0,3,1,2,44,17,0
1,1,1,1,89,1,1,-1,-1,21,0,...,0,0,0,0,3,0,2,0,0,0
2,1,1,1,89,1,1,7,2,197,154,...,0,0,0,60,4,1,2,0,0,0
3,1,1,1,89,1,1,6,3,210,154,...,0,0,0,56,4,1,2,0,0,0
4,1,1,1,89,1,1,6,2,206,154,...,0,0,0,64,4,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,1,1,1,1421,1,1,3,55,30,0,...,0,0,0,0,4,0,1,43,80,1
3431,149,7,2,2567,8,7,11,63,79,0,...,0,0,0,0,4,0,1,0,0,1
3432,1,10,12,7487,1,8,101,61,61,0,...,0,0,0,11,4,0,1,19,346,1
3433,1,12,1,3503,1,10,165,61,81,0,...,0,0,0,0,4,0,1,-1,-1,1


## Zamiana feature'ów tekstowych na liczbowe

In [103]:
#all_df_ml_ready = all_df.copy()
##TODO możliwe że tu trzeba inaczej zamienić na liczbowe - nie wiem czy to jest ok, ktoś musi sprawdzić
#all_df_ml_ready['label'] = all_df_ml_ready['label'].replace({'illegitimate': 1, 'legitimate': 0})
#all_df_ml_ready = all_df_ml_ready.fillna(-1)
#for col in all_df_ml_ready.columns:
#    if all_df_ml_ready[col].dtype == object:  # Only apply to object (string) columns
#        labels, uniques = pd.factorize(all_df_ml_ready[col])
#        all_df_ml_ready[col] = labels
#all_df_ml_ready

# TODO chłopaki
statystyki z pól df
tabelka z podsumowaniem
tabelka korelacji parametrów
z danych liczbowych jakieś wykresy/histogramy
z tekstowych policzenie (zostawiamy te Pawłowe)
jak parametr zbyt płaski to może x^2

In [104]:
all_df_ml_ready_legit = all_df_ml_ready.loc[all_df_ml_ready['label'] == 0]
all_df_ml_ready_illegit = all_df_ml_ready.loc[all_df_ml_ready['label'] == 1]

### Statytyki z podziałem na dane legit i illegit

In [105]:
stats_illegit = all_df_ml_ready_illegit.describe()
stats_illegit

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
count,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,...,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0
mean,19.071033,53.67802,5.26754,4632.43901,85.367686,2.712955,95.328675,58.840757,92.677729,33.741194,...,0.0,0.0,0.0,6.528675,3.937991,0.120233,1.042213,19.90131,104.658224,1.0
std,36.034809,71.643828,13.542994,3875.680575,112.657852,20.780455,94.318056,9.627662,137.977086,132.341527,...,0.0,0.0,0.0,9.23362,0.531969,0.325281,0.216446,70.695566,221.568857,0.0
min,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,15.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,1.0,-1.0,-1.0,1.0
25%,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.0,55.0,41.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,1.0
50%,3.0,33.0,1.0,4469.0,11.0,-1.0,56.0,61.0,60.0,0.0,...,0.0,0.0,0.0,2.0,4.0,0.0,1.0,2.0,35.0,1.0
75%,22.0,75.5,2.0,8104.0,201.0,-1.0,131.0,63.0,90.0,0.0,...,0.0,0.0,0.0,10.0,4.0,0.0,1.0,15.0,123.0,1.0
max,149.0,274.0,84.0,13929.0,384.0,178.0,369.0,80.0,1991.0,1903.0,...,0.0,0.0,0.0,54.0,6.0,1.0,3.0,2547.0,4993.0,1.0


In [106]:
stats_legit = all_df_ml_ready_legit.describe()
stats_legit

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
count,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,...,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0,11113.0
mean,12.303068,30.878701,3.653379,523.507244,16.161253,18.425178,6.927382,0.444434,44.901557,11.46837,...,0.0,0.0,0.0,6.187438,3.439935,-0.507784,2.665257,25.087015,11.339242,0.0
std,17.751878,40.362454,8.189435,1412.298711,32.078769,29.25984,36.579581,5.466749,70.781126,61.84977,...,0.0,0.0,0.0,15.200237,0.598937,0.779367,0.702559,111.158395,58.671364,0.0
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,14.0,0.0,...,0.0,0.0,0.0,0.0,2.0,-1.0,2.0,-1.0,-1.0,0.0
25%,3.0,4.0,1.0,1.0,1.0,2.0,-1.0,-1.0,23.0,0.0,...,0.0,0.0,0.0,0.0,3.0,-1.0,2.0,0.0,0.0,0.0
50%,7.0,10.0,1.0,10.0,1.0,4.0,-1.0,-1.0,27.0,0.0,...,0.0,0.0,0.0,0.0,3.0,-1.0,3.0,0.0,0.0,0.0
75%,15.0,51.0,4.0,197.0,14.0,24.0,-1.0,-1.0,37.0,0.0,...,0.0,0.0,0.0,7.0,4.0,0.0,3.0,5.0,4.0,0.0
max,110.0,213.0,70.0,9342.0,193.0,161.0,1761.0,53.0,2299.0,2262.0,...,0.0,0.0,0.0,401.0,6.0,1.0,10.0,3035.0,534.0,0.0


In [107]:
### histogramy, podział na legit i illegit

In [108]:
## Wyświetlanie histogramów dla każdej kolumny
#for column in all_df_ml_ready_legit.columns:
#    plt.figure()  # Tworzenie nowego wykresu dla każdej kolumny
#    all_df_ml_ready_legit[column].hist(alpha=0.5, label='Legitimate', color='green')  # Zielone słupki dla danych "all_df_ml_ready_legit"
#    all_df_ml_ready_illegit[column].hist(alpha=0.5, label='Illegitimate', color='red')  # Czerwone słupki dla danych "all_df_ml_ready_illegit"
#    plt.title(column)  # Ustawienie tytułu wykresu jako nazwa kolumny
#    plt.xlabel('Wartość')  # Etykieta osi x
#    plt.ylabel('Liczność')  # Etykieta osi y
#    plt.legend()  # Wyświetlenie legendy
#    plt.show()  # Wyświetlenie wykresu


### Macierz korelacji dla danych legit i illegit

In [109]:
#import pandas as pd
#import seaborn as sns
#import matplotlib.pyplot as plt
#
## Obliczanie macierzy korelacji dla danych "all_df_ml_ready_legit"
#corr_legit = all_df_ml_ready_legit.corr()
#
## Wyświetlanie macierzy korelacji dla danych "all_df_ml_ready_legit"
#plt.figure(figsize=(18, 16))
#sns.heatmap(corr_legit, annot=True, cmap='coolwarm', linewidths=0.5, annot_kws={'fontsize': 8})
#plt.title('Macierz korelacji - Legitimate', fontsize=10)
#plt.xticks(fontsize=8)
#plt.yticks(fontsize=8)
#plt.show()
#
## Obliczanie macierzy korelacji dla danych "all_df_ml_ready_illegit"
#corr_illegit = all_df_ml_ready_illegit.corr()
#
## Wyświetlanie macierzy korelacji dla danych "all_df_ml_ready_illegit"
#plt.figure(figsize=(18, 16))
#sns.heatmap(corr_illegit, annot=True, cmap='coolwarm', linewidths=0.5, annot_kws={'fontsize': 8})
#plt.title('Macierz korelacji - Illegitimate', fontsize=10)
#plt.xticks(fontsize=8)
#plt.yticks(fontsize=8)
#plt.show()
#
#
## Wyświetlanie macierzy korelacji dla danych "all_df_ml_ready"
#
#corr_all = all_df_ml_ready.corr()
#plt.figure(figsize=(18, 16))
#sns.heatmap(corr_all, annot=True, cmap='coolwarm', linewidths=0.5, annot_kws={'fontsize': 8})
#plt.title('Macierz korelacji - All', fontsize=10)
#plt.xticks(fontsize=8)
#plt.yticks(fontsize=8)
#plt.show()


In [110]:
all_df_ml_ready

Unnamed: 0,tld,registrar,registrant_country,domain_age,nameserver_domain,mail_domain,tls_age,tls_issuer,url_len,parameters_len,...,opening_bracket_percent,closing_bracket_percent,comma_percent,numbers_percent,url_entropy,is_redirect,subdomain_count,content_link_count,content_img_count,label
0,0,0,0,5,0,0,5,0,21,0,...,0,0,0,0,3,1,2,44,17,0
1,1,1,1,89,1,1,-1,-1,21,0,...,0,0,0,0,3,0,2,0,0,0
2,1,1,1,89,1,1,7,2,197,154,...,0,0,0,60,4,1,2,0,0,0
3,1,1,1,89,1,1,6,3,210,154,...,0,0,0,56,4,1,2,0,0,0
4,1,1,1,89,1,1,6,2,206,154,...,0,0,0,64,4,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430,1,1,1,1421,1,1,3,55,30,0,...,0,0,0,0,4,0,1,43,80,1
3431,149,7,2,2567,8,7,11,63,79,0,...,0,0,0,0,4,0,1,0,0,1
3432,1,10,12,7487,1,8,101,61,61,0,...,0,0,0,11,4,0,1,19,346,1
3433,1,12,1,3503,1,10,165,61,81,0,...,0,0,0,0,4,0,1,-1,-1,1


In [111]:
def train(df, fraction):
    features = df.drop(columns=['label'])
    classes = df['label']

    # Split the data into training and testing datasets
    features_train, features_test, classes_train, classes_test = train_test_split(features, classes, test_size=fraction, random_state=1234)
    
    models = []
    
    for model_name in AVAILABLE_MODELS:
        prediction_model = AVAILABLE_MODELS[model_name]
        predictions, model = build_model_return_predictions(prediction_model, features_train, classes_train, features_test)
        metrics = calculate_metrics(classes_test, predictions)
        print(model_name)
        print(metrics)
        models.append(model)
    return models

In [112]:
train(all_df_ml_ready, 0.3)

DECISION_TREE
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
RANDOM_FOREST
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
NEURAL_NETWORK
{'accuracy': 0.981672394043528, 'precision': 0.9304192685102587, 'recall': 0.9980861244019139, 'f1_score': 0.9630655586334257}
K_NEIGHBORS
{'accuracy': 0.9876288659793815, 'precision': 0.9834146341463414, 'recall': 0.9645933014354067, 'f1_score': 0.9739130434782609}
SUPPORT_VECTOR_MACHINE
{'accuracy': 0.9292096219931272, 'precision': 0.8717171717171717, 'recall': 0.8258373205741627, 'f1_score': 0.8481572481572482}
ADABOOST
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
NAIVE_BAYES
{'accuracy': 0.9903780068728523, 'precision': 0.9665116279069768, 'recall': 0.9942583732057416, 'f1_score': 0.980188679245283}
QUADRATIC_DISCRIMINANT_ANALYSIS
{'accuracy': 0.9963344788087056, 'precision': 0.990467111534795, 'recall': 0.9942583732057416, 'f1_score': 0.9923591212989494}




[DecisionTreeClassifier(),
 RandomForestClassifier(),
 MLPClassifier(max_iter=1000),
 KNeighborsClassifier(),
 SVC(),
 AdaBoostClassifier(learning_rate=0.55),
 GaussianNB(),
 QuadraticDiscriminantAnalysis()]

# TODO chłopaki
modele się mega przeuczają, trzeba sprawdzić przez jaki parametr tak jest, może obciąć zbiór wejściowy z jakiś gówno danych. Jak dane będą przefiltrowane i wciąż się będzie przeuczał to dropujecie kolumny sprawdzając czy jak ich nie ma to jest dużo gorzej, powinniście znaleźć jedną/parę po których usunięciu model stanie się gównem - o to chodzi

In [113]:
# deleting data where tls_issuer (and other data from whois) iquils to nan

all_df_ml_removed_nan_whois = all_df_ml_ready[all_df_ml_ready['tls_issuer'].notna()]

In [114]:
train(all_df_ml_removed_nan_whois, 0.3)

DECISION_TREE
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
RANDOM_FOREST
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
NEURAL_NETWORK
{'accuracy': 0.9942726231386025, 'precision': 0.989443378119002, 'recall': 0.9866028708133971, 'f1_score': 0.9880210828941064}
K_NEIGHBORS
{'accuracy': 0.9876288659793815, 'precision': 0.9834146341463414, 'recall': 0.9645933014354067, 'f1_score': 0.9739130434782609}
SUPPORT_VECTOR_MACHINE
{'accuracy': 0.9292096219931272, 'precision': 0.8717171717171717, 'recall': 0.8258373205741627, 'f1_score': 0.8481572481572482}
ADABOOST
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
NAIVE_BAYES
{'accuracy': 0.9903780068728523, 'precision': 0.9665116279069768, 'recall': 0.9942583732057416, 'f1_score': 0.980188679245283}
QUADRATIC_DISCRIMINANT_ANALYSIS
{'accuracy': 0.9963344788087056, 'precision': 0.990467111534795, 'recall': 0.9942583732057416, 'f1_score': 0.9923591212989494}




[DecisionTreeClassifier(),
 RandomForestClassifier(),
 MLPClassifier(max_iter=1000),
 KNeighborsClassifier(),
 SVC(),
 AdaBoostClassifier(learning_rate=0.55),
 GaussianNB(),
 QuadraticDiscriminantAnalysis()]

## VarianceThreshold

In [116]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# Convert DataFrame to a NumPy array
X = all_df_ml_removed_nan_whois.values

# Create VarianceThreshold object
selector = VarianceThreshold(threshold=0.2)  # Set the threshold as desired

# Fit the selector to the data
selector.fit(X)

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)

# Subset the DataFrame with the selected features
df_selected = all_df_ml_removed_nan_whois.iloc[:, selected_indices]
df_selected['label'] = all_df_ml_removed_nan_whois['label']

train(df_selected, 0.3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['label'] = all_df_ml_removed_nan_whois['label']


DECISION_TREE
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
RANDOM_FOREST
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
NEURAL_NETWORK
{'accuracy': 0.9894616265750287, 'precision': 0.9854227405247813, 'recall': 0.970334928229665, 'f1_score': 0.9778206364513019}
K_NEIGHBORS
{'accuracy': 0.9876288659793815, 'precision': 0.9834146341463414, 'recall': 0.9645933014354067, 'f1_score': 0.9739130434782609}
SUPPORT_VECTOR_MACHINE
{'accuracy': 0.929667812142039, 'precision': 0.8719758064516129, 'recall': 0.8277511961722488, 'f1_score': 0.8492881688757976}
ADABOOST
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
NAIVE_BAYES
{'accuracy': 0.9903780068728523, 'precision': 0.9665116279069768, 'recall': 0.9942583732057416, 'f1_score': 0.980188679245283}
QUADRATIC_DISCRIMINANT_ANALYSIS
{'accuracy': 0.9917525773195877, 'precision': 0.9719363891487371, 'recall': 0.9942583732057416, 'f1_score': 0.9829706717123935}


[DecisionTreeClassifier(),
 RandomForestClassifier(),
 MLPClassifier(max_iter=1000),
 KNeighborsClassifier(),
 SVC(),
 AdaBoostClassifier(learning_rate=0.55),
 GaussianNB(),
 QuadraticDiscriminantAnalysis()]

In [117]:
for col in all_df_ml_removed_nan_whois.columns:
    if all_df_ml[col].dtype == object:
        print(col)


tld
registrar
registrant_country
nameserver_domain
mail_domain
tls_issuer
is_redirect
label


## VIF

In [118]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

threshold = 6  # Set the VIF threshold as desired

# Calculate VIF for each feature
vif = pd.DataFrame()
vif["Feature"] = all_df_ml_removed_nan_whois.columns
vif["VIF"] = [variance_inflation_factor(all_df_ml_removed_nan_whois.values, i) for i in range(all_df_ml_removed_nan_whois.shape[1])]

# Get the column names that exceed the VIF threshold
columns_to_drop = vif[vif['VIF'] > threshold]['Feature']

# Drop the columns from the DataFrame
df_filtered = all_df_ml_removed_nan_whois.drop(columns_to_drop, axis=1)
df_filtered['label'] = all_df_ml_removed_nan_whois['label']

train(df_filtered, 0.3)


  return 1 - self.ssr/self.uncentered_tss


DECISION_TREE
{'accuracy': 0.9988545246277205, 'precision': 0.997131931166348, 'recall': 0.9980861244019139, 'f1_score': 0.9976087996174079}
RANDOM_FOREST
{'accuracy': 0.9997709049255441, 'precision': 0.9990439770554493, 'recall': 1.0, 'f1_score': 0.9995217599234816}
NEURAL_NETWORK
{'accuracy': 0.9915234822451318, 'precision': 0.9931506849315068, 'recall': 0.9712918660287081, 'f1_score': 0.9820996613449444}
K_NEIGHBORS
{'accuracy': 0.9802978235967926, 'precision': 0.9696376101860921, 'recall': 0.9473684210526315, 'f1_score': 0.9583736689254598}
SUPPORT_VECTOR_MACHINE
{'accuracy': 0.8982817869415808, 'precision': 0.8305830583058306, 'recall': 0.722488038277512, 'f1_score': 0.7727737973387923}
ADABOOST
{'accuracy': 0.9997709049255441, 'precision': 0.9990439770554493, 'recall': 1.0, 'f1_score': 0.9995217599234816}
NAIVE_BAYES
{'accuracy': 0.9676975945017182, 'precision': 0.8837011884550084, 'recall': 0.9961722488038277, 'f1_score': 0.9365721997300944}
QUADRATIC_DISCRIMINANT_ANALYSIS
{'acc



[DecisionTreeClassifier(),
 RandomForestClassifier(),
 MLPClassifier(max_iter=1000),
 KNeighborsClassifier(),
 SVC(),
 AdaBoostClassifier(learning_rate=0.55),
 GaussianNB(),
 QuadraticDiscriminantAnalysis()]

## Covaration threshold

In [119]:
# deleting features where covaration is above 0.7

import numpy as np

threshold = 0.2

# do not drop labels :)
corr_all_matrix = all_df_ml_removed_nan_whois.drop(columns=['label']).corr().abs()

# Select upper triangle of correlation matrix
upper = corr_all_matrix.where(np.triu(np.ones(corr_all_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]


# Drop features 
all_df_ml_removed_nan_whois_filtered_covariation = all_df_ml_removed_nan_whois.drop(to_drop, axis=1)

to_drop

['domain_age',
 'nameserver_domain',
 'mail_domain',
 'tls_age',
 'tls_issuer',
 'url_len',
 'parameters_len',
 'parameters_count',
 'numbers_percent',
 'url_entropy',
 'is_redirect',
 'subdomain_count',
 'content_img_count']

In [120]:
models = train(all_df_ml_removed_nan_whois_filtered_covariation, 0.3)

DECISION_TREE
{'accuracy': 0.9608247422680413, 'precision': 0.9405241935483871, 'recall': 0.8928229665071771, 'f1_score': 0.9160530191458027}
RANDOM_FOREST
{'accuracy': 0.9601374570446736, 'precision': 0.9257086999022482, 'recall': 0.9062200956937799, 'f1_score': 0.9158607350096711}
NEURAL_NETWORK
{'accuracy': 0.8710194730813288, 'precision': 0.7625272331154684, 'recall': 0.6698564593301436, 'f1_score': 0.7131940906775344}
K_NEIGHBORS
{'accuracy': 0.9241695303550974, 'precision': 0.8299445471349353, 'recall': 0.8593301435406698, 'f1_score': 0.8443817583450869}
SUPPORT_VECTOR_MACHINE
{'accuracy': 0.8016036655211913, 'precision': 0.92018779342723, 'recall': 0.1875598086124402, 'f1_score': 0.3116057233704293}
ADABOOST
{'accuracy': 0.9008018327605957, 'precision': 0.9421965317919075, 'recall': 0.6239234449760765, 'f1_score': 0.7507196315486471}
NAIVE_BAYES
{'accuracy': 0.7599083619702176, 'precision': 0.49774436090225566, 'recall': 0.3167464114832536, 'f1_score': 0.38713450292397666}
QUADR

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, msg_start, len(result))


In [121]:
models[0]

In [122]:
import pickle

filename = 'finalized_model.sav'
pickle.dump(models[0], open(filename, 'wb'))


## Normalize all features

In [None]:
# from sklearn.preprocessing import normalize

# colum = all_df_ml_removed_nan_whois_filtered_covariation.drop(columns=['label']).columns


# all_df_ml_ready_filtered_covariation_normalized = all_df_ml_removed_nan_whois_filtered_covariation.copy()

# all_df_ml_ready_filtered_covariation_normalized[colum] = normalize(all_df_ml_removed_nan_whois_filtered_covariation[colum])

# # all_df_ml_ready_filtered_covariation_normalized = all_df_ml_ready_filtered_covariation_normalized.drop(columns=['tls_issuer', 'tls_age', 'is_redirect'])


# train(all_df_ml_ready_filtered_covariation_normalized, 0.3)
