## Importando Bibliotecas

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

### Feito leitura dos dados e verificado e definido o target do atual trabalho

#####Target Definido = Class Label
#####1 -> Fishing / 0 -> Legítimo

#### Utilização do método otimização de hiperparâmetros com o modelo de Decision Tree.

In [5]:
df = pd.read_csv('/content/Phishing_Legitimate_full.csv', index_col=False)
df.head()

#1 -> Fishing / #0 -> Legítimo

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,1,3,1,5,72,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,2,3,1,3,144,0,0,0,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,3,1,2,58,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,4,3,1,6,79,1,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,5,3,0,4,46,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


#### Criando matriz de correlação:
####Etapa importante para verificarmos qual a correlação das outra features com o target que foi definido.

In [6]:
pd.set_option('display.max_columns', 50)
corr_mat = df.corr(method='pearson')
corr_mat

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,NumQueryComponents,NumAmpersand,NumHash,NumNumericChars,NoHttps,RandomString,IpAddress,DomainInSubdomains,DomainInPaths,HttpsInHostname,HostnameLength,PathLength,QueryLength,DoubleSlashInPath,NumSensitiveWords,EmbeddedBrandName,PctExtHyperlinks,PctExtResourceUrls,ExtFavicon,InsecureForms,RelativeFormAction,ExtFormAction,AbnormalFormAction,PctNullSelfRedirectHyperlinks,FrequentDomainNameMismatch,FakeLinkInStatusBar,RightClickDisabled,PopUpWindow,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
id,1.0,-0.238066,0.027976,-0.234909,-0.046967,0.231091,-0.10717,0.009916,-0.084148,0.024811,0.042614,0.047733,-0.0027,-0.0504,-0.07222,0.065735,0.050651,-0.174331,-0.06388,0.081811,,-0.109106,-0.012361,-0.015069,-0.023122,-0.214784,-0.127501,-0.245957,-0.010172,-0.085581,-0.274347,0.073639,0.096961,0.130747,-0.266362,-0.427392,0.007209,-0.041793,0.052504,0.305298,0.204099,-0.081004,0.000481,0.063732,-0.010836,-0.020932,-0.169502,-0.06857,0.465757,-0.866025
NumDots,-0.238066,1.0,0.507992,0.079202,0.27738,-0.198769,0.093034,0.011425,0.109508,0.038068,0.022616,0.169283,0.255404,0.371642,0.211384,0.02459,-0.064631,0.158447,0.238473,0.009897,,0.289482,0.000579,0.263039,0.002462,0.305793,0.164434,0.116726,-0.042685,0.050103,0.072103,-0.088802,-0.009604,-0.03199,0.033516,0.177703,-0.00249,0.050992,-0.031698,-0.117766,-0.11721,0.045291,-0.012244,-0.343927,-0.151756,0.047821,0.032276,0.035586,-0.153835,0.294111
SubdomainLevel,0.027976,0.507992,1.0,0.007277,0.098579,-0.050018,0.246368,-0.005845,0.009518,-0.023593,-0.006003,-0.029706,-0.031766,-0.018054,0.051744,-0.044628,0.076544,-0.103343,0.522014,-0.019847,,0.605218,-0.039307,-0.026179,0.01207,0.158365,0.019471,-0.06389,-0.079427,-0.033224,-0.040207,-0.020441,0.002922,0.029142,0.106631,-0.061013,-0.023893,0.13353,-0.010968,-0.027739,-0.001061,0.026432,-0.00883,-0.679687,-0.086943,0.059584,-0.03079,0.093623,-0.034866,0.043132
PathLevel,-0.234909,0.079202,0.007277,1.0,0.390456,0.090503,0.027626,-0.005891,0.127291,0.015719,-0.043529,-0.140303,-0.104935,0.007947,0.233024,0.058466,0.250091,0.017895,0.014693,0.261468,,0.013025,0.612786,-0.085555,0.020221,0.121805,0.087328,0.018365,-0.038542,0.082978,0.099618,0.007842,-0.012905,-0.033858,0.085679,0.028918,-0.004003,0.049318,0.014823,-0.061813,0.000701,-0.02788,-0.002279,-0.024815,-0.39971,-0.000903,0.040901,0.064222,-0.095786,0.22945
UrlLength,-0.046967,0.27738,0.098579,0.390456,1.0,0.437313,0.11198,-0.003425,0.058233,0.350756,0.184947,0.476699,0.466038,0.227029,0.637872,0.036667,0.230698,0.094311,0.120201,0.227722,,0.16277,0.643924,0.649204,0.000162,0.223492,0.026183,-0.09543,-0.045713,0.061362,-0.00403,0.048207,0.015086,0.023152,-0.090563,-0.099667,0.014242,0.087848,0.03498,0.042056,0.033192,-0.053222,-0.036766,-0.133533,-0.800096,0.002019,-0.025366,-0.00086,0.142819,-0.074493
NumDash,0.231091,-0.198769,-0.050018,0.090503,0.437313,1.0,0.1132,-0.010139,-0.024395,-0.050195,-0.017938,-0.064746,-0.056693,-0.023398,0.12183,0.036594,0.248141,-0.056135,0.056684,0.315989,,0.010045,0.605889,-0.013725,-0.001462,-0.081733,-0.04344,-0.07049,0.116288,0.086994,-0.091292,0.094711,0.077007,0.049723,-0.113173,-0.188701,-0.007831,0.063704,0.092634,0.181687,0.228154,-0.06445,-0.006314,-0.023363,-0.468798,-0.137093,-0.085238,-0.122579,0.184808,-0.372235
NumDashInHostname,-0.10717,0.093034,0.246368,0.027626,0.11198,0.1132,1.0,-0.004409,-0.018043,-0.03517,-0.021648,-0.042379,-0.036415,-0.008395,0.120447,-0.089577,0.073574,-0.033672,0.542477,0.038602,,0.550247,0.00045,-0.029753,0.065692,0.081098,0.013179,-0.025364,-0.078277,0.004301,0.011456,0.075694,-0.011151,-0.04169,0.177997,-0.019193,-0.009018,0.439129,-0.015237,-0.059574,0.040743,-0.010096,-0.007712,-0.359595,-0.097446,0.079631,0.038776,0.070111,-0.113745,0.150444
AtSymbol,0.009916,0.011425,-0.005845,-0.005891,-0.003425,-0.010139,-0.004409,1.0,-0.001996,-0.005023,0.007226,-0.005908,-0.004298,-0.000832,0.004545,0.001844,0.016471,-0.002292,-0.00261,0.008322,,-0.013143,0.005943,-0.006132,-0.00052,-0.005135,0.020623,-0.007698,-0.00771,0.007712,0.007448,-0.009967,0.013264,-0.004283,-0.005682,-0.009074,-0.001288,-0.002064,-0.001216,-0.006661,-0.012422,-0.00316,-0.003067,0.003031,-0.007469,-0.006885,-0.004207,0.011307,0.013235,-0.017323
TildeSymbol,-0.084148,0.109508,0.009518,0.127291,0.058233,-0.024395,-0.018043,-0.001996,1.0,-0.022361,-0.013665,-0.024903,-0.026223,-0.005532,0.111735,-0.062954,0.009156,0.228278,-0.01736,0.019217,,-0.021237,0.082378,0.003963,-0.003458,0.068413,0.039874,-0.013789,-0.032663,-0.025698,0.001057,-0.047976,-0.021336,-0.013384,0.002866,0.038078,-0.008568,0.001243,-0.008085,-0.039048,-0.039905,-0.011051,-0.0204,0.016614,-0.036087,0.022478,0.022098,0.058519,-0.000144,0.095864
NumUnderscore,0.024811,0.038068,-0.023593,0.015719,0.350756,-0.050195,-0.03517,-0.005023,-0.022361,1.0,0.061638,0.34352,0.279688,-0.002685,0.131965,0.027451,-0.001518,0.10172,-0.029686,0.049452,,-0.053946,0.113097,0.380857,-0.005711,0.032299,-0.019931,-0.042287,-0.022014,-0.029186,-0.015282,0.05795,0.017797,0.052693,-0.095099,-0.037074,0.018468,-0.023862,0.002779,0.037957,-0.010341,-0.019348,-0.019467,0.003353,-0.25748,0.014952,-0.053835,-0.047775,0.100214,-0.098869


#### Drop de features
####Após visualização de matriz de correlação, foi feito o drop das features chaves como o ID e a Class_label (target).

In [7]:
#Drop do target e do index
X = df.drop(columns=['CLASS_LABEL', 'id'])
y = df['CLASS_LABEL']

#### Criação da Pipeline, com o StandardScaler e o model(DecisionTreeClassifier).

In [8]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=42))
])

#### Foi feito separação dos dados de treino e teste

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Utilizado Para otimização de Hiperparametro o Random Search

<b>Prós</b>:
- Mais eficiente do que Grid Search quando o espaço de hiperparâmetros é grande.
- Pode encontrar boas soluções com menos recursos computacionais.

<b>Contras</b>:
- Ao encontrar uma solução não ótima pode levar a resultados inconsistentes dado os parâmetros os quais foram testados


###### cv = Determina a estratégia de divisão de validação cruzada

In [10]:
#Criação de validação random search
param_random_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_features': list(range(1, 48)),
    'model__max_depth': [None, 4, 8, 10, 12],
    'model__class_weight': [None, 'balanced'],
    'model__min_samples_split': [2, 6, 10, 25]
}

random_search = RandomizedSearchCV(pipe, param_random_grid, random_state=42, n_iter=40, cv=5 ,n_jobs=-1)


#### Treinamento do modelo

In [11]:
random_search.fit(X_train, y_train)

#### Após treinamento, foram encontrados os melhores hiperparâmetros através do uso de Random Seacrh.

In [12]:
random_search.best_params_

{'model__min_samples_split': 10,
 'model__max_features': 42,
 'model__max_depth': None,
 'model__criterion': 'entropy',
 'model__class_weight': 'balanced'}

#### Atribuição da predição

In [13]:
y_pred = random_search.predict(X_test)

#### Classification_report

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       988
           1       0.96      0.98      0.97      1012

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

