# CPC802: Tópicos Especiais em Inteligência Computacional

A proposta do seguinte código é carregar um modelo já treinado utilizando a biblioteca joblib e efetuar uma comparação de acurácia de cada modelo.

O vetor utilizado e os modelos são carregados nas variáveis a seguir. Ao fim do arquivo, é impressa a acurácia dos 3 modelos juntos, o número de erros e a lista com as URLs que foram classificadas com erro.

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [0]:
filenameVectorizer   = 'cpc802-20200209-030517-perceptron-word.vectorizer'
filenameModel_logreg = 'cpc802-20200209-030517-logreg-word.sav'
filenameModel_lsvm   = 'cpc802-20200209-030517-lsvm-word.sav'
filenameModel_percep = 'cpc802-20200209-030517-perceptron-word.sav'

In [0]:
import os
import urllib.parse

# Função que lê os dados do arquivo.
def loadFile(name):
    # Obtém diretório e nome completo do arquivo (com path).
    directory = str(os.getcwd())
    filepath = os.path.join(directory, name)

    # Faz a leitura das linhas do arquivo.
    with open(filepath,'r') as f:
        data = f.readlines()

    # Transforma as linhas em um Set (retira as duplicadas) e depois em uma lista.
    data = list(set(data))

    # Realiza o decode das URLs e coloca elas na lista result.
    result = []
    for d in data:
        d = str(urllib.parse.unquote(d))
        result.append(d)

    return result

In [0]:
import re

# Função que separa cada URL em uma lista de palavras/tokens, utilizando como separadores: '/', '-', '.'
def getTokens(input):
    return re.split('/|-|\.|=|&|\?|\s+|\<|\>|;|\(|\)', str(input.encode('utf-8')))

# Exemplo:
# getTokens('/wikipedia/noticias/museu-nacional-e-10.php?paramenter=10&c=select * from table&opa=<xss(alert)>')

Vamos fazer a leitura dos dados do Google Drive (estamos executando no Google Colab para validação)

In [120]:
# Mount Google Drive
from google.colab import drive # import drive from google colab
ROOT = "/content/drive"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)
drive.mount(ROOT)           # we mount the google drive at /content/drive

/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
%cd drive/'My Drive'/'Colab Notebooks'/cpc802

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/cpc802'
/content/drive/My Drive/Colab Notebooks/cpc802


In [0]:
import joblib

# Vamos carregar os vetores.
vectorizer = joblib.load(filenameVectorizer)

In [127]:
# Imprime vocabulário para saber se está tudo certo.
print(vectorizer.vocabulary_)

{"b'": 16301, 'internacional': 30474, 'php': 39340, 'template': 45794, 'simbolos': 44336, 'titulo': 46209, 'symbols': 45319, 'cat': 19316, 'lang': 32695, 'us': 47340, 'page': 38264, 'imagem': 29825, 'simbolos1111111111111"': 44340, 'union': 47161, 'select': 43813, 'char': 19822, '45,120,49,45,81,45': 8319, ',char': 2346, '45,120,50,45,81,45': 8381, '45,120,51,45,81,45': 8392, '45,120,52,45,81,45': 8403, '45,120,53,45,81,45': 8414, '45,120,54,45,81,45': 8425, '45,120,55,45,81,45': 8436, '45,120,56,45,81,45': 8447, '45,120,57,45,81,45': 8458, '45,120,49,48,45,81,45': 8320, '45,120,49,49,45,81,45': 8331, '45,120,49,50,45,81,45': 8342, '45,120,49,51,45,81,45': 8353, '45,120,49,52,45,81,45': 8364, '45,120,49,53,45,81,45': 8375, '45,120,49,54,45,81,45': 8377, '45,120,49,55,45,81,45': 8378, '45,120,49,56,45,81,45': 8379, '45,120,49,57,45,81,45': 8380, '45,120,50,48,45,81,45': 8382, '45,120,50,49,45,81,45': 8383, '45,120,50,50,45,81,45': 8384, '45,120,50,51,45,81,45': 8385, '45,120,50,52,45,81

In [0]:
 # Vamos carregar o modelo.
lgs = joblib.load(filenameModel_logreg)
lsvm = joblib.load(filenameModel_lsvm)
percep = joblib.load(filenameModel_percep)

In [129]:
fileForValidationBad = 'badqueries_gg2.txt'
fileForValidationGood = 'goodqueries_gg2.txt'

# Prepara dataset real apenas para validação.
otherBadQueries = loadFile(fileForValidationBad)
otherGoodQueries = loadFile(fileForValidationGood)
allOtherQueries = otherBadQueries + otherGoodQueries

# Vetoriza dataset real.
X_real = vectorizer.transform(allOtherQueries)

# Calcula as inferências do dataset real.
y_lgs = lgs.predict(X_real)
y_lsvm = lsvm.predict(X_real)
y_percep = percep.predict(X_real)

# Vamos colocar os labels no dataset real.
#   0 - normal
#   1 - malicious
yGood_gg = [0 for i in range(0, len(otherGoodQueries))]
yBad_gg = [1 for i in range(0, len(otherBadQueries))]
y_real_with_label = yBad_gg + yGood_gg

# Verifica acertos e erros de predição usando dataset real.
acertos = 0
erros = 0
errosList = {}
for i in range(len(y_real_with_label)):
    # Verifica se usando os 3 modelos a acurácia é melhor
    if   y_real_with_label[i] == y_lgs[i]:
        acertos = acertos + 1
    elif y_real_with_label[i] == y_lsvm[i]:
        acertos = acertos + 1
    elif y_real_with_label[i] == y_percep[i]:
        acertos = acertos + 1
    elif (y_real_with_label[i] == 0 and y_lgs[i] == 1):
        acertos = acertos + 1
    elif (y_real_with_label[i] == 0 and y_lsvm[i] == 1):
        acertos = acertos + 1
    elif (y_real_with_label[i] == 0 and y_percep[i] == 1):
        acertos = acertos + 1
    else:
        errosList[i] = allOtherQueries[i]
        erros = erros + 1

print(acertos/len(y_real_with_label))

0.9994293540287605


In [130]:
erros

30

In [131]:
errosList

{21: '/index.php?id=/_templates/mobile-noticia.php&xml=/_conteudo/2013/11/voce_ag/tv_e_famosos/noticias/1467958-guitarrista-processa-chitaozinho-e-xororo-e-ganha-r-1-1-milhao.html\n',
 606: '/tftpboot/\n',
 2307: '/==r?e.unique&&h.has(n)||l.push(n):n&&n.length&&"string"!==r&&s(n)})}(arguments),t?i=l.length:n&&(a=r,c(n))}return+this}\n',
 2421: '/bin/?page=login\n',
 2563: '/\n',
 2945: '/tftp/\n',
 4167: '/Nessus553614390.html\n',
 5058: '/Overrides.playerVersion\n',
 5459: '/platform-telemetry/platform\n',
 6719: '/gazetadealagoas/noticia.php?c=328068/images/ios/img-iphone.png\n',
 8499: '/compartilhar1/do?ajax=1&share=arquivo/614415/derivativos-levam--sadia-prejuizo-de-r-248-bilhoes-em-2008\n',
 8970: '/Telerik.Web.UI.WebResource.axd?type=rau\n',
 9141: '/wp-json/wp/v2/posts/None\n',
 9407: '/noticia_interna.php?cat=oclube&session=5717&subtag=Base\n',
 9525: '/sdk\n',
 9707: '/noticia.php?c=249469&e=34\n',
 9746: '/licenciamento.php?cat=negocios\n',
 10258: '/install/index.php.bak\n'

In [132]:
# Testando com valores conhecidos.
X_predict = [
             '/././././././../../../../../winnt/win.ini',
             '/.git/config',
             '/wp-content/plugins/sell-downloads/sell-downloads.php?file=../../../../../../../../.././wp-config.php%00',
             '/?pagina=2%20AND%201001%3D5832',
             '/info.php',
             '/about_ver.php',
             '/admin',
             ]
X_predict = vectorizer.transform(X_predict)
y_Predict = lgs.predict(X_predict)
print(y_Predict)
y_Predict = lsvm.predict(X_predict)
print(y_Predict)
y_Predict = percep.predict(X_predict)
print(y_Predict)

[1 1 1 1 0 0 0]
[1 1 1 1 0 0 0]
[1 1 1 1 0 0 0]
