# CPC802: Tópicos Especiais em Inteligência Computacional

A proposta do código a seguir é carregar um modelo já treinado, utilizando a biblioteca joblib, e efetuar algumas operações de teste com esse modelo.

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [0]:
# File names of the persisted artifacts that are loaded with joblib further
# down in this notebook: the vectorizer and the model.
filenameVectorizer = 'cpc802-20200129-223546-word.vectorizer'
filenameModel = 'cpc802-20200129-223546-word.sav'

In [0]:
import os
import urllib.parse

def loadFile(name):
    """Read a file of URLs and return its distinct lines, percent-decoded.

    Args:
        name: File name, resolved against the current working directory.
            An absolute path is used unchanged (``os.path.join`` semantics).

    Returns:
        A list of ``str`` with one entry per distinct line of the file, in
        order of first occurrence. Percent-escapes are decoded with
        ``urllib.parse.unquote``; line terminators are kept as read.

    Raises:
        OSError: If the file cannot be opened.
    """
    filepath = os.path.join(os.getcwd(), name)

    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    # Undecodable bytes are replaced rather than raising, which matches how
    # urllib.parse.unquote treats invalid UTF-8 in percent-escapes.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        # dict.fromkeys drops duplicate lines while keeping a stable,
        # first-seen order (a plain set would yield an arbitrary order).
        unique_lines = list(dict.fromkeys(f))

    # Duplicates are removed on the raw lines, before decoding.
    return [urllib.parse.unquote(line) for line in unique_lines]

In [0]:
import re

# Separators used to split a URL into tokens: '/', '-', '.', '=', '&', '?',
# runs of whitespace, '<', '>', ';', '(' and ')'. Written as a raw string so
# the backslashes reach the regex engine without invalid-escape warnings.
_TOKEN_SEPARATORS = re.compile(r'/|-|\.|=|&|\?|\s+|\<|\>|;|\(|\)')


def getTokens(input):
    """Split a URL into a list of word/token strings.

    Args:
        input: The URL as a ``str``.

    Returns:
        The list of substrings found between separators. Empty strings are
        returned where two separators are adjacent.

    NOTE(review): ``str(input.encode('utf-8'))`` yields the ``b'...'`` repr
    of the bytes, so the first token starts with ``b'``, the last one ends
    with the closing quote, and non-ASCII characters show up as ``\\x..``
    escapes. This looks like a Python 2 leftover, but it is kept on purpose:
    the vocabulary printed in this notebook contains these artifacts (for
    example the token ``"b'"``), so changing the tokenization would
    presumably no longer match the saved vectorizer - confirm before fixing.

    Example:
        getTokens('/a-b.php?x=1') -> ["b'", 'a', 'b', 'php', 'x', "1'"]
    """
    return _TOKEN_SEPARATORS.split(str(input.encode('utf-8')))

Vamos fazer a leitura dos dados a partir do Google Drive (estamos executando no Google Colab para validação).

In [22]:
# Mount Google Drive so the files stored there can be read from this runtime.
from google.colab import drive # helper from the google.colab package that provides mount()
ROOT = "/content/drive"     # mount point passed to drive.mount below
print(ROOT)                 # optional: echo the mount point
drive.mount(ROOT)           # mount Google Drive at ROOT

/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
%cd drive/'My Drive'/'Colab Notebooks'/cpc802

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/cpc802'
/content/drive/My Drive/Colab Notebooks/cpc802


In [0]:
import joblib

# Load the persisted vectorizer.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts that come from a trusted source.
# NOTE(review): presumably the vectorizer was fitted with getTokens as its
# tokenizer, in which case that function must already be defined in this
# session before loading - confirm.
vectorizer = joblib.load(filenameVectorizer)

In [25]:
# Print the vocabulary (token -> index mapping) as a sanity check that the
# vectorizer was loaded correctly.
print(vectorizer.vocabulary_)

{"b'": 17837, 'internacional': 32679, 'php': 41645, 'template': 48177, 'simbolos': 46695, 'titulo': 48597, 'symbols': 47695, 'cat': 21111, 'lang': 34923, 'us': 49731, 'page': 40555, 'imagem': 32026, 'simbolos1111111111111"': 46699, 'union': 49551, 'select': 46168, 'char': 21656, '45,120,49,45,81,45': 8833, ',char': 2356, '45,120,50,45,81,45': 8895, '45,120,51,45,81,45': 8906, '45,120,52,45,81,45': 8917, '45,120,53,45,81,45': 8928, '45,120,54,45,81,45': 8939, '45,120,55,45,81,45': 8950, '45,120,56,45,81,45': 8961, '45,120,57,45,81,45': 8972, '45,120,49,48,45,81,45': 8834, '45,120,49,49,45,81,45': 8845, '45,120,49,50,45,81,45': 8856, '45,120,49,51,45,81,45': 8867, '45,120,49,52,45,81,45': 8878, '45,120,49,53,45,81,45': 8889, '45,120,49,54,45,81,45': 8891, '45,120,49,55,45,81,45': 8892, '45,120,49,56,45,81,45': 8893, '45,120,49,57,45,81,45': 8894, '45,120,50,48,45,81,45': 8896, '45,120,50,49,45,81,45': 8897, '45,120,50,50,45,81,45': 8898, '45,120,50,51,45,81,45': 8899, '45,120,50,52,45,81

In [0]:
 # Load the persisted model.
 # NOTE(review): joblib.load unpickles the file - only load trusted artifacts.
lgs = joblib.load(filenameModel)

In [28]:
# Smoke test with requests whose nature is already known.
# NOTE(review): some samples are percent-encoded ('%20', '%3D', '%00') while
# loadFile decodes the URLs it reads - confirm which form the model expects.
known_requests = [
    '/././././././../../../../../winnt/win.ini',
    '/.git/config',
    '/wp-content/plugins/sell-downloads/sell-downloads.php?file=../../../../../../../../.././wp-config.php%00',
    '/?pagina=2%20AND%201001%3D5832',
    '/info.php',
    '/about_ver.php',
    '/admin',
]

# Turn the raw requests into the feature matrix the model consumes.
X_predict = vectorizer.transform(known_requests)

# First show the log-probabilities, then the predicted labels.
print(lgs.predict_log_proba(X_predict))
y_Predict = lgs.predict(X_predict)
print(y_Predict)

[[-2.01127682e+00 -1.43659890e-01]
 [-6.41587362e-01 -7.47510623e-01]
 [-1.63763566e+00 -2.16216622e-01]
 [-1.94047142e+00 -1.55060018e-01]
 [-1.93593002e-01 -1.73723264e+00]
 [-2.23636693e-01 -1.60746777e+00]
 [-3.38573178e-03 -5.68987760e+00]]
[1 0 1 1 0 0 0]
