### SVM

Implementação do classificador utilizando SVM.

In [None]:
import nltk
import itertools

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

from pymongo import MongoClient

%matplotlib inline

In [None]:
# Load the unique-hostname records from the local MongoDB instance.
# A single client is created for the port actually used; the previous extra
# `MongoClient()` call opened an unused client on the default host/port that
# was immediately overwritten and never closed.
client = MongoClient('localhost', 27800)
db = client.network_logs
db_unique_hostnames = db.unique_hostnames

# Materialise the whole collection (empty filter) into a DataFrame.
df_hostnames = pd.DataFrame(list(db_unique_hostnames.find({})))

print(len(df_hostnames))
df_hostnames.head()

In [None]:
# Build the vocabulary: every 3-letter string over the lowercase ASCII
# alphabet, in lexicographic order ('aaa', 'aab', ..., 'zzz').
keywords = [
    first + second + third
    for first in ascii_lowercase
    for second in ascii_lowercase
    for third in ascii_lowercase
]
len(keywords)

### Vetorização e frequência dos termos

In [None]:
# Count character trigrams in each hostname, restricted to the fixed
# `keywords` vocabulary (so no fitting of the vectorizer is needed).
count_vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
    vocabulary=keywords,
)
hostname_column = df_hostnames['normalized_hostname']
words_vector = count_vectorizer.transform(hostname_column)

# Rescale the raw counts with TfidfTransformer, with IDF weighting disabled.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(words_vector)
urls_tf = tf_transformer.transform(words_vector)

### Separação do conjunto

In [None]:
# Fixed seed so the 70/30 split (and every metric reported below) is the same
# on each Restart & Run All; without it the split changes on every run.
SPLIT_RANDOM_STATE = 42

labels = df_hostnames['category']
url_train, url_test, label_train, label_test = train_test_split(
    urls_tf,
    labels,
    test_size=0.3,
    random_state=SPLIT_RANDOM_STATE,
)

### Classificação e teste

In [None]:
# Baseline: linear SVM with default hyper-parameters, trained on the training split.
classifier_model = LinearSVC()
classifier_model.fit(url_train, label_train)

In [None]:
# Predict categories for the held-out hostnames with the baseline model.
predictions = classifier_model.predict(X=url_test)

### Relatório de classificação

In [None]:
# Per-class report for the baseline model on the test split.
baseline_report = classification_report(label_test, predictions)
print(baseline_report)

### GridSearch

In [1]:
from sklearn.model_selection import GridSearchCV

In [None]:
#np.linspace(0.001, 10, num=10)

In [None]:
# Parameters for the grid search.

# C candidates: 10 evenly spaced values generated with linspace.
C_range = np.linspace(0.001, 10, num=10)

# gamma candidates: 13 log-spaced values from 1e-9 to 1e3.
# Alternatives tried earlier:
#   np.logspace(-1, 10, num=13)
#   np.linspace(0.001, 10, num=20)
gamma_range = np.logspace(-9, 3, 13)

print(C_range)
#print(gamma_range)

param_grid = {
    "C": C_range,
    "gamma": gamma_range,
}

In [None]:
# Grid search over `param_grid` for an SVC capped at 1000 solver iterations;
# refit=True keeps a final model retrained with the best parameters.
svc_estimator = SVC(max_iter=1000, cache_size=800)
grid = GridSearchCV(
    estimator=svc_estimator,
    param_grid=param_grid,
    refit=True,
    verbose=3,
    n_jobs=4,
)

In [None]:
# May take a while: every C/gamma combination is fitted on the training split.
grid.fit(X=url_train, y=label_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Estimator configured with the selected parameters (kept because the search
# was created with refit=True).
grid.best_estimator_

In [None]:
# Predict categories for the held-out hostnames with the tuned model.
grid_predictions = grid.predict(X=url_test)

In [None]:
# Per-class report for the tuned model on the same test split.
grid_report = classification_report(label_test, grid_predictions)
print(grid_report)