### SVM

Implementação do classificador utilizando SVM.

In [1]:
import nltk
import itertools

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

# SVM
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from pymongo import MongoClient

%matplotlib inline

In [2]:
# Load every document of the `log_lines` collection (database `odp_logs`)
# from the local MongoDB instance into a DataFrame.
#
# Only one client is created: the original also built a throw-away
# `MongoClient()` against the default host/port that was immediately
# overwritten and never closed.
client = MongoClient('localhost', 27800)
db = client.odp_logs
db_log_lines = db.log_lines

# find({}) applies no filter, so the whole collection is materialised in memory.
df_log_lines = pd.DataFrame(list(db_log_lines.find({})))

print(len(df_log_lines))
df_log_lines.head()

1562978


Unnamed: 0,_id,category,normalized_url,url
0,59a2ed3d88b03213aca8fcd4,Adult,liquidgeneration,http://www.liquidgeneration.com/
1,59a2ed3e88b03213aca8fcd5,Adult,onlineanimeorg,http://www.onlineanime.org/
2,59a2ed3e88b03213aca8fcd6,Adult,ceresdtinekoisennosenfirst,http://www.ceres.dti.ne.jp/~nekoi/senno/senfir...
3,59a2ed3e88b03213aca8fcd7,Adult,galeonkmh,http://www.galeon.com/kmh/
4,59a2ed3e88b03213aca8fcd8,Adult,fanworkrecs,http://www.fanworkrecs.com/


In [3]:
# Vocabulary: every 3-letter combination of lowercase ASCII letters
# ('aaa', 'aab', ..., 'zzz'), in lexicographic order.
trigram_tuples = itertools.product(ascii_lowercase, repeat=3)
keywords = list(map(''.join, trigram_tuples))
len(keywords)

17576

### Vetorização e frequência dos termos

In [4]:
# Two 13-point logarithmic grids. They are not referenced by any other cell
# visible in this notebook -- presumably left over from a hyper-parameter
# search; confirm before removing.
gamma_range = np.logspace(-9, 3, 13)
tol_range = np.logspace(0, 1, 13)

# Count character trigrams of each normalized URL against the fixed
# `keywords` vocabulary, so column i of the matrix corresponds to keywords[i].
count_vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
    vocabulary=keywords,
)
words_vector = count_vectorizer.transform(df_log_lines['normalized_url'])

# use_idf=False: rescale the raw counts to term frequencies only, without
# inverse-document-frequency weighting.
tf_transformer = TfidfTransformer(use_idf=False)
urls_tf = tf_transformer.fit_transform(words_vector)

In [5]:
# Sanity check on a single sample: show its non-zero TF entries, the trigram
# behind one of those columns, and the URL string the row was built from.
SAMPLE_ROW = 6584       # positional row of the URL being inspected
SAMPLE_FEATURE = 5697   # one of the non-zero columns of that row

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

print(urls_tf[SAMPLE_ROW])
print(feature_names[SAMPLE_FEATURE])
# .iloc keeps this lookup positional, like the matrix row above, instead of
# depending on the DataFrame's index labels.
print(df_log_lines['normalized_url'].iloc[SAMPLE_ROW])

  (0, 76)	0.353553390593
  (0, 1998)	0.353553390593
  (0, 2149)	0.353553390593
  (0, 5697)	0.353553390593
  (0, 7438)	0.353553390593
  (0, 7518)	0.353553390593
  (0, 15091)	0.353553390593
  (0, 16804)	0.353553390593
ild
lacywilder


### Divisão dos dados

In [7]:
# Hold out 30% of the samples for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same metrics); the original split was unseeded.
labels = df_log_lines['category']
url_train, url_test, label_train, label_test = train_test_split(
    urls_tf,
    labels,
    test_size=0.3,
    random_state=42,
)

### Classificação e teste

In [8]:
# Train a linear SVM on the training split.
# random_state fixes the solver's internal shuffling so repeated runs on the
# same data yield the same model; the original left it unseeded.
classifier_model = LinearSVC(random_state=42).fit(url_train, label_train)

In [9]:
# Predict a category for every URL vector in the held-out test split.
predictions = classifier_model.predict(url_test)

### Relatório da classificação

In [10]:
# Per-category precision, recall, F1 and support, comparing the true test
# labels against the model's predictions.
print(classification_report(label_test, predictions))

             precision    recall  f1-score   support

      Adult       0.70      0.52      0.60     10630
       Arts       0.49      0.63      0.55     76114
   Business       0.43      0.67      0.53     72138
  Computers       0.52      0.38      0.44     35371
      Games       0.67      0.53      0.59     16822
     Health       0.58      0.37      0.45     18067
       Home       0.76      0.44      0.56      8572
       Kids       0.55      0.16      0.25     13896
       News       0.52      0.17      0.25      2694
 Recreation       0.47      0.33      0.39     32048
  Reference       0.53      0.43      0.48     17370
    Science       0.55      0.44      0.49     33293
   Shopping       0.43      0.23      0.30     28449
    Society       0.51      0.59      0.55     73069
     Sports       0.63      0.58      0.60     30361

avg / total       0.52      0.51      0.50    468894

