### Naive Bayes (ODP data set)

Implementação do classificador utilizando Naive Bayes.

In [1]:
import nltk
import itertools
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

from pymongo import MongoClient

In [2]:
# Load the log lines from MongoDB into a DataFrame.
# Exactly one client is created, pointed at the instance on localhost:27800;
# a throwaway default-address client is never constructed.
client = MongoClient('localhost', 27800)
db = client.odp_logs
db_log_lines = db.log_lines

# An empty filter selects every document in the collection.
df_urls = pd.DataFrame(list(db_log_lines.find({})))

# Row count first, then a preview of the first rows as the cell output.
print(len(df_urls))
df_urls.head()

1562978


Unnamed: 0,_id,category,normalized_url,url
0,59a2ed3d88b03213aca8fcd4,Adult,liquidgeneration,http://www.liquidgeneration.com/
1,59a2ed3e88b03213aca8fcd5,Adult,onlineanimeorg,http://www.onlineanime.org/
2,59a2ed3e88b03213aca8fcd6,Adult,ceresdtinekoisennosenfirst,http://www.ceres.dti.ne.jp/~nekoi/senno/senfir...
3,59a2ed3e88b03213aca8fcd7,Adult,galeonkmh,http://www.galeon.com/kmh/
4,59a2ed3e88b03213aca8fcd8,Adult,fanworkrecs,http://www.fanworkrecs.com/


In [3]:
# Vocabulary: every three-letter string over the lowercase ASCII alphabet,
# generated in lexicographic order ('aaa', 'aab', ..., 'zzz').
keywords = [
    first + second + third
    for first in ascii_lowercase
    for second in ascii_lowercase
    for third in ascii_lowercase
]
len(keywords)

17576

### Separação do conjunto

In [4]:
# Split the data into training (70%) and validation (30%) sets.
data = df_urls['normalized_url']
labels = df_urls['category']

# A fixed random_state makes the shuffle deterministic, so the split (and the
# metrics computed from it) are identical on every Restart & Run All.
url_train, url_test, label_train, label_test = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=42,
)

### Criação do pipeline

In [5]:
# Character 3-gram counts, restricted to the fixed `keywords` vocabulary.
trigram_counts = CountVectorizer(analyzer='char', ngram_range=(3, 3), vocabulary=keywords)
# Re-scale the counts with IDF weighting switched off (use_idf=False).
term_frequency = TfidfTransformer(use_idf=False)
# Multinomial Naive Bayes with its default settings.
naive_bayes = MultinomialNB()

# Chain the three stages; step names are kept so they can be addressed by name.
pipeline = Pipeline(steps=[
    ('wordsVector', trigram_counts),
    ('tf', term_frequency),
    ('classifier', naive_bayes),
])

In [6]:
# Fit every pipeline stage on the training URLs and their category labels.
# As the last expression of the cell, the fitted Pipeline is echoed as output.
pipeline.fit(url_train, label_train)

Pipeline(memory=None,
     steps=[('wordsVector', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 3), preprocessor=None, stop_words=None,
      ...        use_idf=False)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [7]:
# Predict a category for each URL in the held-out test split.
predictions = pipeline.predict(url_test)

### Relatório da classificação

In [8]:
# Per-category precision, recall, F1-score and support, comparing the true
# test labels against the pipeline's predictions.
print(classification_report(label_test, predictions))

             precision    recall  f1-score   support

      Adult       0.85      0.36      0.50     10635
       Arts       0.40      0.59      0.48     76136
   Business       0.34      0.64      0.44     72281
  Computers       0.53      0.28      0.37     35412
      Games       0.70      0.42      0.52     16881
     Health       0.67      0.20      0.31     17845
       Home       0.89      0.29      0.44      8479
       Kids       0.60      0.10      0.17     13803
       News       0.33      0.02      0.03      2621
 Recreation       0.47      0.22      0.29     32054
  Reference       0.46      0.36      0.40     17532
    Science       0.52      0.34      0.41     33139
   Shopping       0.42      0.14      0.21     28564
    Society       0.42      0.55      0.48     73151
     Sports       0.65      0.43      0.52     30361

avg / total       0.48      0.43      0.41    468894

