### Redes neurais

Implementação do classificador utilizando redes neurais.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score

%matplotlib inline

In [2]:
# Single seed for the notebook: applied to NumPy's global RNG here and passed
# as random_state to the split and the classifier in later cells.
random_state = 47
np.random.seed(random_state)

In [3]:
# Load the hostname dataset from CSV and print the row count before and
# after discarding rows that contain missing values.
df_hostnames = pd.read_csv('../../../data/odp_reduzido_218.csv')
print(df_hostnames.shape[0])

df_hostnames = df_hostnames.dropna(axis=0, how='any')
print(df_hostnames.shape[0])

# Preview the first rows of the cleaned frame.
df_hostnames.head()

218972
218972


Unnamed: 0,_id,category,normalized_url,url
0,59a2ed6888b03213aca97499,Adult,gayprogrammes,http://www.gay-programmes.com/
1,59a2ed4e88b03213aca92efe,Adult,watchsomepornsiteskinkylesbianpornoindex,http://www.watchsomeporn.com/sites8/kinkylesbi...
2,59a2ed4588b03213aca914d1,Adult,mmsitescarnindex,http://www.mmsites.com/carn/2index.htm
3,59a2ed5588b03213aca93f14,Adult,clubnikkidevon,http://www.club-nikki.com/devon/
4,59a2ed6b88b03213aca97d86,Adult,hotelvenusbayside,http://www.hotelvenus.co.jp/bayside/


In [4]:
# Category names listed in class-id order: a name's position is its id.
_CATEGORY_NAMES = (
    'Adult', 'Arts', 'Business', 'Computers', 'Games',
    'Health', 'Home', 'Kids', 'News', 'Recreation',
    'Reference', 'Science', 'Shopping', 'Society', 'Sports',
)

# Category name -> integer class id (0 through 14).
dict_cat = {name: class_id for class_id, name in enumerate(_CATEGORY_NAMES)}


def to_category_id(item):
    """Return the integer class id registered in ``dict_cat`` for ``item``.

    Args:
        item: a category name that is a key of ``dict_cat``.

    Returns:
        The integer id mapped to ``item``.

    Raises:
        KeyError: if ``item`` is not a key of ``dict_cat``.
    """
    return dict_cat[item]

In [5]:
# Translate each row's category name into its integer class id, store it in a
# new 'cat_id' column, then preview the frame.
df_hostnames['cat_id'] = df_hostnames['category'].map(to_category_id)
df_hostnames.head()

Unnamed: 0,_id,category,normalized_url,url,cat_id
0,59a2ed6888b03213aca97499,Adult,gayprogrammes,http://www.gay-programmes.com/,0
1,59a2ed4e88b03213aca92efe,Adult,watchsomepornsiteskinkylesbianpornoindex,http://www.watchsomeporn.com/sites8/kinkylesbi...,0
2,59a2ed4588b03213aca914d1,Adult,mmsitescarnindex,http://www.mmsites.com/carn/2index.htm,0
3,59a2ed5588b03213aca93f14,Adult,clubnikkidevon,http://www.club-nikki.com/devon/,0
4,59a2ed6b88b03213aca97d86,Adult,hotelvenusbayside,http://www.hotelvenus.co.jp/bayside/,0


### Vetorização e frequência dos termos

In [6]:
# Model inputs (normalized URL strings) and targets (integer class ids).
X, Y = df_hostnames['normalized_url'], df_hostnames['cat_id']

In [7]:
# Hash character 3-grams of each normalized URL into a fixed 800-column
# feature space.
count_vectorizer = HashingVectorizer(analyzer='char', ngram_range=(3,3), n_features=800)
# NOTE(review): scikit-learn documents HashingVectorizer as stateless, so this
# call presumably learns nothing from X; as the cell's last expression it
# displays the estimator's parameters.
count_vectorizer.fit(X)

HashingVectorizer(alternate_sign=True, analyzer='char', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True, n_features=800,
         ngram_range=(3, 3), non_negative=False, norm='l2',
         preprocessor=None, stop_words=None, strip_accents=None,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None)

In [8]:
# Turn every normalized URL into its hashed 3-gram feature vector.
words_vector = count_vectorizer.transform(X)

In [9]:
# Check the feature matrix dimensions (rows, hashed features).
words_vector.shape

(218972, 800)

In [10]:
# Row and column indices of the non-zero entries in the first URL's vector.
words_vector[0].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([  6,  72, 114, 579, 594, 700, 702, 738, 756, 784, 787], dtype=int32))

In [11]:
#tf_transformer = TfidfTransformer(use_idf=True).fit(words_vector)
#urls_tf = tf_transformer.transform(words_vector)

In [12]:
#print(urls_tf.shape)
#urls_tf[0]

In [13]:
# Print the position and value of every non-zero feature in the first URL's
# vector. The row is converted to a dense 1-D array once, instead of calling
# toarray() again on every loop iteration.
first_row_dense = words_vector[0].toarray()[0]
for i in first_row_dense.nonzero()[0]:
    print('pos: ', i)
    print(first_row_dense[i])

pos:  6
0.301511344578
pos:  72
0.301511344578
pos:  114
-0.301511344578
pos:  579
-0.301511344578
pos:  594
-0.301511344578
pos:  700
0.301511344578
pos:  702
0.301511344578
pos:  738
-0.301511344578
pos:  756
0.301511344578
pos:  784
-0.301511344578
pos:  787
-0.301511344578


### Separação do conjunto

In [14]:
# Hold out 30% of the vectors and labels for testing; the shared seed keeps
# the split reproducible.
url_train, url_test, label_train, label_test = train_test_split(
    words_vector,
    Y,
    test_size=0.3,
    random_state=random_state,
)

In [15]:
# Sanity check: the training features and training labels should have the
# same number of rows.
print(url_train.shape, label_train.shape, sep='\n')

(153280, 800)
(153280,)


### Classificação e teste

In [None]:
# Original author's note: 0.47 with max_features='log2'
# (presumably the accuracy obtained with this setting; TODO confirm).
# verbose=3 makes the fit print a "building tree k of 1000" line per tree.
classifier_model_f = RandomForestClassifier(
    n_estimators=1000,
    max_features='log2',
    n_jobs=-1,
    random_state=random_state,
    verbose=3,
)
classifier_model_f.fit(url_train, label_train)

building tree 2 of 1000
building tree 1 of 1000building tree 3 of 1000building tree 4 of 1000


building tree 5 of 1000
building tree 6 of 1000
building tree 7 of 1000
building tree 8 of 1000
building tree 9 of 1000
building tree 10 of 1000
building tree 11 of 1000
building tree 12 of 1000
building tree 13 of 1000
building tree 14 of 1000
building tree 15 of 1000
building tree 16 of 1000
building tree 17 of 1000
building tree 18 of 1000
building tree 19 of 1000
building tree 20 of 1000
building tree 21 of 1000
building tree 22 of 1000
building tree 23 of 1000
building tree 24 of 1000


In [None]:
# Predict a class id for every vector in the held-out test set.
predictions = classifier_model_f.predict(url_test)

### Relatório de classificação

In [None]:
# Print scikit-learn's classification report comparing the predictions with
# the true test labels.
print(classification_report(label_test, predictions))

In [None]:
# Accuracy score of the predictions against the true test labels; the bare
# name on the last line displays the value.
model_accuracy = accuracy_score(label_test, predictions)
model_accuracy