### Redes neurais

Implementação do classificador utilizando redes neurais.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score

%matplotlib inline

In [2]:
# Single seed for the notebook: it seeds NumPy's global generator here and is
# the value later cells pass as `random_state`.
random_state = 47
np.random.seed(random_state)

In [3]:
# Load the hostname dataset and drop every row that has a missing value,
# printing the row count before and after the cleanup.
df_hostnames = pd.read_csv('../../../data/odp_reduzido_15.csv')
print(df_hostnames.shape[0])

df_hostnames = df_hostnames.dropna(axis=0, how='any')
print(df_hostnames.shape[0])

df_hostnames.head()

15000
15000


Unnamed: 0,_id,category,normalized_url,url
0,59a2ed6388b03213aca967aa,Adult,heisseeisen,http://www.heisse-eisen.de/
1,59a2ed4788b03213aca91a2d,Adult,nakedsportsmen,http://www.nakedsportsmen.com/
2,59a2ed4c88b03213aca9294e,Adult,daddyswap,http://www.daddyswap.com/
3,59a2ed6588b03213aca96b47,Adult,sexpalastgelsenkirchen,http://www.sexpalast-gelsenkirchen.de/
4,59a2ed6e88b03213aca9848b,Adult,geocitiesruoskanet,http://www.geocities.com/ruoskanet/


In [4]:
# Category name -> integer label. Each id is the position of the name in the
# tuple below, so the order of the tuple defines the label encoding.
dict_cat = {
    name: idx
    for idx, name in enumerate((
        'Adult',
        'Arts',
        'Business',
        'Computers',
        'Games',
        'Health',
        'Home',
        'Kids',
        'News',
        'Recreation',
        'Reference',
        'Science',
        'Shopping',
        'Society',
        'Sports',
    ))
}


def to_category_id(item):
    """Return the integer label that `dict_cat` assigns to category `item`.

    Raises:
        KeyError: if `item` is not one of the keys of `dict_cat`.
    """
    return dict_cat[item]

In [5]:
# Add the numeric label of each row's category as a new column, then preview.
df_hostnames['cat_id'] = df_hostnames['category'].map(to_category_id)
df_hostnames.head()

Unnamed: 0,_id,category,normalized_url,url,cat_id
0,59a2ed6388b03213aca967aa,Adult,heisseeisen,http://www.heisse-eisen.de/,0
1,59a2ed4788b03213aca91a2d,Adult,nakedsportsmen,http://www.nakedsportsmen.com/,0
2,59a2ed4c88b03213aca9294e,Adult,daddyswap,http://www.daddyswap.com/,0
3,59a2ed6588b03213aca96b47,Adult,sexpalastgelsenkirchen,http://www.sexpalast-gelsenkirchen.de/,0
4,59a2ed6e88b03213aca9848b,Adult,geocitiesruoskanet,http://www.geocities.com/ruoskanet/,0


### Vetorização e frequência dos termos

In [6]:
# X holds the normalized hostnames (model input); Y holds the numeric labels.
X, Y = (df_hostnames[column] for column in ('normalized_url', 'cat_id'))

In [7]:
# Learn a vocabulary of character bigrams from every hostname in X and show
# how many distinct bigrams were found.
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2)).fit(X)
len(count_vectorizer.vocabulary_)

669

In [8]:
# Encode every hostname in X as a row of counts over the fitted vocabulary.
words_vector = count_vectorizer.transform(X)

In [9]:
# Fit the TF-IDF weighting on the count matrix and apply it to that same
# matrix; the fitted transformer stays available as `tf_transformer`.
tf_transformer = TfidfTransformer(use_idf=True)
urls_tf = tf_transformer.fit_transform(words_vector)

In [10]:
# Print the dimensions of the TF-IDF matrix, then display its first row.
tfidf_shape = urls_tf.shape
print(tfidf_shape)
urls_tf[0]

(15000, 669)


<1x669 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [11]:
# Inspect the first URL's TF-IDF row: for every non-zero column print its
# position, its weight and the character bigram the column stands for.
# The dense row and the feature-name sequence are built once, before the loop,
# instead of being recomputed on every iteration.
first_row = urls_tf[0].toarray()

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older versions.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

for i in first_row.nonzero()[1]:
    print('pos: ', i)
    print(first_row[0, i])
    print('feature: ', feature_names[i])

pos:  108
0.246577972182
feature:  ee
pos:  112
0.615883247282
feature:  ei
pos:  117
0.20076543503
feature:  en
pos:  185
0.223115937393
feature:  he
pos:  225
0.466021155975
feature:  is
pos:  465
0.442199990089
feature:  se
pos:  479
0.238920695794
feature:  ss


### Separação do conjunto

In [12]:
# Hold out 30% of the rows for testing; the split is seeded for repeatability.
# NOTE(review): the bigram vocabulary and the TF-IDF weights were fitted on all
# rows before this split, so statistics from the test rows are already part of
# the features - consider splitting first and fitting on the training part only.
url_train, url_test, label_train, label_test = train_test_split(
    urls_tf,
    Y,
    test_size=0.3,
    random_state=random_state,
)

In [13]:
# Sanity check: training features and training labels, one shape per line.
print(url_train.shape, label_train.shape, sep='\n')

(10500, 669)
(10500,)


### Classificação e teste

In [18]:
# 0.38471380471380473 - 4000 without batch size
# (presumably the score of an earlier experiment - TODO confirm what it refers to)
# NOTE(review): the repr recorded under this cell lists
# hidden_layer_sizes=(600, 600) while the code below uses (600,), and the
# execution counter jumps from 13 to 18, so the saved output appears to come
# from a different run - re-run the notebook from a fresh kernel to confirm.
classifier_model_f = MLPClassifier(
    hidden_layer_sizes=(600,),
    max_iter=400,
    verbose=True,
    random_state=random_state,
)
classifier_model_f.fit(url_train, label_train)

Iteration 1, loss = 2.61749268
Iteration 2, loss = 2.24068715
Iteration 3, loss = 1.98215002
Iteration 4, loss = 1.79279393
Iteration 5, loss = 1.61009712
Iteration 6, loss = 1.41474257
Iteration 7, loss = 1.20005614
Iteration 8, loss = 0.98763563
Iteration 9, loss = 0.76546656
Iteration 10, loss = 0.56160537
Iteration 11, loss = 0.39303473
Iteration 12, loss = 0.26658730
Iteration 13, loss = 0.17908690
Iteration 14, loss = 0.12432034
Iteration 15, loss = 0.09283290
Iteration 16, loss = 0.07501108
Iteration 17, loss = 0.06058769
Iteration 18, loss = 0.05522856
Iteration 19, loss = 0.05227157
Iteration 20, loss = 0.04374613
Iteration 21, loss = 0.04527689
Iteration 22, loss = 0.04101056
Iteration 23, loss = 0.04127641
Iteration 24, loss = 0.04037064
Iteration 25, loss = 0.03704045
Iteration 26, loss = 0.03605104
Iteration 27, loss = 0.03838018
Iteration 28, loss = 0.03495877
Iteration 29, loss = 0.03574258
Iteration 30, loss = 0.03527287
Iteration 31, loss = 0.03426514
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(600, 600), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=47, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False)

In [19]:
# Predict a category id for every row of the held-out test split.
predictions = classifier_model_f.predict(url_test)

### Relatório de classificação

In [20]:
# Per-class precision, recall and F1 on the test split; the row labels are the
# numeric category ids.
report_text = classification_report(y_true=label_test, y_pred=predictions)
print(report_text)

             precision    recall  f1-score   support

          0       0.42      0.39      0.41       314
          1       0.22      0.20      0.21       295
          2       0.18      0.17      0.17       283
          3       0.27      0.24      0.26       294
          4       0.38      0.38      0.38       309
          5       0.27      0.33      0.30       315
          6       0.43      0.39      0.40       301
          7       0.19      0.25      0.21       273
          8       0.33      0.38      0.36       341
          9       0.17      0.17      0.17       290
         10       0.42      0.38      0.40       307
         11       0.27      0.23      0.25       308
         12       0.22      0.24      0.23       303
         13       0.25      0.16      0.20       294
         14       0.36      0.43      0.39       273

avg / total       0.29      0.29      0.29      4500



In [21]:
# Fraction of test rows whose predicted category id matches the true label.
model_accuracy = accuracy_score(y_true=label_test, y_pred=predictions)
model_accuracy

0.29133333333333333