### Redes neurais

Implementação do classificador utilizando redes neurais.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score

%matplotlib inline

In [None]:
# One seed for the whole notebook: it seeds NumPy's global generator here and
# is reused wherever a `random_state` argument is accepted.
random_state = 47
np.random.seed(random_state)

In [None]:
# Load the dataset from CSV and drop every row that has a missing value.
# The row count is printed before and after the drop so the number of
# discarded rows can be read off directly.
df_hostnames = pd.read_csv('../../../data/odp_reduzido_218.csv')
print(df_hostnames.shape[0])

df_hostnames = df_hostnames.dropna()
print(df_hostnames.shape[0])

# Preview of the cleaned frame.
df_hostnames.head()

In [None]:
# Category name -> integer class id.  Ids are assigned in the order the names
# are listed (alphabetical), starting at 0.
dict_cat = {
    name: idx
    for idx, name in enumerate((
        'Adult',
        'Arts',
        'Business',
        'Computers',
        'Games',
        'Health',
        'Home',
        'Kids',
        'News',
        'Recreation',
        'Reference',
        'Science',
        'Shopping',
        'Society',
        'Sports',
    ))
}


def to_category_id(item):
    """Return the integer class id that ``dict_cat`` assigns to ``item``.

    Raises ``KeyError`` when ``item`` is not a key of ``dict_cat``.
    """
    return dict_cat[item]

In [None]:
# Add a numeric label column by translating each category name element-wise
# through to_category_id.
df_hostnames['cat_id'] = df_hostnames['category'].map(to_category_id)

# Preview with the new column.
df_hostnames.head()

### Vetorização e frequência dos termos

In [None]:
# Model inputs (normalized URL strings) and targets (numeric category ids).
X, Y = df_hostnames['normalized_url'], df_hostnames['cat_id']

In [None]:
# Character 3-gram features hashed into 800 columns.
# NOTE(review): per the scikit-learn docs HashingVectorizer is stateless, so
# fit() learns nothing from X; the call is kept so the cell's last expression
# (and therefore its displayed result) is unchanged.
count_vectorizer = HashingVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
    n_features=800,
)
count_vectorizer.fit(X)

In [None]:
# Turn every URL in X into its hashed feature row.
words_vector = count_vectorizer.transform(X)

In [None]:
# Dimensions of the feature matrix: (rows, hashed feature columns).
words_vector.shape

In [None]:
# Indices of the non-zero entries in the first row of the feature matrix.
words_vector[0].nonzero()

In [None]:
#tf_transformer = TfidfTransformer(use_idf=True).fit(words_vector)
#urls_tf = tf_transformer.transform(words_vector)

In [None]:
#print(urls_tf.shape)
#urls_tf[0]

In [None]:
# Print the column index and the value of every non-zero feature in the first
# row of the feature matrix.  The row is converted to a dense array once,
# outside the loop, instead of being re-converted on every iteration.
first_row = words_vector[0].toarray()
for i in first_row.nonzero()[1]:
    print('pos: ', i)
    print(first_row[0, i])
# No feature name is printed: per the scikit-learn docs, hashed columns cannot
# be mapped back to the character n-grams that produced them.

### Separação do conjunto

In [None]:
# Hold out 30% of the rows for testing; the shared seed makes the split
# reproducible.
(url_train, url_test, label_train, label_test) = train_test_split(
    words_vector,
    Y,
    test_size=0.3,
    random_state=random_state,
)

In [None]:
# Sanity check: training features and training labels, one shape per line.
print(url_train.shape, label_train.shape, sep='\n')

### Classificação e teste

In [None]:
# 0.47 with log2 (presumably the accuracy obtained with max_features='log2').
# NOTE(review): the notebook heading mentions neural networks, but the model
# trained in this cell is a random forest.
classifier_model_f = RandomForestClassifier(
    n_estimators=1000,
    max_features='log2',
    n_jobs=-1,
    verbose=3,
    random_state=random_state,
)
classifier_model_f.fit(url_train, label_train)

In [None]:
# Predict a category id for every row of the held-out test features.
predictions = classifier_model_f.predict(url_test)

### Relatório de classificação

In [None]:
# Text report comparing the true test labels with the predictions.
print(classification_report(label_test, predictions))

In [None]:
# Accuracy score of the predictions against the true test labels; the bare
# name on the last line makes the notebook display the value.
model_accuracy = accuracy_score(label_test, predictions)
model_accuracy