### Redes neurais

Implemantação do classificador utilizando redes neurais.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score

%matplotlib inline

In [2]:
random_state = 47
np.random.seed(seed=random_state)

In [3]:
'''
Importação dos dados da base
'''

df_hostnames = pd.read_csv('../../../data/odp_reduzido_15.csv')

print(len(df_hostnames))

df_hostnames = df_hostnames.dropna()

print(len(df_hostnames))
df_hostnames.head()

15000
15000


Unnamed: 0,_id,category,normalized_url,url
0,59a2ed6388b03213aca967aa,Adult,heisseeisen,http://www.heisse-eisen.de/
1,59a2ed4788b03213aca91a2d,Adult,nakedsportsmen,http://www.nakedsportsmen.com/
2,59a2ed4c88b03213aca9294e,Adult,daddyswap,http://www.daddyswap.com/
3,59a2ed6588b03213aca96b47,Adult,sexpalastgelsenkirchen,http://www.sexpalast-gelsenkirchen.de/
4,59a2ed6e88b03213aca9848b,Adult,geocitiesruoskanet,http://www.geocities.com/ruoskanet/


dictr = {'Adult':0,
'Advertising':1,
'Arts and Culture':2,
'Business':3,
'Content Servers':4,
'Domain Parking':5,
'Education':6,
'Entertainment':7,
'File Sharing and Storage':8,
'Finance and Banking':9,
'Freeware and Software Downloads':10,
'Games':11,
'Health and Wellness':12,
'Information Technology':13,
'Internet Radio and TV':14,
'Job Search':15,
'Malicious or Illegal':16,
'Meaningless Content':17,
'News and Media':18,
'Newsgroups and Message Boards':19,
'Organizations':20,
'Personal Vehicles':21,
'Personal Websites and Blogs':22,
'Real Estate':23,
'Reference':24,
'Restaurant and Dining':25,
'Search Engines and Portals':26,
'Security':27,
'Shopping':28,
'Social Networking':29,
'Society and Lifestyles':30,
'Sports':31,
'Streaming Media and Download':32,
'Travel':33,
'Unrated':33,
'Web Hosting':35,
'Web communication':36,
'Web-based Applications':37
}

def transform_category(cat):
    return dictr[cat];

In [4]:
dict_cat = {
    'Adult': 0,
    'Arts': 1,
    'Business': 2,
    'Computers': 3,
    'Games': 4,
    'Health': 5,
    'Home': 6,
    'Kids': 7,
    'News': 8,
    'Recreation': 9,
    'Reference': 10,
    'Science': 11,
    'Shopping': 12,
    'Society': 13,
    'Sports': 14
}

def to_category_id(item):
    return dict_cat[item]

In [5]:
df_hostnames['cat_id'] = df_hostnames['category'].apply(to_category_id)
df_hostnames.head()

Unnamed: 0,_id,category,normalized_url,url,cat_id
0,59a2ed6388b03213aca967aa,Adult,heisseeisen,http://www.heisse-eisen.de/,0
1,59a2ed4788b03213aca91a2d,Adult,nakedsportsmen,http://www.nakedsportsmen.com/,0
2,59a2ed4c88b03213aca9294e,Adult,daddyswap,http://www.daddyswap.com/,0
3,59a2ed6588b03213aca96b47,Adult,sexpalastgelsenkirchen,http://www.sexpalast-gelsenkirchen.de/,0
4,59a2ed6e88b03213aca9848b,Adult,geocitiesruoskanet,http://www.geocities.com/ruoskanet/,0


### Vetorização e frequência dos termos

In [6]:
X = df_hostnames['normalized_url']
Y = df_hostnames['cat_id']

In [7]:
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2,2))
count_vectorizer.fit(X)
len(count_vectorizer.vocabulary_)

669

In [8]:
words_vector = count_vectorizer.transform(X)

In [9]:
tf_transformer = TfidfTransformer(use_idf=True).fit(words_vector)
urls_tf = tf_transformer.transform(words_vector)
urls_tf = urls_tf.toarray()

In [10]:
print(urls_tf.shape)
urls_tf[0]

(15000, 669)


array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [11]:
for i in urls_tf[0].nonzero()[0]:
    print('pos: ', i)
    print(urls_tf[0][i])
    print('feature: ', count_vectorizer.get_feature_names()[i])

pos:  108
0.246577972182
feature:  ee
pos:  112
0.615883247282
feature:  ei
pos:  117
0.20076543503
feature:  en
pos:  185
0.223115937393
feature:  he
pos:  225
0.466021155975
feature:  is
pos:  465
0.442199990089
feature:  se
pos:  479
0.238920695794
feature:  ss


### Separação do conjunto

In [12]:
url_train,url_test,label_train,label_test = train_test_split(urls_tf, Y, test_size=0.3,random_state=random_state)

In [13]:
print(url_train.shape)
print(label_train.shape) 

(10500, 669)
(10500,)


In [14]:
print(label_train)

1326      1
10151    10
7630      7
11493    11
9302      9
553       0
6079      6
12036    12
10140    10
3475      3
14553    14
4914      4
6860      6
3220      3
10082    10
14674    14
11806    11
13091    13
2150      2
8432      8
14244    14
6804      6
4270      4
11472    11
5502      5
8730      8
2374      2
13701    13
11633    11
1772      1
         ..
14409    14
10800    10
7733      7
14657    14
9961      9
12786    12
6825      6
1608      1
7092      7
4475      4
14423    14
6209      6
10731    10
14706    14
13018    13
562       0
13111    13
3185      3
2733      2
10311    10
8967      8
7227      7
3095      3
2896      2
8883      8
6728      6
11528    11
14663    14
1926      1
5255      5
Name: cat_id, dtype: int64


### Classificação e teste

In [15]:
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(url_train)



In [16]:
feature_columns

[_RealValuedColumn(column_name='', dimension=669, default_value=None, dtype=tf.float64, normalizer=None)]

In [27]:
classifier = tf.contrib.learn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[600,600], n_classes=15,
      optimizer=tf.train.GradientDescentOptimizer(
        learning_rate=0.1
      )
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13593ecf8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/var/folders/bv/14k4b18x05n142z0w3pt969r0000gn/T/tmpr1n4zebm'}


In [None]:
classifier.fit(url_train, label_train, steps=250)

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


  equality = a == b


Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/bv/14k4b18x05n142z0w3pt969r0000gn/T/tmpr1n4zebm/model.ckpt.
INFO:tensorflow:loss = 2.70886, step = 1
INFO:tensorflow:global_step/sec: 0.293825
INFO:tensorflow:loss = 2.70289, step = 101 (340.344 sec)


In [None]:
predictions = list(classifier.predict_classes(url_test, as_iterable=True))

### Relatório de classificação

In [None]:
score = accuracy_score(label_test, predictions)
print('Accuracy: {0:f}'.format(score))

In [None]:
print(classification_report(label_test, predictions))

# 