### Redes neurais

Implementação do classificador utilizando redes neurais.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from string import ascii_lowercase

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score,mean_squared_error

%matplotlib inline

In [2]:
# One seed for the whole notebook: it primes NumPy's global generator here and
# is passed to the train/test split further down.
random_state = 47
np.random.seed(random_state)

In [3]:
'''
Importação dos dados da base
'''
# Load the hostname dataset from CSV and report how many rows were read.
hostnames_raw = pd.read_csv('../../../data/odp_reduzido_15.csv')
print(len(hostnames_raw))

# Discard every row that has a missing value, then report the remaining count.
df_hostnames = hostnames_raw.dropna()
print(len(df_hostnames))

df_hostnames.head()

15000
15000


Unnamed: 0,_id,category,normalized_url,url
0,59a2ed6388b03213aca967aa,Adult,heisseeisen,http://www.heisse-eisen.de/
1,59a2ed4788b03213aca91a2d,Adult,nakedsportsmen,http://www.nakedsportsmen.com/
2,59a2ed4c88b03213aca9294e,Adult,daddyswap,http://www.daddyswap.com/
3,59a2ed6588b03213aca96b47,Adult,sexpalastgelsenkirchen,http://www.sexpalast-gelsenkirchen.de/
4,59a2ed6e88b03213aca9848b,Adult,geocitiesruoskanet,http://www.geocities.com/ruoskanet/


# Category name -> integer id. Ids are unique and contiguous (0..37) in the
# order listed; 'Unrated' previously shared id 33 with 'Travel', which made the
# two categories indistinguishable once encoded.
dictr = {
    'Adult': 0,
    'Advertising': 1,
    'Arts and Culture': 2,
    'Business': 3,
    'Content Servers': 4,
    'Domain Parking': 5,
    'Education': 6,
    'Entertainment': 7,
    'File Sharing and Storage': 8,
    'Finance and Banking': 9,
    'Freeware and Software Downloads': 10,
    'Games': 11,
    'Health and Wellness': 12,
    'Information Technology': 13,
    'Internet Radio and TV': 14,
    'Job Search': 15,
    'Malicious or Illegal': 16,
    'Meaningless Content': 17,
    'News and Media': 18,
    'Newsgroups and Message Boards': 19,
    'Organizations': 20,
    'Personal Vehicles': 21,
    'Personal Websites and Blogs': 22,
    'Real Estate': 23,
    'Reference': 24,
    'Restaurant and Dining': 25,
    'Search Engines and Portals': 26,
    'Security': 27,
    'Shopping': 28,
    'Social Networking': 29,
    'Society and Lifestyles': 30,
    'Sports': 31,
    'Streaming Media and Download': 32,
    'Travel': 33,
    'Unrated': 34,
    'Web Hosting': 35,
    'Web communication': 36,
    'Web-based Applications': 37,
}


def transform_category(cat):
    """Return the integer id registered for category name `cat`.

    Raises KeyError when `cat` is not a key of `dictr`.
    """
    return dictr[cat]

In [4]:
# Category names listed in the order of their integer ids (0..14).
_CATEGORY_NAMES = (
    'Adult',
    'Arts',
    'Business',
    'Computers',
    'Games',
    'Health',
    'Home',
    'Kids',
    'News',
    'Recreation',
    'Reference',
    'Science',
    'Shopping',
    'Society',
    'Sports',
)

# Category name -> integer id, derived from the position in the tuple above.
dict_cat = {name: idx for idx, name in enumerate(_CATEGORY_NAMES)}


def to_category_id(item):
    """Look up the integer id of the category name `item` in `dict_cat`.

    Raises KeyError when `item` is not one of the known category names.
    """
    return dict_cat[item]

In [5]:
# Encode each row's category name as its integer id and store it in `cat_id`.
category_ids = df_hostnames['category'].apply(to_category_id)
df_hostnames['cat_id'] = category_ids
df_hostnames.head()

Unnamed: 0,_id,category,normalized_url,url,cat_id
0,59a2ed6388b03213aca967aa,Adult,heisseeisen,http://www.heisse-eisen.de/,0
1,59a2ed4788b03213aca91a2d,Adult,nakedsportsmen,http://www.nakedsportsmen.com/,0
2,59a2ed4c88b03213aca9294e,Adult,daddyswap,http://www.daddyswap.com/,0
3,59a2ed6588b03213aca96b47,Adult,sexpalastgelsenkirchen,http://www.sexpalast-gelsenkirchen.de/,0
4,59a2ed6e88b03213aca9848b,Adult,geocitiesruoskanet,http://www.geocities.com/ruoskanet/,0


### Vetorização e frequência dos termos

In [6]:
# Model input is the normalized hostname text; the target is the category id.
X, Y = df_hostnames['normalized_url'], df_hostnames['cat_id']

In [7]:
# Character-level bag of bigrams: each pair of adjacent characters seen in the
# hostnames becomes one vocabulary entry. `fit` returns the fitted vectorizer.
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2)).fit(X)

# Size of the learned bigram vocabulary.
len(count_vectorizer.vocabulary_)

669

In [8]:
# Encode every hostname as its vector of character-bigram counts.
words_vector = count_vectorizer.transform(X)

In [9]:
# Re-weight the raw bigram counts with TF-IDF, then convert the result to a
# dense array so it can be reshaped and fed to the network later on.
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(words_vector)
urls_tf = tf_transformer.transform(words_vector).toarray()

In [10]:
# Sanity check: dimensions of the TF-IDF matrix, then the row of the first hostname.
print(urls_tf.shape)
urls_tf[0]

(15000, 669)


array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [11]:
# List the non-zero TF-IDF weights of the first hostname next to the bigram
# each weight belongs to.
# The feature-name lookup rebuilds the whole vocabulary list, so it is done
# once here instead of once per loop iteration. Recent scikit-learn releases
# expose it as `get_feature_names_out`; older ones only have `get_feature_names`.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

first_row = urls_tf[0]
for i in first_row.nonzero()[0]:
    print('pos: ', i)
    print(first_row[i])
    print('feature: ', feature_names[i])

pos:  108
0.246577972182
feature:  ee
pos:  112
0.615883247282
feature:  ei
pos:  117
0.20076543503
feature:  en
pos:  185
0.223115937393
feature:  he
pos:  225
0.466021155975
feature:  is
pos:  465
0.442199990089
feature:  se
pos:  479
0.238920695794
feature:  ss


### Separação do conjunto

In [12]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
url_train, url_test, label_train, label_test = train_test_split(
    urls_tf,
    Y,
    test_size=0.2,
    random_state=random_state,
)

In [13]:
# Show the shapes of the training features and training labels, in that order.
for split in (url_train, label_train):
    print(split.shape)

(12000, 669)
(12000,)


In [14]:
# Inspect the training labels (category ids, indexed by original row position).
print(label_train)

12427    12
10988    10
13458    13
2182      2
1019      1
1134      1
7941      7
9275      9
9172      9
9792      9
4680      4
2503      2
10958    10
6873      6
11736    11
5017      5
4613      4
1210      1
3321      3
11966    11
3485      3
5704      5
9266      9
14290    14
7096      7
11532    11
4855      4
5653      5
12595    12
6255      6
         ..
14409    14
10800    10
7733      7
14657    14
9961      9
12786    12
6825      6
1608      1
7092      7
4475      4
14423    14
6209      6
10731    10
14706    14
13018    13
562       0
13111    13
3185      3
2733      2
10311    10
8967      8
7227      7
3095      3
2896      2
8883      8
6728      6
11528    11
14663    14
1926      1
5255      5
Name: cat_id, dtype: int64


### Classificação e teste

In [15]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


In [16]:
batch_size = 32

# The LSTM layer takes 3-D input laid out as [samples, time steps, features].
# Each hostname is a single time step, so a length-1 axis is inserted between
# the sample axis and the feature axis.
trainX = url_train[:, np.newaxis, :]
testX = url_test[:, np.newaxis, :]

In [17]:
# One LSTM layer (128 units) over the single-time-step input, followed by a
# single-unit dense output.
# The input shape is read from the prepared tensor instead of being hard-coded
# to 669, so the model still builds when the bigram vocabulary size changes.
# NOTE(review): a single output unit makes the network regress the integer
# category id; a softmax over the categories may be the intended design.
# Confirm before changing, since the later cells score the model with MSE/RMSE.
n_timesteps, n_features = trainX.shape[1:]

model = Sequential()
model.add(LSTM(128, input_shape=(n_timesteps, n_features)))
model.add(Dense(1))

In [None]:
# Optimise squared error against the integer category id with Adam, and track
# the accuracy metric alongside the loss.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 30 passes over the training set.
# `nb_epoch` is the Keras 1 spelling; Keras 2 renamed the argument to `epochs`.
# `verbose` is documented for 0, 1 and 2 only; 2 prints one line per epoch
# including the loss and metric values, which `verbose=3` left out.
model.fit(trainX, label_train, epochs=30, batch_size=batch_size, verbose=2)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


In [None]:
# Evaluate on the held-out split; the result unpacks into the loss value and
# the accuracy metric requested at compile time.
evaluation = model.evaluate(testX, label_test, batch_size=batch_size)
score, acc = evaluation

for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

In [None]:
import math

# Predict on the held-out set and report the root mean squared error between
# the predicted values and the true category ids.
predictions = model.predict(testX)
mse = mean_squared_error(label_test, predictions)
testScore = math.sqrt(mse)
print('Test Score: %.2f RMSE' % (testScore))

#print(classification_report(label_test, predictions))

In [None]:
# Preview only the first few predictions: printing one line per test sample
# floods the notebook output and buries the results above it.
preview_rows = 10
for p in predictions[:preview_rows]:
    print(p)