In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tqdm import tqdm
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
data_a = pd.read_csv('db/dataset_entidades_actor_2.csv')
data_c = pd.read_csv('db/dataset_entidades_company_2.csv')
data_m = pd.read_csv('db/dataset_entidades_musical_artist_2.csv')
data_p = pd.read_csv('db/dataset_entidades_politican_2.csv')
data_s = pd.read_csv('db/dataset_entidades_soccer_player_2.csv')

In [3]:
all_data = pd.concat([data_a,data_c,data_m,data_p,data_s])

In [4]:
all_data.head()

Unnamed: 0.1,Unnamed: 0,index,0,1,2
0,0,Anthony_Eustrel,"['Person', 'Actor', 'Agent', 'Artist']",Actor,"Anthony Eustrel (October 12, 1902-July 2, 1979..."
1,1,Billy_Dee,"['Person', 'Actor', 'AdultActor', 'Agent', 'Ar...",AdultActor,Not to be confused with Billy Dee Williams or ...
2,2,Sid_Lucero,"['Person', 'Actor', 'Agent', 'Artist']",Actor,"Timothy Mark Pimentel Eigenmann, better known ..."
3,3,Jacques_Balutin,"['Person', 'Actor', 'Agent', 'Artist']",Actor,Jacques Balutin is a French actor.
4,4,Austin_Leigh,"['Person', 'Actor', 'Agent', 'Artist']",Actor,Austin Leigh was a British stage and film actor.


In [5]:
group_data = all_data.groupby("1")

In [6]:
## Delete classes with few examples (<300)
for item in tqdm(group_data):
    target = item[0]
    if len(all_data[all_data['1']==target])<300:
        all_data.drop(all_data[all_data['1']==target].index,inplace=True )


100%|██████████| 116/116 [00:01<00:00, 97.06it/s] 


In [7]:
def text_word_sequence(texts):
    abstract_sentences = list()
    for text in tqdm(texts):
        abstract_sentences.append(text_to_word_sequence(str(text)))
    return abstract_sentences

In [8]:
text = all_data.iloc[:,4].values

In [9]:
labels = all_data.iloc[:,3].values

In [10]:
np.unique(labels), labels.shape

(array(['Actor', 'AdultActor', 'Company', 'Congressman', 'Governor',
        'Mayor', 'MemberOfParliament', 'MusicalArtist', 'Politician',
        'President', 'PrimeMinister', 'SoccerManager', 'SoccerPlayer',
        'VoiceActor'], dtype=object), (29033,))

In [11]:
abstract_sentences = text_word_sequence(text)

100%|██████████| 29033/29033 [00:00<00:00, 29899.38it/s]


In [12]:
def tokenize_text(examples):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(examples)
    sequences = tokenizer.texts_to_sequences(examples)
    return (sequences, tokenizer)
sequences, tokenizer = tokenize_text(abstract_sentences)

In [13]:
abstract_sentences_number = sequences

In [14]:
from gensim.models import word2vec

In [15]:
model = word2vec.Word2Vec(size=200, sg=1, workers=10)

model.build_vocab(sentences=abstract_sentences)

model.train(sentences=abstract_sentences, epochs=20, total_examples=len(abstract_sentences))

(16373237, 23699860)

In [16]:
model.save("word2vec_dbpedia_size=200.model")

In [17]:
quantity_of_embeddings = len(model.wv.vocab)
embedding_matrix = np.zeros((len(tokenizer.word_index), 200))
for word, i in tqdm(tokenizer.word_index.items()):
    if word in model and i < quantity_of_embeddings:
        embedding_matrix[i] = model[word]

  after removing the cwd from sys.path.
  """
100%|██████████| 83123/83123 [00:00<00:00, 231038.37it/s]


In [18]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = le.fit_transform(labels)
y = to_categorical(y)

In [19]:
from sklearn.model_selection import train_test_split

sequences_pad = pad_sequences(abstract_sentences_number,maxlen=100)

x_train,x_test,y_train,y_test = train_test_split(sequences_pad,y,test_size=0.10, random_state=42)

x_train,x_val,y_train,y_val = train_test_split(x_train,y_train, test_size=0.2,random_state=42)

In [20]:
from keras.layers import Embedding,Input, Conv1D, MaxPooling1D, Dense,merge,Flatten, Dropout, GlobalMaxPool1D, Concatenate
from keras.initializers import random_uniform
from keras.layers.merge import concatenate
from keras.regularizers import l2
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint

In [21]:
#Constants
channels = 100
size_feature_map = 3
size_feature_map_2 = 4
size_feature_map_3 = 5
dropout = 0.25
dense1 = 50
dense2 = 100
embedding_size=100

In [None]:
embedding_layer = Embedding(len(tokenizer.word_index),
                            200,
                            weights=[embedding_matrix],
                            trainable=True)

sequence_input = Input(shape=(embedding_size,), dtype='int32', name='sequence_input')

embedded_sequences = embedding_layer(sequence_input)
embedded_sequences = Dropout(dropout) (embedded_sequences)
x = Conv1D(channels, size_feature_map, activation='relu',padding='SAME')(embedded_sequences)
x = Conv1D(channels, size_feature_map_2,activation='relu',padding='SAME')(x)
x = Dropout(dropout)(x)
x = GlobalMaxPool1D()(x)
# x = Flatten()(x)

fc1 = Dense(dense2, activation='relu', name='fc1')(x)
fc1 = Dropout(dropout)(fc1)
fc2 = Dense(dense1, activation='relu', name='fc2')(fc1)
preds = Dense(14, activation='softmax')(fc2)
model_cnn = Model(sequence_input, preds)
model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model_cnn.summary()
checkpoint = ModelCheckpoint('weights-best-model1-dbpedia.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
model_cnn.fit(x_train, y_train, batch_size=64, epochs=50,validation_data=(x_val,y_val),verbose=1,callbacks=callbacks_list)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequence_input (InputLayer)  (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 200)          16624600  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 100)          60100     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 100)          40100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
__________

In [None]:
model_cnn.load_weights('weights-best-model3.hdf5')
model_cnn.evaluate([x_test,x_test_rdf], y_test)

In [None]:
predicted = model.predict(x_test, verbose=1)