In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Input, Embedding, Activation, Flatten, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.models import Model

In [2]:
#dataset
PATH_X_TRAIN='./data/x_train.txt'
PATH_Y_TRAIN='./data/y_train.txt'
PATH_X_TEST='./data/x_test.txt'
PATH_Y_TEST='./data/y_test.txt'

#model 
INPUT_SIZE=500

In [3]:
#read dataset
def read_txt(path):
    handle = open(path, "r", encoding='utf-8')
    df = pd.DataFrame(handle.readlines())
    handle.close()
    return df

x_train=read_txt(PATH_X_TRAIN)
y_train=read_txt(PATH_Y_TRAIN)
x_test=read_txt(PATH_X_TEST)
y_test=read_txt(PATH_Y_TEST)

print(x_train)

                                                        0
0       Klement Gottwaldi surnukeha palsameeriti ning ...
1       Sebes, Joseph; Pereira Thomas (1961) (på eng)....
2       भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...
3       Après lo cort periòde d'establiment a Basilèa,...
4       ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...
...                                                   ...
117495  Nekoliko prašćića je rođeno na farmi Arableovi...
117496  Tahiti of Otaheite is 'n eilandj in 't zuje va...
117497  同年，太后崩。絳侯周勃、陳平諸臣共謀誅呂。朱虛侯章已殺呂產，文帝使人持節勞章。朱虛侯欲奪節信...
117498  I det mindste opnåede Venedig den 18. april 14...
117499  KR tók fljótlega að sér að bjóða uppá æfingar ...

[117500 rows x 1 columns]


In [4]:
def letters(text):
    new_text=''
    for t in text:
        if t.isalpha():
            new_text+=t
    return new_text

texts_train=x_train.values
texts_test=x_test.values
texts_train=[s[0].replace('\n','').lower() for s in texts_train]
texts_test=[s[0].replace('\n','').lower() for s in texts_test]

for i in range(len(texts_train)):
    texts_train[i]=letters(texts_train[i])
for i in range(len(texts_test)):
    texts_test[i]=letters(texts_test[i])
    
tokenizer=Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tokenizer.fit_on_texts(texts_train)

In [5]:
#texts preprocessing
'''
texts_train=x_train.values
texts_test=x_test.values
texts_train=[s[0].replace('\n','').lower() for s in texts_train]
texts_test=[s[0].replace('\n','').lower() for s in texts_test]
tokenizer=Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tokenizer.fit_on_texts(texts_train)
'''
#sequences
texts_train=tokenizer.texts_to_sequences(texts_train)
texts_test=tokenizer.texts_to_sequences(texts_test)

texts_train=pad_sequences(texts_train, maxlen=INPUT_SIZE, padding='post')
texts_train=np.array(texts_train)
texts_test=pad_sequences(texts_test, maxlen=INPUT_SIZE, padding='post')
texts_test=np.array(texts_test)

In [6]:
#classes preprocessing
classes_train=y_train.values
classes_test=y_test.values

#dictionary for languages
classes = np.unique(np.array(classes_train))
nums=np.arange(len(classes))
d = dict(zip(classes,nums))

classes_train=[d[c[0]] for c in classes_train] 
classes_test=[d[c[0]] for c in classes_test] 
classes_train=to_categorical(classes_train)
classes_test=to_categorical(classes_test)

In [7]:
voc_size=len(tokenizer.word_index)
voc_size

9627

In [8]:
#embeddings
embeddings_weights=[]
embeddings_weights.append(np.zeros(voc_size))


for char, i in tokenizer.word_index.items():
    onehot=np.zeros(voc_size)
    onehot[i-1]=1
    embeddings_weights.append(onehot)
embeddings_weights=np.array(embeddings_weights)

In [9]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=5000)
#principalComponents = pca.fit_transform(embeddings_weights)
#principalDf = pd.DataFrame(principalComponents)
#print(principalDf)

In [10]:
#data=principalDf.values
#data.shape

In [11]:
import scipy
embeddings_weights=scipy.sparse.csr_matrix(embeddings_weights)

In [12]:
row_nnz = np.diff(embeddings_weights.indptr)
indices = np.asarray([[row_i, col_i]
                         for row_i, nnz in enumerate(row_nnz)
                         for col_i in range(nnz)], dtype='int64')

In [13]:
sp_weights = tf.SparseTensor(indices, embeddings_weights.data, embeddings_weights.shape)

In [31]:
#embedding_layer=Embedding(voc_size+1,
                         #5000,
                         #input_length=500,
                         #weights=[data])
            

In [48]:
embeddings_weights.data.shape

(9627,)

In [20]:
embedding_layer=Embedding(voc_size+1,
                         voc_size,
                         input_length=INPUT_SIZE,
                         weights=[sp_weights]
                         )

In [21]:
conv_layers=[[256, 7, 3],
            [256, 7, 3],
            [256, 7, 3],
            [256, 3, -1],
            [256, 3, -1],
            [256, 3, -1],
            [256, 3, 3]]


#conv_layers=[[256, 7, 3]]

fully_connected_layers=[1024, 1024]
#fully_connected_layers=[512]   
num_of_classes=len(classes)
dropout=0.5
optimizer='adam'
loss='categorical_crossentropy'

In [22]:
embeddings_weights

<9628x9627 sparse matrix of type '<class 'numpy.float64'>'
	with 9627 stored elements in Compressed Sparse Row format>

In [23]:
inputs = Input(shape = (INPUT_SIZE,), name = 'input', dtype = 'int64', sparse=False)

In [17]:
x=embedding_layer(inputs)

Instructions for updating:
Colocations handled automatically by placer.


TypeError: float() argument must be a string or a number, not 'SparseTensor'

In [32]:
for filter_num, filter_size, pooling_size in conv_layers:
    x=Conv1D(filter_num, filter_size)(x)
    x=Activation('tanh')(x)
    if pooling_size!=-1:
        x=MaxPooling1D(pool_size=pooling_size)(x)
x=Flatten()(x)
for dense_size in fully_connected_layers:
    x=Dense(dense_size, activation='softmax')(x)
    x=Dropout(dropout)(x)
predictions=Dense(num_of_classes, activation='softmax')(x)
model=Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

ValueError: setting an array element with a sequence.

In [19]:
embeddings_weights.shape

(9628, 9627)

In [None]:
model.fit(texts_train, classes_train, 
        validation_data = (texts_test, classes_test),
         batch_size=128,
         epochs=10,
         verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 117500 samples, validate on 117500 samples
Epoch 1/10
