In [1]:
from __future__ import print_function

import nltk
import keras
import pandas as pd
import numpy as np

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, Conv2D, GlobalMaxPooling2D
from keras.datasets import imdb
from keras.datasets import mnist
from keras.optimizers import RMSprop
from keras.datasets import reuters

Using Theano backend.


In [22]:
raw_data = pd.read_csv('C:\Users\pedro.castanha\Downloads\ML_Gabinete_Digital.csv',
                       error_bad_lines=False,
                       sep='\t',
                       encoding='utf_8')

stop = set(stopwords.words('portuguese'))

x_vectorized = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stop)
x_vectorized.fit(raw_data.Text)
x_train = x_vectorized.transform(raw_data.Text)
y_train = raw_data.Class

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [24]:
xx = sequence.pad_sequences(x_train.toarray(), maxlen=maxlen)
xx_test = sequence.pad_sequences(x_test.toarray(), maxlen=maxlen)

xx.shape

(504L, 400L)

In [3]:
# set parameters:
max_features = 10000
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Building model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=2,
                    validation_split=0.1)
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

print('Model complete...')

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000L, 400L)
x_test shape: (25000L, 400L)
Building model...
Train on 22500 samples, validate on 2500 samples
Epoch 1/2
106s - loss: 0.4039 - acc: 0.8011 - val_loss: 0.2970 - val_acc: 0.8800
Epoch 2/2
112s - loss: 0.2238 - acc: 0.9103 - val_loss: 0.3116 - val_acc: 0.8728
Test score: 0.314095856614
Test accuracy: 0.87216
Model complete...


In [2]:
max_words = 1000
batch_size = 32
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

print('Building model...')
model = Sequential()
model.add(Embedding(1000, 100))
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))
#model.add(Dense(512, input_shape=(max_words,)))
#model.add(Activation('relu'))
#model.add(Dropout(0.5))
model.add(Conv1D(filters=100,kernel_size=3, strides=1, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=2,
                    validation_split=0.1)
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Loading data...
8982 train sequences
2246 test sequences
46 classes
Vectorizing sequence data...
x_train shape: (8982L, 1000L)
x_test shape: (2246L, 1000L)
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (8982L, 46L)
y_test shape: (2246L, 46L)
Building model...
Train on 8083 samples, validate on 899 samples
Epoch 1/5
175s - loss: 2.4690 - acc: 0.3543 - val_loss: 2.4636 - val_acc: 0.3382
Epoch 2/5
171s - loss: 2.3297 - acc: 0.3653 - val_loss: 2.4191 - val_acc: 0.3315
Epoch 3/5
174s - loss: 2.3124 - acc: 0.3759 - val_loss: 2.4274 - val_acc: 0.3315
Epoch 4/5
172s - loss: 2.2985 - acc: 0.3840 - val_loss: 2.4362 - val_acc: 0.3315
Epoch 5/5
179s - loss: 2.2922 - acc: 0.3914 - val_loss: 2.4636 - val_acc: 0.3315
Test score: 2.41729351483
Test accuracy: 0.361976847782
