In [1]:
import csv

import dask.dataframe as dd
import pandas as pd
import numpy as np
np.random.seed(0)

import matplotlib.pyplot as plt
%matplotlib inline

import keras
from keras.layers import GRU, Dense, GlobalMaxPool1D, Embedding, Dropout
from keras.preprocessing import text, sequence
from keras.models import Sequential

from math import ceil

Using TensorFlow backend.


In [2]:
# https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
import sys, csv

# Raise the csv field size limit as far as this platform allows: start from the
# largest native integer and shrink it until csv.field_size_limit() accepts it.
maxInt = sys.maxsize
decrement = True

while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # the candidate was rejected: retry with a tenth of it
        maxInt = int(maxInt/10)
        decrement = True
    else:
        # the limit was accepted, so the loop can stop
        decrement = False

In [3]:
class Generator:
    """Endless stream of single-row ``(X, y)`` batches read from a CSV file.

    The file is read as: one header line, then rows whose first column is the
    text and whose second column is an integer class id.

    This is a data source, so it deliberately does not derive from any Keras
    model class. It is a plain Python iterator: ``next(generator)`` returns the
    next batch, and ``iter(generator)`` returns a fresh, independent stream that
    restarts at the top of the file.

    Args:
        data_file: path of the CSV file to stream.
        tokenizer: optional, already fitted ``keras.preprocessing.text.Tokenizer``.
            When omitted, one is fitted once on the text column of ``data_file``
            the first time data is requested. Pass the same fitted tokenizer to
            every generator (e.g. training and validation) so that a word maps
            to the same id everywhere.
        num_words: vocabulary size used when the tokenizer is created here.
        maxlen: length every token sequence is padded / truncated to.
    """

    def __init__(self, data_file, tokenizer=None, num_words=20000, maxlen=100):
        self.data_file = data_file
        # we initialize the number of rows with -1
        self.length = -1
        self.tokenizer = tokenizer
        self.num_words = num_words
        self.maxlen = maxlen
        # stream backing __next__, created on first use
        self._stream = None

    def _rows(self):
        """Yield each non-empty data row of the CSV file once, header excluded."""
        # newline='' lets the csv module deal with line breaks inside quoted fields
        with open(self.data_file, 'r', newline='') as f:
            reader = csv.reader(f)
            # skip first line (header); the default keeps an empty file from
            # raising StopIteration inside this generator
            next(reader, None)
            for row in reader:
                if row:  # csv.reader returns [] for blank lines
                    yield row

    def _fitted_tokenizer(self):
        """Return the tokenizer, fitting one on this file's text column if none was given."""
        if self.tokenizer is None:
            tokenizer = text.Tokenizer(num_words=self.num_words)
            # one pass over the whole file so that word ids are stable across rows
            tokenizer.fit_on_texts(row[0] for row in self._rows())
            self.tokenizer = tokenizer
        return self.tokenizer

    def __iter__(self):
        """Yield ``(X, y)`` for every data row, looping over the file forever.

        ``X`` is an int array of shape ``(1, maxlen)`` holding the padded token
        ids of the text; ``y`` is an int32 array of shape ``(1,)`` holding the
        class id.

        Raises:
            ValueError: if the file contains no data rows (otherwise the
                endless loop would spin without ever producing a batch).
        """
        tokenizer = self._fitted_tokenizer()
        while True:
            produced = False
            for row in self._rows():
                # content: wrap the text in a list so it is treated as ONE
                # document instead of being split into characters
                token_ids = tokenizer.texts_to_sequences([row[0]])
                X = sequence.pad_sequences(token_ids, maxlen=self.maxlen)
                # category
                y = np.array([int(row[1])], dtype='int32')

                produced = True
                yield (X, y)
            if not produced:
                raise ValueError('no data rows found in %r' % (self.data_file,))

    def __next__(self):
        """Return the next ``(X, y)`` batch of this generator's own endless stream."""
        if self._stream is None:
            self._stream = iter(self)
        return next(self._stream)

    # we also need to override __len__ so Keras can get the number of rows in the file
    def __len__(self):
        """Return the number of data rows, i.e. the rows one pass of ``__iter__`` yields."""
        # we only compute this once
        if self.length == -1:
            self.length = sum(1 for _ in self._rows())
        return self.length

In [None]:
# Build one generator per CSV file; each streams (X, y) pairs to Keras
train_generator = Generator('training_data.csv')
valid_generator = Generator('validation_data.csv')

# Text classifier: embedding -> GRU -> max over time -> dense head with 8 output classes
model = Sequential([
    Embedding(20000, 128),
    GRU(50, return_sequences=True),
    GlobalMaxPool1D(),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='softmax')
])

# compile the model
# The generators yield integer class ids (not one-hot vectors), so the sparse
# variant of the cross-entropy loss is the one that matches the targets.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# let's say our batch size is 100 rows
batch_size = 100

# the number of batches per epoch is then
# NOTE(review): Generator yields one item per CSV row and every training step
# consumes one item, so this step count covers roughly 1/batch_size of the file
# per epoch -- confirm whether that is intended.
batches_per_epoch = ceil(len(train_generator) / batch_size)

# and we train our Keras model using the fit_generator function
result = model.fit_generator(train_generator, steps_per_epoch=batches_per_epoch, epochs=3,
                             validation_data=valid_generator, validation_steps=5)

Epoch 1/3
