In [2]:
import numpy as np
import pandas as pd
import pickle
import keras.backend as K
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.optimizers import Adam, SGD

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Parameters

In [15]:
# --- Text encoding settings ---
char_model = False  # forwarded to Tokenizer(char_level=...)
vocab_size = 2000   # Tokenizer num_words and the Embedding layer's input dimension
max_length = 35     # length every sequence is padded/truncated to; also the model input length

# --- Training settings ---
num_epochs = 5      # number of epochs given to model.fit
batch_size = 128    # mini-batch size; NOTE(review): confirm the model.fit calls receive it

# General Model

### Read data, get text list

In [21]:
# Load the general corpus. Fields are split on the literal three-character
# delimiter "|||"; pandas treats a multi-character separator as a regular
# expression, hence the escaped pipes and the python parsing engine. The raw
# string stops Python from reading "\|" as an (invalid) string escape.
# NOTE(review): read_csv consumes the first line as a header before the columns
# are renamed below -- confirm the file really starts with a header row.
df = pd.read_csv('general.csv', sep=r"\|\|\|", engine='python')
df.columns = ['Text', 'Feedback']

# Plain Python list of the raw texts, used to fit the tokenizer.
text_list = df['Text'].tolist()

### Create tokenizer

In [22]:
# Fit a tokenizer on the corpus; num_words limits the vocabulary and
# char_level switches between word-level and character-level tokens.
tokenizer = Tokenizer(num_words=vocab_size, char_level=char_model)
tokenizer.fit_on_texts(text_list)

# Persist the fitted tokenizer. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('general_tokenizer.p', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

### Get training data

In [23]:
# Integer-encode every text with the fitted tokenizer.
text_matrix = tokenizer.texts_to_sequences(text_list)

# Targets are taken directly from the 'Feedback' column.
Y = np.array(df['Feedback'])

# Bring every sequence to exactly max_length entries: shorter ones are
# padded at the front, longer ones have their tail cut off.
X = sequence.pad_sequences(
    text_matrix,
    maxlen=max_length,
    padding='pre',
    truncating='post',
)

### Create model

In [24]:
# Binary classifier: embedding -> dropout -> LSTM -> dropout -> one sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 128, input_shape=(max_length,)),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output, so train with binary cross-entropy and report binary accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['binary_accuracy'],
)

### Train model

In [25]:
# Train on the padded sequences. batch_size is passed explicitly so the value
# from the Parameters cell is honoured; when omitted, fit() falls back to the
# Keras default of 32 and the configured setting is silently ignored.
model.fit(X, Y, epochs=num_epochs, batch_size=batch_size)

# Save the trained model to disk in HDF5 format.
model.save('general_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# General + Reviews Model

### Read data, get text list

In [26]:
# Load the general+reviews corpus. Fields are split on the literal
# three-character delimiter "|||"; pandas treats a multi-character separator as
# a regular expression, hence the escaped pipes and the python parsing engine.
# The raw string stops Python from reading "\|" as an (invalid) string escape.
# NOTE(review): read_csv consumes the first line as a header before the columns
# are renamed below -- confirm the file really starts with a header row.
df = pd.read_csv('general+reviews.csv', sep=r"\|\|\|", engine='python')
df.columns = ['Text', 'Feedback']

# Plain Python list of the raw texts, used to fit the tokenizer.
text_list = df['Text'].tolist()

### Create tokenizer

In [27]:
# Fit a tokenizer on the corpus; num_words limits the vocabulary and
# char_level switches between word-level and character-level tokens.
tokenizer = Tokenizer(num_words=vocab_size, char_level=char_model)
tokenizer.fit_on_texts(text_list)

# Persist the fitted tokenizer. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('general+reviews_tokenizer.p', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

### Get training data

In [28]:
# Integer-encode every text with the fitted tokenizer.
text_matrix = tokenizer.texts_to_sequences(text_list)

# Targets are taken directly from the 'Feedback' column.
Y = np.array(df['Feedback'])

# Bring every sequence to exactly max_length entries: shorter ones are
# padded at the front, longer ones have their tail cut off.
X = sequence.pad_sequences(
    text_matrix,
    maxlen=max_length,
    padding='pre',
    truncating='post',
)

### Create model

In [29]:
# Binary classifier: embedding -> dropout -> LSTM -> dropout -> one sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 128, input_shape=(max_length,)),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output, so train with binary cross-entropy and report binary accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['binary_accuracy'],
)

### Train model

In [30]:
# Train on the padded sequences. batch_size is passed explicitly so the value
# from the Parameters cell is honoured; when omitted, fit() falls back to the
# Keras default of 32 and the configured setting is silently ignored.
model.fit(X, Y, epochs=num_epochs, batch_size=batch_size)

# Save the trained model to disk in HDF5 format.
model.save('general+reviews_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Amazon Model

### Read data, get text list

In [4]:
# Load the amazon corpus. Fields are split on the literal three-character
# delimiter "|||"; pandas treats a multi-character separator as a regular
# expression, hence the escaped pipes and the python parsing engine. The raw
# string stops Python from reading "\|" as an (invalid) string escape.
# NOTE(review): read_csv consumes the first line as a header before the columns
# are renamed below -- confirm the file really starts with a header row.
df = pd.read_csv('amazon.csv', sep=r"\|\|\|", engine='python')
df.columns = ['Text', 'Feedback']

# Plain Python list of the raw texts, used to fit the tokenizer.
text_list = df['Text'].tolist()

Unnamed: 0,Text,Feedback
0,Start your mornings with @ikemorgan 'Down in A...,0
1,Please be specific in advertising for VIRAL LA...,0
2,"If you're worried about data privacy, maybe st...",1
3,@MomoPicks It looks like your original message...,0
4,@MomoPicks It looks like your original message...,0


### Create tokenizer

In [32]:
# Fit a tokenizer on the corpus; num_words limits the vocabulary and
# char_level switches between word-level and character-level tokens.
tokenizer = Tokenizer(num_words=vocab_size, char_level=char_model)
tokenizer.fit_on_texts(text_list)

# Persist the fitted tokenizer. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('amazon_tokenizer.p', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

### Get training data

In [33]:
# Integer-encode every text with the fitted tokenizer.
text_matrix = tokenizer.texts_to_sequences(text_list)

# Targets are taken directly from the 'Feedback' column.
Y = np.array(df['Feedback'])

# Bring every sequence to exactly max_length entries: shorter ones are
# padded at the front, longer ones have their tail cut off.
X = sequence.pad_sequences(
    text_matrix,
    maxlen=max_length,
    padding='pre',
    truncating='post',
)

### Create model

In [34]:
# Binary classifier: embedding -> dropout -> LSTM -> dropout -> one sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 128, input_shape=(max_length,)),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output, so train with binary cross-entropy and report binary accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['binary_accuracy'],
)

### Train model

In [35]:
# Train on the padded sequences. batch_size is passed explicitly so the value
# from the Parameters cell is honoured; when omitted, fit() falls back to the
# Keras default of 32 and the configured setting is silently ignored.
model.fit(X, Y, epochs=num_epochs, batch_size=batch_size)

# Save the trained model to disk in HDF5 format.
model.save('amazon_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
