In [7]:
import keras
import pandas
import numpy
import sklearn.preprocessing
import matplotlib.pyplot
import seaborn

In [8]:
VOCABULARY_SIZE = 50000
MAX_NUM_WORDS = 100 # max number of words to use to represent each review - reviews will be shaped to have this as their max length
EMBEDDING_SIZE = 300
BATCH_SIZE = 10000
NUM_EPOCHS = 10

In [9]:
train = pandas.read_csv('/Users/laurapallett/data/niclas_thomas/drug_condition/drugsComTrain_raw.csv')[['condition', 'review']]
train = train[train['condition'].isin(['Acne','Anxiety','Birth Control','Depression','Pain'])].reset_index(drop=True)
train.head()

Unnamed: 0,condition,review
0,Birth Control,"""I used to take another oral contraceptive, wh..."
1,Birth Control,"""This is my first time using any form of birth..."
2,Birth Control,"""I had been on the pill for many years. When m..."
3,Depression,"""I have taken anti-depressants for years, with..."
4,Birth Control,"""Started Nexplanon 2 months ago because I have..."


In [10]:
test = pandas.read_csv('/Users/laurapallett/data/niclas_thomas/drug_condition/drugsComTest_raw.csv')[['condition', 'review']]
test = test[test['condition'].isin(['Acne','Anxiety','Birth Control','Depression','Pain'])].reset_index(drop=True)
test.head()

Unnamed: 0,condition,review
0,Depression,"""I&#039;ve tried a few antidepressants over th..."
1,Birth Control,"""I have been on this birth control for one cyc..."
2,Birth Control,"""I&#039;ve had the copper coil for about 3 mon..."
3,Birth Control,"""I was on this pill for almost two years. It d..."
4,Acne,"""So I was on Ginanvi for about 3 months before..."


## Process Raw Text

In [11]:
train_reviews = train['review'].tolist()
train_conditions = train['condition'].tolist()

test_reviews = test['review'].tolist()
test_conditions = test['condition'].tolist()

In [12]:
# processes target
# converts string categories to integers encoding the categories

encoder = sklearn.preprocessing.LabelEncoder()
encoder.fit(train_conditions)
y_train = encoder.transform(train_conditions)
y_test = encoder.transform(test_conditions)

y_train 

array([2, 2, 2, ..., 3, 1, 2])

In [13]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(train_reviews)
train_sequences = tokenizer.texts_to_sequences(train_reviews)
test_sequences = tokenizer.texts_to_sequences(test_reviews)
word_index = tokenizer.word_index

In [14]:
X_train = keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_NUM_WORDS)
X_test = keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=MAX_NUM_WORDS)

X_train[0]

array([ 257,    7, 4980,   68,    3, 2781,   84, 1073,   66,    5,  115,
        176,  579,    1,   54, 4980,  467,   13,    5,   39,   49,   12,
         38,   34,    3, 2547,  291,    2,    3,   38,  364,    9,  113,
         78,   66,   47,    3,  280,  374,  224,  113,   78,    2,   32,
         22,  533,  374,  256,   73,  128,  188,    5,  533,   38,  364,
          9,  113,   78,    2,   32,    4,    6,   35,    3,  428,   12,
          3,  533,   82,    1,  114,   10,  325, 1077,  769,   94,    3,
        442,   43,   20,   18,    1,  116,    6,   21,   10,   92,  115,
         43,   58,    3,  936,   12,  172,   38,  422,   11,   25, 7115,
       6836], dtype=int32)

In [15]:
X_test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    1,    6,   28,  149,
          8,  118,  779,  110,    3,   46, 1315, 2174, 2554,   15,  689,
         12,  355,  211,   22,    5,   97,  540,  183,   75,    5,   98,
        738,    2,  346,   19, 1789, 3858, 1646,    2,   14,  167,   40,
        746,    5,  105, 1591,    1,   10,   16,   26,   43,   58,  594,
          3,  217, 1254,   69,  129,    1,    6,   28,  303,  265, 1342,
         12,   69,    1,  114,   10,  601,  436,   15, 1646,   40,  746,
         19], dtype=int32)

## Build & Run Model

In [16]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=VOCABULARY_SIZE, output_dim=EMBEDDING_SIZE, input_length=MAX_NUM_WORDS, name='embedding'))
model.add(keras.layers.LSTM(100, name='lstm'))
model.add(keras.layers.Dense(1, activation='sigmoid', name='output'))

In [17]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          15000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 15,160,501
Trainable params: 15,160,501
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [19]:
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [None]:
model.predict(X_test[0:10])

In [None]:
model.evaluate(X_text, y_test)

In [None]:
model.metrics_names