In [1]:
# define args
# run_env selects the execution mode used further down: 'windows' subsamples the
# data and drops rare classes; 'colaboratory' writes/reads a cleansed-data cache.
# NOTE(review): the google.colab import below runs regardless of run_env.
run_env = 'windows'
# strip @username mentions inside text_cleanse
drop_user = True
# collapse anything matching the http pattern to the literal token 'http'
# (only applied through word_cleanse, i.e. when apply_porter_stemmer is True)
simplify_http = True
# drop nltk English stop words and words of length <= 2 or >= 15
remove_stop_words = False
# run word_cleanse (punctuation removal + Porter stemming) on every word
apply_porter_stemmer = True
# padding mode handed to keras pad_sequences
padding = 'post'
# verbosity handed to model.fit
verbose = 1
# target sample size; in 'windows' mode it is enlarged by two hold-out sets
# before sampling
n_samples = 150000

# install gensim into the runtime (unpinned; the captured output shows 3.6.0)
!pip install gensim
# mount Google Drive so the data directory below is reachable
from google.colab import drive
drive.mount('/content/drive')
wrk_dr = '/content/drive/My Drive/Colab Notebooks/'

# all inputs and outputs (csv, model, figures) live under this directory
data_dir = wrk_dr + 'data/'

import ast
import datetime
from gensim.models import Word2Vec
import io
from keras import backend as K
from keras import regularizers
from keras.layers import LSTM, GRU, GRUCell, Dense, Flatten, TimeDistributed, Dropout
from keras.layers.embeddings import Embedding
from keras.models import load_model, Model
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import porter
#from nltk.tokenize import word_tokenize
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import string

# read the raw tweet archive, keeping only the author and the tweet text
start = datetime.datetime.now()
columns_kept = ['author_id', 'text']
twcs = pd.read_csv(data_dir + 'twcs.zip')[columns_kept]
elapsed = datetime.datetime.now() - start
print('upload time: ' + str(elapsed))


Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
[K    100% |████████████████████████████████| 23.6MB 1.5MB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 15.5MB/s 
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downlo

Using TensorFlow backend.


upload time: 0:00:19.432691


In [3]:

# setup train and test size
if run_env == 'windows':
    val_and_test_prop = 0.07
    val_and_test_size = int(n_samples * val_and_test_prop)
    # NOTE: n_samples is grown in place to make room for the validation and test
    # hold-outs, so re-running this cell without re-running the config cell
    # enlarges it again.
    n_samples += val_and_test_size * 2
    twcs = twcs.sample(n=n_samples, random_state=345)

# label each tweet with its corporate account, pooling every other author into
# a single 'non-corporate' class
# (original note: 109 corp accounts, or non-corp acct (92 classes); 40% of data
#  is the other class - figures not verifiable from this cell)
# An author_id is kept as-is when it is purely alphabetic after removing
# underscores and treating 'O2' as 'O'; anything else becomes 'non-corporate'.
start = datetime.datetime.now()
twcs['author_id'] = twcs['author_id'].apply(
    lambda x: x if x.replace('_', '').replace('O2', 'O').isalpha() else 'non-corporate')
print('consolidate author time: ' + str(datetime.datetime.now() - start))

# avoid not having class representatives in train and validate sets
if run_env == 'windows':
    # single source of truth for the minimum class size (was hard-coded twice)
    min_tweets = 500
    n_tweets_by_author = twcs['author_id'].value_counts()
    classes_kept = n_tweets_by_author[n_tweets_by_author >= min_tweets]
    twcs = twcs[twcs['author_id'].isin(classes_kept.index)]
    print(str(len(classes_kept)) + ' included classes:')
    print(classes_kept)
    # hold-out size is recomputed from what survived the class filter
    val_and_test_size = int(len(twcs) * val_and_test_prop)

# cleanse text
# remove @____ text

# NOTE(review): the character class has no underscore, so a mention such as
# "@Uber_Support" is only stripped up to the underscore ("_Support" survives).
# Left unchanged here because altering it changes the tokens produced
# downstream - confirm whether that is intended.
drop_user_pattern = re.compile("(@[A-Za-z0-9]+)")
# raw string: '\s' inside a plain literal is an invalid escape sequence that
# newer Python versions warn about; the compiled pattern is unchanged
http_pattern = re.compile(r'http[^\s]+')
# translation table that deletes every character in string.punctuation
translator = str.maketrans('', '', string.punctuation)
porter_stemmer = porter.PorterStemmer()

def word_cleanse(word):
    """
    Normalise a single token.

    Punctuation is deleted first (via ``translator``), then the result is
    Porter-stemmed.  When the module-level ``simplify_http`` flag is set, any
    part of the stemmed token matching ``http_pattern`` is replaced by the
    literal ``'http'``.

    :param word: single token (string)
    :return: cleansed token (string)
    """
    stemmed = porter_stemmer.stem(word.translate(translator))

    # leave the token alone unless web addresses should be collapsed
    if not simplify_http:
        return stemmed
    return http_pattern.sub('http', stemmed)

def text_cleanse(tweet_text):
    """
    cleanse text column of the tweet

    The tweet is lower-cased and split on single spaces.  Behaviour is driven
    by the module-level flags ``drop_user`` (strip @username mentions),
    ``remove_stop_words`` (drop English stop words and words whose length is
    not between 3 and 14) and ``apply_porter_stemmer`` (run ``word_cleanse``
    on every word).

    :param tweet_text: string of words (tweet)
    :return: list of cleansed words in tweet text
    """
    # replace @username tags with a space, then collapse whitespace and tokenise
    if drop_user:
        without_users = re.sub(drop_user_pattern, " ", tweet_text)
        # split()/join already leaves single spaces only, so no further
        # double-space replacement is needed before the final split
        words = ' '.join(without_users.split()).lower().split(' ')
    else:
        words = tweet_text.lower() \
            .replace('  ', ' ') \
            .split(' ')

    # remove stop words (decided not to do this because removes important words like et (eastern time))
    if remove_stop_words:
        # `words` is already a list here (the old code called .split on it and
        # raised AttributeError).  The stop-word list is loaded once and cached
        # on the function instead of being re-read for every single word.
        stop_words = getattr(text_cleanse, '_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            text_cleanse._stop_words = stop_words
        words = [word for word in words
                 if word not in stop_words and 2 < len(word) < 15]

    # apply porter stemmer & remove punctuation
    if apply_porter_stemmer:
        words = [word_cleanse(word=word) for word in words]

    return words

# tokenise and cleanse every tweet
start = datetime.datetime.now()
twcs['text'] = twcs['text'].apply(text_cleanse)
elapsed = datetime.datetime.now() - start
print('cleanse time: ' + str(elapsed))

# in colaboratory mode the cleansed frame is cached on disk ...
cleansed_path = data_dir + 'twcs_cleansed.gzip'
if run_env == 'colaboratory':
    twcs.to_csv(cleansed_path, compression='gzip', index=False)

twcs.head()

# ... and read straight back; the csv round trip turns each word list into its
# string repr, so it is parsed back into a list afterwards
if run_env == 'colaboratory':
    start = datetime.datetime.now()
    twcs = pd.read_csv(cleansed_path, compression='gzip')
    elapsed = datetime.datetime.now() - start
    print('read data time:' + str(elapsed))

    start = datetime.datetime.now()
    twcs['text'] = twcs['text'].apply(ast.literal_eval)
    elapsed = datetime.datetime.now() - start
    print('string to list time:' + str(elapsed))

# NOTE(review): val_and_test_size is only assigned in the run_env == 'windows'
# branches earlier in the notebook; any other mode reaches this point without it.
start = datetime.datetime.now()

# carve out the validation set, stratified by author
x_train, x_val, y_train, y_val = \
    train_test_split(twcs['text'], twcs['author_id'],
                     test_size=val_and_test_size,
                     stratify=twcs['author_id'],
                     random_state=3135,
                     shuffle=True)

# carve the test set (same absolute size) out of what is left
x_train, x_test, y_train, y_test = \
    train_test_split(x_train, y_train,
                     test_size=val_and_test_size,
                     stratify=y_train,
                     random_state=3135,
                     shuffle=True)

# fresh 0..n-1 indices; plain assignment instead of inplace mutation of the
# objects returned by train_test_split
x_train = x_train.reset_index(drop=True)
x_val = x_val.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
print('split time: ' + str(datetime.datetime.now() - start))

# reformat y to one-hot encoding for Keras cat target
# Fit on every label rather than on the validation labels only: a class that
# happened to be missing from y_val would otherwise be encoded as an all-zero
# row in train/test.  When y_val contains every class the encoding is unchanged
# (LabelBinarizer orders classes by sorted label).
y_encoder = LabelBinarizer().fit(twcs['author_id'].values)
y_train = y_encoder.transform(y_train)
y_val = y_encoder.transform(y_val)
y_test = y_encoder.transform(y_test)

def to_unique_words(seq, idfun=None):
    """
    Flatten a sequence of sentences into a de-duplicated list of items.

    Items are returned in order of first appearance.  Two items count as
    duplicates when ``idfun`` maps them to the same key; without ``idfun``
    the item itself is the key.

    :param seq: iterable of iterables (e.g. list of tokenised sentences)
    :param idfun: optional callable producing the de-duplication key
    :return: list of the first item seen for each distinct key
    """
    key_of = idfun if idfun is not None else (lambda item: item)

    seen_keys = set()
    unique_items = []
    for sentence in seq:
        for word in sentence:
            key = key_of(word)
            if key not in seen_keys:
                seen_keys.add(key)
                unique_items.append(word)
    return unique_items


# vocabulary = distinct tokens seen in the training split only
unique_words = to_unique_words(x_train)
vocab_size = len(unique_words)

# longest training sentence (in tokens) fixes the padded sequence length
sent_len = x_train.map(len)
max_sent_len = np.max(sent_len)
print('\nmax_steps: ' + str(max_sent_len))


def to_token(x, vocab_size, max_len, padding):
    """
    Turn tokenised sentences into fixed-length integer sequences.

    Each sentence (a list of words) is re-joined with single spaces, encoded
    with keras ``one_hot`` using ``vocab_size`` as the hashing space, and the
    encoded sentences are padded to ``max_len`` with ``pad_sequences``.

    :param x: pandas Series whose elements are lists of words
    :param vocab_size: size argument forwarded to ``one_hot``
    :param max_len: ``maxlen`` forwarded to ``pad_sequences``
    :param padding: ``padding`` mode forwarded to ``pad_sequences``
    :return: result of ``pad_sequences`` for the encoded sentences
    """
    sentences = x.apply(' '.join)
    encoded = [one_hot(sentence, vocab_size) for sentence in sentences]
    return pad_sequences(encoded, maxlen=max_len, padding=padding)

# encode all three splits with the same vocabulary size and sequence length
start = datetime.datetime.now()
token_args = dict(vocab_size=vocab_size, max_len=max_sent_len, padding=padding)
x_train = to_token(x=x_train, **token_args)
x_val = to_token(x=x_val, **token_args)
x_test = to_token(x=x_test, **token_args)
elapsed = datetime.datetime.now() - start
print('to_token time:' + str(elapsed))

def plot_training_results(metric, history, nn, x_test, y_test, file=None):
    """
    Plot the train/validation curve of one metric and report the test value.

    The model is evaluated on the test data and the resulting loss or accuracy
    (whichever ``metric`` names) is shown in the figure title, rounded to three
    decimals.

    :param metric: key into ``history.history``; ``'loss'`` selects the test
        loss for the title, anything else selects the test accuracy
    :param history: object exposing a ``history`` dict with ``metric`` and
        ``'val_' + metric`` series
    :param nn: model whose ``evaluate`` returns ``(loss, accuracy)``
    :param x_test: test inputs passed to ``nn.evaluate``
    :param y_test: test targets passed to ``nn.evaluate``
    :param file: path to save the figure to; when None the figure is shown
    """
    test_loss, test_acc = nn.evaluate(x=x_test, y=y_test)
    if metric == 'loss':
        test_result = test_loss
    else:
        test_result = test_acc

    metric_title = metric.title()

    plt.figure(figsize=(8*1.5, 6*1.5))
    for key_prefix, curve_label in (('', 'train'), ('val_', 'val')):
        plt.plot(history.history[key_prefix + metric], label=curve_label)
    plt.legend()
    plt.title(metric_title + ' by Epoch    |    Test ' + metric_title + ': ' + str(round(test_result, 3)))

    # either display interactively or write to disk, then release the figure
    if file is None:
        plt.show()
    else:
        plt.savefig(file, transparent=True)

    plt.close()

# y = y_test
# x=x_test
# nn=model
def plot_confusion_matrix(x, y, nn, file=None):
    """
    Draw a heat map of the log-scaled confusion matrix for a classifier.

    True classes come from the arg-max of the one-hot rows in ``y``; predicted
    classes come from the arg-max of ``nn.predict(x)``.  The micro-averaged F1
    score, rounded to three decimals, is shown in the title.

    :param x: model inputs
    :param y: one-hot encoded true labels (arg-max is taken along axis 1)
    :param nn: model exposing ``predict``
    :param file: path to save the figure to; when None the figure is shown
    """
    true_classes = np.argmax(y, axis=1)
    predicted_classes = np.argmax(nn.predict(x=x), axis=1)

    f1 = np.round(f1_score(true_classes, predicted_classes, average='micro'), 3)

    # log(1 + count) compresses the range of the counts (raw counts would be
    # confusion_matrix(...) without the log)
    raw_counts = confusion_matrix(y_true=true_classes, y_pred=predicted_classes)
    conf_matrix = np.log(1 + raw_counts)

    plt.figure(figsize=(8 * 1.5, 6 * 1.5))
    sns.heatmap(conf_matrix, center=np.median(conf_matrix))
    plt.title('Confusion Matrix (log of 1 + count)    |    F1: ' + str(f1))

    if file is None:
        plt.show()
    else:
        plt.savefig(file, transparent=True)

    plt.close()

# plot_confusion_matrix(x=x_test, y=y_test, nn=model)

# # # define fully connected model
# fc_model = Sequential()
# fc_model.add(Embedding(vocab_size, 500, input_length=max_sent_len))
# fc_model.add(Flatten())
# fc_model.add(Dense(200, activation='relu'))
# fc_model.add(Dropout(.3))
# fc_model.add(Dense(100, activation='relu'))
# fc_model.add(Dropout(.3))
# fc_model.add(Dense(50, activation='relu'))
# fc_model.add(Dense(y_val.shape[1], activation='softmax'))
# # compile the model
# fc_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# # summarize the model
# print(fc_model.summary())
# # fit the model
# start = datetime.datetime.now()
# history = fc_model.fit(x_train, y_train, epochs=5, verbose=verbose, validation_data=[x_val, y_val], batch_size=256)
# print('fc train time: ' + str(datetime.datetime.now() - start))
#
# plot_training_results(metric='acc', history=history, nn=fc_model, x_test=x_test, y_test=y_test,
#                       file=data_dir + 'fc_acc.png')
# plot_training_results(metric='loss', history=history, nn=fc_model, x_test=x_test, y_test=y_test,
#                       file=data_dir + 'fc_loss.png')
# plot_confusion_matrix(x=x_test, y=y_test, nn=fc_model, file=data_dir + 'fc_confusion.png')

# t = np.bincount(np.argmax(y_test, axis=1)) / np.sum(np.argmax(y_test, axis=1))



consolidate author time: 0:00:00.158845
45 included classes:
non-corporate      93591
AmazonHelp         10169
AppleSupport        6486
Uber_Support        3422
SpotifyCares        2730
Delta               2488
Tesco               2342
AmericanAir         2287
TMobileHelp         2162
comcastcares        2077
SouthwestAir        1755
British_Airways     1746
VirginTrains        1665
Ask_Spectrum        1556
XboxSupport         1512
sprintcare          1336
hulu_support        1315
sainsburys          1205
GWRHelp             1182
AskPlayStation      1149
VerizonSupport      1140
ChipotleTweets      1116
UPSHelp             1090
ATVIAssist          1039
Safaricom_Care       982
idea_cares           958
O2                   930
AskTarget            869
AirAsiaSupport       787
SW_Help              739
ArgosHelpers         726
BofA_Help            726
AskPayPal            720
AskLyft              704
MicrosoftHelps       671
AskAmex              670
marksandspencer      667
AdobeCare     

In [4]:
from keras.models import load_model

# restore the previously trained fully connected model from the data directory
fc_model_path = data_dir + 'fc_model.h5'
fc_model = load_model(fc_model_path)

# show its architecture
fc_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 68, 500)           31679000  
_________________________________________________________________
flatten_1 (Flatten)          (None, 34000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               6800200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                5050      
__________

In [5]:
# take feature extractor: a model that maps the fc model's input to the output
# of its 'embedding_1' layer
embedding_output = fc_model.get_layer('embedding_1').output
base_model = Model(inputs=fc_model.input, outputs=embedding_output)

# the lines below are an inert reference template (comments plus a bare string
# literal); nothing in them is executed
# load model
# input = Input(batch_shape=(5, 224, 224, 3))
# base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
"""
x = base_model.output
x = Flatten()(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(16)(x)
x = Activation('relu')(x)
x = Dense(3)(x)
predictions = Activation('softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# freeze vgg19 layers
for layer in base_model.layers:
    layer.trainable = False

# compile model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
"""

# extend the feature extractor with a recurrent classification head
batch_size = 2**9
# embed_size is not used when building this model: the embedding comes from
# base_model
embed_size = 500
epochs = 4

# functional-API tensor that is threaded through the new layers.  Despite the
# "lstm" naming, the recurrent layer actually used is a GRU.
lstm_model = base_model.output

#lstm_model = Sequential()
# lstm_model = LSTM(200, return_sequences=True, stateful=False)(lstm_model)
lstm_model = GRU(200, return_sequences=False, stateful=False)(lstm_model)
lstm_model = Dropout(.4)(lstm_model)
# NOTE(review): Dense(100) and Dense(50) are created without an activation
# argument, so they use the layer's default - confirm that this is intended.
lstm_model = Dense(100)(lstm_model)
lstm_model = Dropout(.2)(lstm_model)
lstm_model = Dense(50)(lstm_model)
# lstm_model = Dropout(.2)(lstm_model)
# lstm_model = Dense(50)(lstm_model)
# one softmax unit per encoded class
predictions = Dense(y_val.shape[1], activation='softmax')(lstm_model)

full_lstm_model = Model(inputs=base_model.input, outputs=predictions)

# freeze embedding layer (every layer of base_model) before compiling
for layer in base_model.layers:
    layer.trainable = False

# compile the model
full_lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model; summary() prints by itself and returns None, so wrapping
# it in print() only added a stray "None" line
full_lstm_model.summary()

# fit the model
# per-call metric histories; one entry (a list of `epochs` values) per fit() call
loss = []
val_loss = []
acc = []
val_acc = []

# Train in chunks of `epochs` epochs until the time budget is used up.  The
# budget is only checked between fit() calls, so a call that has started always
# runs to completion.
train_minutes = 1
time_budget = datetime.timedelta(minutes=train_minutes)
start = datetime.datetime.now()
while datetime.datetime.now() - start < time_budget:
    # validation_data is passed as a tuple, the form documented for fit()
    history = full_lstm_model.fit(x=x_train, y=y_train,
                                  validation_data=(x_val, y_val),
                                  epochs=epochs, batch_size=batch_size,
                                  shuffle=True, verbose=verbose)

    loss.append(history.history['loss'])
    val_loss.append(history.history['val_loss'])
    acc.append(history.history['acc'])
    val_acc.append(history.history['val_acc'])

# overwrite the last History's series with the concatenation over all calls so
# the plotting helpers see the whole run
history.history['loss'] = np.array(loss).ravel()
history.history['val_loss'] = np.array(val_loss).ravel()
history.history['acc'] = np.array(acc).ravel()
history.history['val_acc'] = np.array(val_acc).ravel()

print(history.history['loss'])
print(history.history['val_loss'])

# loss curve, accuracy curve and confusion matrix for the GRU model, saved as PNGs
lstm_plot_args = dict(history=history, nn=full_lstm_model, x_test=x_test, y_test=y_test)
plot_training_results(metric='loss', file=data_dir + 'lstm_loss.png', **lstm_plot_args)
plot_training_results(metric='acc', file=data_dir + 'lstm_acc_loss.png', **lstm_plot_args)
plot_confusion_matrix(x=x_test, y=y_test, nn=full_lstm_model,
                      file=data_dir + 'lstm_confusion.png')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1_input (InputLaye (None, 68)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 68, 500)           31679000  
_________________________________________________________________
gru_1 (GRU)                  (None, 200)               420600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
__________

In [6]:
# re-draw the loss and accuracy curves, then the confusion matrix, for the GRU model
for metric_name, file_name in (('loss', 'lstm_loss.png'), ('acc', 'lstm_acc_loss.png')):
    plot_training_results(metric=metric_name, history=history, nn=full_lstm_model,
                          x_test=x_test, y_test=y_test, file=data_dir + file_name)
plot_confusion_matrix(x=x_test, y=y_test, nn=full_lstm_model,
                      file=data_dir + 'lstm_confusion.png')



In [0]:
# define sequential model: trainable embedding -> LSTM -> dense head
batch_size = 2**9
embed_size = 500
epochs = 4
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, embed_size, input_length=max_sent_len))
# lstm_model.add(LSTM(200, return_sequences=True, stateful=False))
lstm_model.add(LSTM(500, return_sequences=False, stateful=False))
lstm_model.add(Dropout(.4))
# NOTE(review): the intermediate Dense layers are created without an activation
# argument, so they use the layer's default - confirm that this is intended.
lstm_model.add(Dense(200))
lstm_model.add(Dropout(.2))
lstm_model.add(Dense(50))
lstm_model.add(Dropout(.2))
lstm_model.add(Dense(50))
# one softmax unit per encoded class
lstm_model.add(Dense(y_val.shape[1], activation='softmax'))
# compile the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model; summary() prints by itself and returns None, so wrapping
# it in print() only added a stray "None" line
lstm_model.summary()
# fit the model
# per-call metric histories; one entry (a list of `epochs` values) per fit() call
loss = []
val_loss = []
acc = []
val_acc = []

# Train in chunks of `epochs` epochs until the time budget is used up.  The
# budget is only checked between fit() calls, so a call that has started always
# runs to completion.
train_minutes = 1
time_budget = datetime.timedelta(minutes=train_minutes)
start = datetime.datetime.now()
while datetime.datetime.now() - start < time_budget:
    # validation_data is passed as a tuple, the form documented for fit()
    history = lstm_model.fit(x=x_train, y=y_train, validation_data=(x_val, y_val),
                             epochs=epochs, batch_size=batch_size, shuffle=True, verbose=verbose)

    loss.append(history.history['loss'])
    val_loss.append(history.history['val_loss'])
    acc.append(history.history['acc'])
    val_acc.append(history.history['val_acc'])

# overwrite the last History's series with the concatenation over all calls so
# the plotting helpers see the whole run
history.history['loss'] = np.array(loss).ravel()
history.history['val_loss'] = np.array(val_loss).ravel()
history.history['acc'] = np.array(acc).ravel()
history.history['val_acc'] = np.array(val_acc).ravel()

print(history.history['loss'])
print(history.history['val_loss'])

# loss curve, accuracy curve and confusion matrix for the Sequential LSTM model
sequential_plot_args = dict(history=history, nn=lstm_model, x_test=x_test, y_test=y_test)
plot_training_results(metric='loss', file=data_dir + 'lstm_loss.png', **sequential_plot_args)
plot_training_results(metric='acc', file=data_dir + 'lstm_acc_loss.png', **sequential_plot_args)
plot_confusion_matrix(x=x_test, y=y_test, nn=lstm_model,
                      file=data_dir + 'lstm_confusion.png')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 68, 500)           31679000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 500)               2002000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               100200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                10050     
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
__________

KeyboardInterrupt: ignored