In [1]:
import numpy as np
import pandas as pd
import re

from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
np.random.seed(1)
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import text_to_word_sequence

import matplotlib.pyplot as plt
import gc

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.preprocessing.text import text_to_word_sequence

stop_words = pd.read_csv('../data/stopwords.csv')['words'].values

# Hash-based copy of the stop words. Testing membership against the NumPy array
# scans every entry for each token, which adds up over a whole corpus.
_stop_word_set = frozenset(stop_words)


def remove_stop_words(text):
    """Return `text` with all stop words removed.

    The text is split with Keras' `text_to_word_sequence`; every token that is
    not listed in `stop_words` is kept, and the survivors are joined back
    together with single spaces.

    Parameters
    ----------
    text : str
        The sentence to clean.

    Returns
    -------
    str
        The remaining tokens separated by single spaces.
    """
    kept_tokens = [w for w in text_to_word_sequence(text) if w not in _stop_word_set]
    return ' '.join(kept_tokens)

print(stop_words)

['only' 'y' 'by' 'am' 'most' 'me' 'same' 'these' 'so' 'some' 'why' 'down'
 'had' 'd' 'at' 'having' 'those' 'has' 'few' 'theirs' "you've" 'more' 'i'
 'than' 'through' 'be' 'what' 'where' 'myself' 'which' 'doing' 'ours'
 'will' 'in' 'both' 'do' 'it' 'o' 'on' 'yours' 'once' 'ourselves' 'here'
 'about' "it's" 'my' 'for' 'her' 'then' 'after' "should've" 'from' 'each'
 'when' 'does' 'now' 'off' 'don' 'are' 'we' 'itself' 'should' 'his'
 'between' 'our' 'were' 'under' 'other' 'all' 'she' 'won' 'been' "you're"
 'how' 'did' 'yourself' 'they' 'into' 'there' 've' 'such' 't' 's' 'and'
 'over' 'to' 'just' 'was' 'being' 'because' 'if' 'who' 'further' 'the'
 'any' "that'll" 'themselves' 'as' 'again' "you'd" 'until' 'he' 'him'
 'this' 'or' 'of' 'below' 'an' "she's" 'weren' 'm' 'their' 'ma' 'up' 'll'
 'whom' 'hers' 'can' 'you' 'them' 'very' 'a' 'herself' 'before' 'too'
 'himself' 'during' 're' 'out' 'its' 'above' 'own' 'have' 'while'
 'yourselves' 'that' 'with' "you'll" 'is' 'your']


In [4]:
# CSV files that make up the data set; they are stacked row-wise in this order.
dataset_paths = [
    '../data/tweets.csv',
    '../data/word-list.csv',
    '../data/twitter-airline-sentiment.csv',
    '../data/sentiwordnet.csv',
    '../data/reviews.csv',
    '../data/imdb.csv',
]

data = pd.concat([pd.read_csv(path) for path in dataset_paths])

In [None]:
# Replace every row's text with its stop-word-free version.
data['text'] = [remove_stop_words(raw_text) for raw_text in data['text'].values]
data.head()

In [None]:
# `sentences` holds the cleaned strings; `corpus` holds the same rows as token lists.
sentences = data['text'].values
corpus = list(map(text_to_word_sequence, sentences))
len(corpus)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_raw_train, X_raw_test, Y_train, Y_test = train_test_split(
    sentences, data[['pos', 'neg']].values,
    random_state=3945, test_size=0.2)

In [None]:
# Dimensionality of each word2vec embedding vector.
vector_size = 100
# Number of token positions kept per sentence when building the index matrix.
max_sentence_length = 35

In [None]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

import os

# word2vec = KeyedVectors.load_word2vec_format('../../GoogleNews-vectors-negative300.bin', binary=True)

word2vec_name = 'lstm-word2vec.bin'

# Create Word2Vec
if os.path.isfile(word2vec_name):
    print("Loading...")
    # The cache file is written by Word2Vec.save() in the branch below, so it
    # is read back with the matching Word2Vec.load() rather than
    # KeyedVectors.load().
    word2vec_model = Word2Vec.load(word2vec_name)
else:
    print("Computing...")
    word2vec_model = Word2Vec(sentences=corpus,
                              size=vector_size,
                              window=10,
                              negative=20,
                              iter=50,
                              seed=1000,
                              workers=4)
    word2vec_model.save(word2vec_name)

# Only the trained word vectors are needed from here on. Drop the reference to
# the full model so its training state can be garbage collected.
word2vec = word2vec_model.wv
del word2vec_model

gc.collect()

In [None]:
def sent2index(corpus):
    """Convert sentences into a matrix of word2vec vocabulary indices.

    Parameters
    ----------
    corpus : sequence
        The sentences to encode. Each item may be a raw string, which is split
        into words with `text_to_word_sequence`, or an already tokenised
        sequence of words, which is used as is.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(corpus), max_sentence_length)``. Entry
        ``[i, t]`` is the vocabulary index of the t-th token of sentence i.
        Tokens beyond `max_sentence_length` are dropped; positions that are
        unused or hold a token missing from the vocabulary stay 0.
    """
    gc.collect()
    input_matrix = np.zeros((len(corpus), max_sentence_length))

    for row, sentence in enumerate(corpus):
        # Raw strings have to be split into words first: iterating a str
        # yields single characters, so every vocabulary lookup would be made
        # on a letter instead of a word.
        if isinstance(sentence, str):
            tokens = text_to_word_sequence(str(sentence))
        else:
            tokens = sentence

        for col, token in enumerate(tokens):
            if col >= max_sentence_length:
                break
            vocab_entry = word2vec.vocab.get(token)
            if vocab_entry is None:
                continue
            # NOTE(review): 0 is also the fill value for padding/unknown
            # tokens, so a word whose vocabulary index is 0 is
            # indistinguishable from padding - confirm whether that is intended.
            input_matrix[row, col] = vocab_entry.index
    return input_matrix

In [None]:
# Encode both halves of the split with the same vocabulary lookup (train first).
X_train, X_test = map(sent2index, (X_raw_train, X_raw_test))

In [None]:
gc.collect()
# One row per vocabulary entry plus one extra row.
# NOTE(review): presumably the extra row was meant as a padding slot, but rows
# are addressed by the raw vocabulary index, so it is the last row that stays
# all-zero while index 0 holds a real word vector - confirm.
vocab_len = 1 + len(word2vec.vocab)

emb_matrix = np.zeros((vocab_len, vector_size))

# Copy each word's vector into the row given by its vocabulary index.
for word, vocab_entry in word2vec.vocab.items():
    emb_matrix[vocab_entry.index, :] = word2vec[word]

print(emb_matrix.shape)

In [None]:
gc.collect()
# Model input: max_sentence_length vocabulary indices per sentence.
sentence_indices = Input(shape=(max_sentence_length,))

# Non-trainable embedding layer. It is built explicitly so its weights exist
# before the pre-computed embedding matrix is copied into it.
embedding_layer = Embedding(input_dim=vocab_len, output_dim=vector_size, trainable=False)
embedding_layer.build((None,))
embedding_layer.set_weights([emb_matrix])

embeddings = embedding_layer(sentence_indices)

In [None]:
# Two stacked LSTMs, each followed by dropout, then a 2-unit dense layer whose
# output goes through a softmax.
layer_stack = [
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(2, activation=None),
    Activation('softmax'),
]

# Thread the embedded sequence through the layers in order.
X = embeddings
for layer in layer_stack:
    X = layer(X)

model = Model(inputs=[sentence_indices], outputs=X)

model.summary()

In [None]:
model_file = "keras-model.h5"

def compile_model():
    """Compile the global `model` with Adam, categorical cross-entropy and an accuracy metric."""
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

def train_model(epochs=100, batch_size=512, callbacks=None):
    """Fit the global `model` on the training split and save its weights.

    The test split (`X_test`, `Y_test`) is passed as validation data. When
    fitting finishes, the weights are written to `model_file`.

    Parameters
    ----------
    epochs : int, optional
        Number of passes over the training data (default 100).
    batch_size : int, optional
        Samples per gradient update (default 512).
    callbacks : list, optional
        Keras callbacks forwarded to `model.fit`, e.g. an `EarlyStopping`
        instance to end training before `epochs` is reached. Defaults to None
        (no callbacks).
    """
    model.fit(
        X_train,
        y=Y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_test, Y_test),
        callbacks=callbacks)

    print("Saving model")
    model.save_weights(model_file)

def load_model():
    """Load the weights stored at `model_file` into the global `model`."""
    model.load_weights(filepath=model_file)

In [None]:
force_train = True
import os

# Saved weights are reused only when the file exists and retraining has not
# been forced; in every other case the model is trained from scratch.
weights_available = os.path.isfile(model_file)
if force_train or not weights_available:
    print("Training model...")
    compile_model()
    train_model()
else:
    print("Loading model...")
    load_model()
    compile_model()

In [None]:
mess = np.array([
    'this is slow',
    'this is exceptional service',
])
X_test_indices = sent2index(mess)
pred = model.predict(X_test_indices)

# One line per message: a label taken from the first prediction column, the
# message itself and the raw prediction row.
output = ''.join(
    '{} {} {}\n'.format('POSITIVE:' if scores[0] > 0.5 else 'NEGATIVE:', message, scores)
    for message, scores in zip(mess, pred)
)

print(output)

# Drop the scratch variables of this smoke test before moving on.
del mess, output, pred, X_test_indices
gc.collect()

In [None]:
from IPython.display import clear_output
import os

import urllib, json
import urllib.request

from dotenv import load_dotenv
load_dotenv()

# Telegram settings are read from the process environment; each is None when
# the variable is not set.
TLGRM_SECRET = os.environ.get('TLGRM_SECRET')
TLGRM_MIKE = os.environ.get('TLGRM_MIKE')

def telegram_call(method, query=None, retry_delay=1.0):
    """Call a Telegram Bot API method and return the decoded JSON response.

    A failed call is retried indefinitely until one succeeds, with a pause of
    `retry_delay` seconds between attempts.

    Parameters
    ----------
    method : str
        Name of the Bot API method, e.g. ``'getupdates'`` or ``'sendmessage'``.
    query : dict, optional
        Parameters sent in the URL query string. None (the default) sends none.
    retry_delay : float, optional
        Seconds to wait after a failed attempt before trying again.

    Returns
    -------
    object
        The parsed JSON body of the first successful response.
    """
    import time
    from urllib.parse import urlencode

    # HTTPS: the bot token is part of the URL and must not travel in clear text.
    url = 'https://api.telegram.org/bot{}/{}?{}'.format(
        TLGRM_SECRET, method, urlencode(query or {}))

    # Retry in a loop rather than by recursion, so a long outage cannot exhaust
    # the call stack.
    while True:
        try:
            with urllib.request.urlopen(url) as response:
                return json.loads(response.read().decode("utf-8"))
        except Exception:
            # Deliberately not a bare `except`: KeyboardInterrupt must still be
            # able to stop the cell.
            print('Repeating call...')
            time.sleep(retry_delay)

def telegram_bot(respond, poll_timeout=30):
    """Answer incoming Telegram text messages in an endless loop.

    Updates are fetched with `getupdates`. For every update that carries a
    text message, `respond` is called with the text and its return value is
    sent back to the originating chat. This function never returns.

    Parameters
    ----------
    respond : callable
        Takes the incoming message text and returns the reply text.
    poll_timeout : int, optional
        Long-polling timeout in seconds passed to `getupdates`, so the server
        holds the request open until an update arrives instead of being hit
        in a tight loop.
    """
    last_offset = 0

    while True:
        updates = telegram_call('getupdates', {
            'offset': last_offset,
            'timeout': poll_timeout,
        })
        for item in updates.get('result', []):
            # Advance the offset past this update so it is not fetched again.
            last_offset = item['update_id'] + 1

            message = item.get('message')
            if not message or 'text' not in message:
                continue

            text = message['text']
            chat_id = message['chat']['id']
            response = respond(text)
            reply = {'chat_id': chat_id, 'text': response}

            # Keep resending until Telegram reports the message as accepted.
            while True:
                sent = telegram_call('sendmessage', reply)
                if sent.get('ok'):
                    break

            clear_output()
            print('FROM: {}\nSAYS: {}\nRESPONSE: {}\n\n'.format(chat_id, text, response))

In [None]:
from math import floor
gc.collect()

# Filler interjections (not referenced elsewhere in the code visible here).
uhms = ["uhm", "uhh", "hmm", "hmmm", "oh uh", "oh hmm"]

# Emoji scale indexed by score bucket: index 0 is the lowest score, index 4 the
# highest.
sentiment = ["😭", "☹️", "😐", "🙂", "😍"]

def respond(text):
    """Build the bot's reply: the original text followed by a sentiment emoji.

    The text is encoded with `sent2index` and scored by `model`. The first
    output column is used as the score and selects an entry of `sentiment`.

    Parameters
    ----------
    text : str
        The incoming message.

    Returns
    -------
    str
        `text`, a blank line, then the chosen emoji.
    """
    mess = np.array([text])
    X_test_indices = sent2index(mess)
    pred = model.predict(X_test_indices)
    score = pred[0][0]

    # Scale the score onto the emoji list. A score of exactly 1.0 would land
    # one past the last entry, so the bucket is clamped to the valid range.
    last_bucket = len(sentiment) - 1
    bucket = max(0, min(floor(score * len(sentiment)), last_bucket))
    sent = sentiment[bucket]

    return '{}\n\n{}'.format(text, sent)

In [None]:
# Start the bot. telegram_bot() polls in a `while True` loop with no exit, so
# this cell keeps running until the kernel is interrupted.
telegram_bot(respond)