<a href="https://colab.research.google.com/github/mohammedterry/NLP_for_ML/blob/master/topics_intents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### preprocessing raw text

In [0]:
example_text = '''
A huge fireball exploded in the Earth's atmosphere in December, according to Nasa.

The blast was the second largest of its kind in 30 years, and the biggest since the fireball over Chelyabinsk in Russia six years ago.

But it went largely unnoticed until now because it blew up over the Bering Sea, off Russia's Kamchatka Peninsula.

The space rock exploded with 10 times the energy released by the Hiroshima atomic bomb.

Lindley Johnson, planetary defence officer at Nasa, told BBC News a fireball this big is only expected about two or three times every 100 years.
'''

In [118]:
import nltk
# Fetch the NLTK data packages needed below: the stopword lists and WordNet
# (the latter backs WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
# English stopword list; preprocess() drops every token found in it.
stopWords = stopwords.words('english')

from nltk.stem import WordNetLemmatizer
# Single shared lemmatiser instance; preprocess() calls it with pos='v'.
wn_lemmatiser = WordNetLemmatizer()

def clean(text):
  """Lower-case ``text`` and keep only the letters a-z and digit characters.

  Every other character is turned into a space, then runs of whitespace are
  collapsed so the result is a single-space-separated string with no leading
  or trailing blanks.
  """
  kept = []
  for ch in text.lower():
    if 'a' <= ch <= 'z' or ch.isdigit():
      kept.append(ch)
    else:
      # anything that is not a-z or a digit acts as a word separator
      kept.append(' ')
  words = ''.join(kept).split()
  return ' '.join(words)

def preprocess(document):
  """Convert a raw document into a list of lemmatised, stopword-free tokens.

  The text is normalised with ``clean``, split on whitespace, filtered against
  ``stopWords`` and each surviving token is lemmatised as a verb (pos='v').
  """
  lemmas = []
  for word in clean(document).split():
    if word in stopWords:
      continue
    lemmas.append(wn_lemmatiser.lemmatize(word, pos='v'))
  return lemmas

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [146]:
' '.join(preprocess(example_text))

'huge fireball explode earth atmosphere december accord nasa blast second largest kind 30 years biggest since fireball chelyabinsk russia six years ago go largely unnoticed blow bering sea russia kamchatka peninsula space rock explode 10 time energy release hiroshima atomic bomb lindley johnson planetary defence officer nasa tell bbc news fireball big expect two three time every 100 years'

# LDA (Gensim)


### Pre-trained LDA models

load some pretrained lda models

In [0]:
from gensim.test.utils import datapath
import gensim

# Load two previously saved LDA models from the paths resolved by gensim's datapath helper.
# NOTE(review): these look like the tiny demo models bundled with gensim's test data
# (the topics printed below are not news-related) — confirm before relying on them.
lda1 = gensim.models.LdaMulticore.load(datapath("lda_3_0_1_model"))
lda2 = gensim.models.LdaMulticore.load(datapath("ldamodel_python_3_5"))

In [0]:
def lda_topics(text, model, top_n = 3):
  """Return the ``top_n`` best-matching LDA topics for ``text``.

  Args:
    text: raw document; it is tokenised with ``preprocess`` and converted to a
      bag-of-words using the model's own ``id2word`` dictionary.
    model: a trained gensim LDA model.
    top_n: maximum number of topics to return.

  Returns:
    A list of ``(topic_id, label, score)`` tuples ordered by descending score
    (ties broken by descending topic id). ``label`` is the topic's five most
    probable words joined with '/'.
  """
  bow = model.id2word.doc2bow(preprocess(text))
  topic_scores = dict(model[bow])
  ranked = sorted(topic_scores.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)[:top_n]
  # show_topic yields (word, probability) pairs directly. Parsing the formatted
  # print_topic string and filtering with isalpha() silently dropped any topic
  # word containing a digit, even though clean() deliberately keeps digits.
  return [(topic, '/'.join(word for word, _ in model.show_topic(topic, 5)), score) for topic, score in ranked]

try them out

In [143]:
lda_topics(example_text, lda1)

[(0, 'system/user/eps/time/response', 0.8268186987472599),
 (1, 'graph/trees/minors/survey/interface', 0.17318130125274017)]

In [144]:
lda_topics(example_text, lda2)

[(1, 'response/user/eps/survey/time', 0.8268134549578768),
 (0, 'graph/trees/minors/system/computer', 0.17318654504212305)]

### Train your own LDA model

Get some sample documents to represent the topics

In [0]:
from sklearn.datasets import fetch_20newsgroups
# Training split of 20 Newsgroups, shuffled, with headers, footers and quoted text removed.
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True, remove=('headers', 'footers', 'quotes'))
# Tokenise every post with the same preprocess() that lda_topics() applies to new text.
preprocessed_docs = [preprocess(doc) for doc in newsgroups_train.data]

train an LDA model on the sample documents

In [0]:
import gensim
# Build the token <-> id dictionary from the tokenised posts.
vocab = gensim.corpora.Dictionary(preprocessed_docs)
# Prune the dictionary in place: drop tokens that appear in fewer than 15 documents
# or in more than 10% of them, then keep at most the 100000 most frequent.
vocab.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
bow_corpus = [vocab.doc2bow(doc) for doc in preprocessed_docs]
# Fixed seed so that topic ids and top words are repeatable between runs.
# NOTE: with several workers the chunk processing order can still introduce
# small run-to-run differences.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 20, id2word = vocab, passes = 10, workers = 2, random_state = 0)

save the trained LDA model

In [0]:
from gensim.test.utils import datapath
# Resolve a path named "lda" through gensim's datapath helper and save the model there.
# NOTE(review): datapath comes from gensim.test.utils, so this presumably writes inside
# gensim's own test-data directory — consider a project-local path; confirm.
temp_file = datapath("lda")
lda_model.save(temp_file)

load in the trained LDA model

In [0]:
# Reload the trained model from the path it was saved to (temp_file).
your_lda = gensim.models.LdaMulticore.load(temp_file)

test out the trained LDA model

In [145]:
lda_topics(example_text, your_lda)

[(3, 'gun/government/us/really/keep', 0.33644345),
 (17, 'state/program/report/national/research', 0.27499065),
 (2, 'wire/cause/power/back/grind', 0.16544925)]

# Intent Extraction (NLP Architect)

### Train your own Intent Model

install the nlp architect library

In [0]:
!pip3 install nlp_architect

download and load in the intent training data

In [1]:
!git clone https://github.com/snipsco/nlu-benchmark.git

Cloning into 'nlu-benchmark'...
remote: Enumerating objects: 378, done.[K
remote: Total 378 (delta 0), reused 0 (delta 0), pack-reused 378[K
Receiving objects: 100% (378/378), 1.23 MiB | 152.00 KiB/s, done.
Resolving deltas: 100% (240/240), done.


In [0]:
from nlp_architect.data.intent_datasets import SNIPS
# Load the SNIPS intent data from the cloned nlu-benchmark repository.
# NOTE(review): sentence_length / word_length presumably set the padded tokens-per-sentence
# and characters-per-word; predict_intent()'s defaults (50, 12) repeat these values — confirm.
intent_dataset = SNIPS(path='nlu-benchmark/2017-06-custom-intent-engines/',sentence_length=50,word_length=12)
# Each split unpacks into four arrays. Judging by later use: x and c are the model inputs,
# i and y the intent / per-token label targets — TODO confirm against the library docs.
train_x, train_c, train_i, train_y = intent_dataset.train_set
test_x, test_c, test_i, test_y = intent_dataset.test_set

encode the training and test labels as one-hot vectors

In [0]:
# Import from the public Keras namespace; `tensorflow.python.*` is TensorFlow's
# private package tree and is not a stable import location.
from tensorflow.keras.utils import to_categorical
# NOTE: the assignments below overwrite the integer labels with their one-hot
# versions, so this cell must be run only once per fresh dataset load.
train_y = to_categorical(train_y, intent_dataset.label_vocab_size)
test_y = to_categorical(test_y, intent_dataset.label_vocab_size)

from nlp_architect.utils.generic import one_hot
train_i = one_hot(train_i, len(intent_dataset.intents_vocab))
test_i = one_hot(test_i, len(intent_dataset.intents_vocab))

download and load in word vectors (e.g. GloVe - feel free to use others)

In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2019-03-19 11:31:44--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-03-19 11:31:44--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-03-19 11:32:41 (14.5 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
from nlp_architect.utils.embedding import load_word_embeddings
# Load the 300-dimensional GloVe vectors extracted above; the second return value is unused here.
wordvectors, _ = load_word_embeddings('glove.6B.300d.txt')

from nlp_architect.utils.embedding import get_embedding_matrix
# Build the embedding matrix for the dataset's word vocabulary from the loaded vectors.
embedding_matrix = get_embedding_matrix(wordvectors, intent_dataset.word_vocab)

Set up and train the model (an RNN). Feel free to modify the hyperparameters, but make sure `word_emb_dims` matches the dimensionality of the word vectors you loaded (e.g. `word_emb_dims=100` if you use `glove.6B.100d.txt`).

In [0]:
from nlp_architect.models.intent_extraction import MultiTaskIntentModel
machinelearning_model = MultiTaskIntentModel()
# word_emb_dims must equal the dimensionality of the loaded word vectors
# (300 here, because glove.6B.300d.txt was loaded).
machinelearning_model.build(intent_dataset.word_len, intent_dataset.label_vocab_size, intent_dataset.intent_size,intent_dataset.word_vocab_size,intent_dataset.char_vocab_size,word_emb_dims=300,tagger_lstm_dims=100, dropout=0.2)
# Load the GloVe-derived matrix into the model's embedding weights.
machinelearning_model.load_embedding_weights(embedding_matrix)

In [0]:
# Two model inputs per example (x and c from the dataset split).
train_inputs = [train_x, train_c]
test_inputs = [test_x, test_c]

# Two targets per example: the one-hot intent labels (i) and the one-hot y labels.
train_outputs = [train_i, train_y]
test_outputs = [test_i, test_y]

# NOTE: the test split is passed as the validation data, so no separate held-out
# set remains for a final evaluation.
machinelearning_model.fit(train_inputs, train_outputs, batch_size = 32, epochs = 50, validation = (test_inputs, test_outputs))

Let's try out our trained model.

In [0]:
import spacy
sp = spacy.load('en')
from spacy.tokenizer import Tokenizer
# Tokenizer constructed from the pipeline's vocab only (no prefix/suffix/infix rules are passed).
tokenizer = Tokenizer(sp.vocab)

def predict_intent(sentence, max_words = 50, max_chars = 12):
  """Predict the intent of a single sentence with the trained model.

  The sentence is encoded twice: as word ids (tokens from ``tokenizer``, id 1
  for words missing from the word vocabulary) and as per-word character ids
  (words from ``sentence.split()``, each truncated to ``max_chars``). Both
  encodings are zero-padded to ``max_words`` rows, wrapped in a batch of one
  and passed to ``machinelearning_model.predict``.

  Returns a list with the intent name decoded from the first model output.
  """
  word_vocab = intent_dataset.word_vocab
  char_vocab = intent_dataset.char_vocab

  # word-level encoding, padded with 0 up to max_words
  word_ids = []
  for token in tokenizer(sentence)[:max_words]:
    text = token.text
    if text in word_vocab:
      word_ids.append(word_vocab[text])
    else:
      word_ids.append(1)
  word_ids.extend([0] * (max_words - len(word_ids)))

  # character-level encoding: one row of max_chars ids per word
  char_ids = []
  for word in sentence.split()[:max_words]:
    clipped = word[:max_chars]
    row = [char_vocab[ch] for ch in clipped]
    row.extend([0] * (max_chars - len(clipped)))
    char_ids.append(row)
  missing_rows = max_words - len(char_ids)
  for _ in range(missing_rows):
    char_ids.append([0] * max_chars)

  predictions = machinelearning_model.predict([[word_ids], [char_ids]], batch_size=1)
  intent_ids = predictions[0].argmax(1)
  return [intent_dataset.intents_vocab.id_to_word(i) for i in intent_ids]

In [168]:
predict_intent("play a little song")

['PlayMusic']

# ShortText

In [0]:
#!pip3 install shorttext
#!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
#from shorttext.utils import load_word2vec_model
#wvmodel = load_word2vec_model('GoogleNews-vectors-negative300.bin.gz')

#import shorttext
#nihtraindata = shorttext.data.nihreports(sample_size=None)
#classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel)   
#classifier.train(nihtraindata)
#classifier.save_compact_model('sumvec_nihdata_model.bin')
#!ls

#classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, 'sumvec_nihdata_model.bin')

#classifier2.score('bioinformatics')

#sorted(classifier2.score('cancer immunotherapy').items(), key=lambda item: item[1], reverse=True)[:5]