<a href="https://colab.research.google.com/github/mohammedterry/NLP_for_ML/blob/master/topics_intents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Sample news article reused as the input text for the preprocessing and LDA demos in this notebook.
example_text = '''
A huge fireball exploded in the Earth's atmosphere in December, according to Nasa.

The blast was the second largest of its kind in 30 years, and the biggest since the fireball over Chelyabinsk in Russia six years ago.

But it went largely unnoticed until now because it blew up over the Bering Sea, off Russia's Kamchatka Peninsula.

The space rock exploded with 10 times the energy released by the Hiroshima atomic bomb.

Lindley Johnson, planetary defence officer at Nasa, told BBC News a fireball this big is only expected about two or three times every 100 years.
'''

### preprocessing raw text

In [118]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the two NLTK resources used below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list; preprocess() drops every token found in it.
stopWords = stopwords.words('english')

# Single shared lemmatiser instance; preprocess() calls it once per token.
wn_lemmatiser = WordNetLemmatizer()

def clean(text):
  """Lower-case ``text`` and keep only the letters ``a``-``z`` and digit characters.

  Every other character acts as a separator. Runs of separators collapse into a
  single space and leading/trailing whitespace is removed.

  Args:
    text: the raw string to normalise.

  Returns:
    The normalised string, with tokens separated by single spaces.
  """
  kept = []
  for char in text.lower():
    if 'a' <= char <= 'z' or char.isdigit():
      kept.append(char)
    else:
      # Anything that is not a lowercase ASCII letter or a digit becomes a token boundary.
      kept.append(' ')
  return ' '.join(''.join(kept).split())

def preprocess(document):
  """Convert raw text into a list of lemmatised tokens with stop words removed.

  The text is normalised with ``clean``, split on whitespace, filtered against
  the module-level ``stopWords`` list, and each surviving token is lemmatised
  as a verb (``pos='v'``) with ``wn_lemmatiser``.

  Args:
    document: the raw text of one document.

  Returns:
    A list of token strings in their original order.
  """
  # stopWords is a list, so testing every token against it is a linear scan;
  # building a set once per call makes each membership test constant-time.
  stop_set = set(stopWords)
  return [wn_lemmatiser.lemmatize(word, pos='v') for word in clean(document).split() if word not in stop_set]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [146]:
# Show the preprocessed tokens of the sample article joined back into one string.
' '.join(preprocess(example_text))

'huge fireball explode earth atmosphere december accord nasa blast second largest kind 30 years biggest since fireball chelyabinsk russia six years ago go largely unnoticed blow bering sea russia kamchatka peninsula space rock explode 10 time energy release hiroshima atomic bomb lindley johnson planetary defence officer nasa tell bbc news fireball big expect two three time every 100 years'

# LDA


### Pre-trained LDA models

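Load two small pre-trained LDA models that ship with gensim's test data (located via `gensim.test.utils.datapath`). They are convenient for checking that the API works, but, as the outputs below show, their two topics are unrelated to news text, so the scores they give are not meaningful.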

In [0]:
import gensim
from gensim.test.utils import datapath

# datapath() comes from gensim.test.utils and resolves a file name inside
# gensim's bundled test-data directory, so both models are gensim test fixtures.
lda1, lda2 = (
  gensim.models.LdaMulticore.load(datapath(fixture_name))
  for fixture_name in ("lda_3_0_1_model", "ldamodel_python_3_5")
)

In [0]:
def lda_topics(text, model, top_n = 3):
  """Return the ``top_n`` highest-scoring topics of ``model`` for ``text``.

  Args:
    text: raw document text; it is tokenised with ``preprocess`` before scoring.
    model: a trained LDA model providing ``id2word.doc2bow``, ``model[bow]``
      (yielding ``(topic_id, score)`` pairs) and ``show_topic``.
    top_n: maximum number of topics to return.

  Returns:
    A list of ``(topic_id, label, score)`` tuples ordered by descending score,
    where ``label`` is the topic's five top words joined with ``/``.
  """
  bow = model.id2word.doc2bow(preprocess(text))
  scores = dict(model[bow])
  # Sort on (score, topic_id) so ties are broken exactly as before, and slice
  # first so labels are only built for the topics that are actually returned.
  ranked = sorted(scores.items(), key=lambda item: (item[1], item[0]), reverse=True)[:top_n]
  # show_topic() returns (word, weight) pairs directly. Parsing print_topic()'s
  # formatted string and keeping only isalpha() pieces dropped any topic word
  # that contained a digit or other non-letter character.
  return [(topic_id, '/'.join(word for word, _ in model.show_topic(topic_id, topn=5)), score) for topic_id, score in ranked]

try them out

In [143]:
# Top topics for the sample article according to the first pre-trained model.
lda_topics(example_text, lda1)

[(0, 'system/user/eps/time/response', 0.8268186987472599),
 (1, 'graph/trees/minors/survey/interface', 0.17318130125274017)]

In [144]:
# Top topics for the sample article according to the second pre-trained model.
lda_topics(example_text, lda2)

[(1, 'response/user/eps/survey/time', 0.8268134549578768),
 (0, 'graph/trees/minors/system/computer', 0.17318654504212305)]

### Train your own LDA model

Fetch the 20 Newsgroups training set and preprocess it; these documents are the sample corpus from which the topics are learned.

In [0]:
from sklearn.datasets import fetch_20newsgroups

# Training split of 20 Newsgroups, requested with headers, footers and quotes removed.
newsgroups_train = fetch_20newsgroups(
  subset='train',
  shuffle=True,
  remove=('headers', 'footers', 'quotes'),
)

# One token list per newsgroup post.
preprocessed_docs = list(map(preprocess, newsgroups_train.data))

train an LDA model on the sample documents

In [0]:
import gensim

# Map every token in the preprocessed corpus to an integer id.
vocab = gensim.corpora.Dictionary(preprocessed_docs)
# Prune the vocabulary in place: drop tokens found in fewer than 15 documents
# or in more than 10% of them, then keep at most the 100000 most frequent.
vocab.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
bow_corpus = [vocab.doc2bow(doc) for doc in preprocessed_docs]
# random_state seeds the model's random initialisation so topic ids and words
# are comparable between runs; without it every run yields different topics.
# NOTE(review): with workers > 1 small run-to-run variation may remain.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=vocab, passes=10, workers=2, random_state=42)

save the trained LDA model

In [0]:
import os
import tempfile

from gensim.test.utils import datapath

# Save under the system temp directory. datapath("lda") resolves inside
# gensim's installed test-data folder, which is package data rather than a
# scratch location and is not guaranteed to be writable.
temp_file = os.path.join(tempfile.gettempdir(), "lda")
lda_model.save(temp_file)

load in the trained LDA model

In [0]:
# Reload the model from the path it was saved to, checking the save/load round trip.
your_lda = gensim.models.LdaMulticore.load(temp_file)

test out the trained LDA model

In [145]:
# Top topics for the sample article according to the model trained on 20 Newsgroups.
lda_topics(example_text, your_lda)

[(3, 'gun/government/us/really/keep', 0.33644345),
 (17, 'state/program/report/national/research', 0.27499065),
 (2, 'wire/cause/power/back/grind', 0.16544925)]

# ShortText

In [0]:
!pip3 install shorttext
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
from shorttext.utils import load_word2vec_model
wvmodel = load_word2vec_model('GoogleNews-vectors-negative300.bin.gz')

import shorttext
nihtraindata = shorttext.data.nihreports(sample_size=None)
classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel)   
classifier.train(nihtraindata)
classifier.save_compact_model('sumvec_nihdata_model.bin')
!ls

classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, 'sumvec_nihdata_model.bin')

classifier2.score('bioinformatics')

sorted(classifier2.score('cancer immunotherapy').items(), key=lambda item: item[1], reverse=True)[:5]