# NLP Tutorial

### Elena Kochkina

NESTA HackSTIR

22.10.2019

# Part III. Topic modelling

## Imports

In [None]:
import nltk
# Fetch the 'punkt' NLTK resource. It is not referenced in the cells visible
# here -- presumably needed by another part of the tutorial (TODO confirm).
nltk.download('punkt')
import re
# Fetch the Reuters corpus data that the `reuters` reader imported below reads.
nltk.download('reuters')
from nltk.corpus import reuters
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
# Silence every warning for the rest of the session. This keeps the notebook
# output short, but it also hides deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

## Documents

In [None]:
# Toy corpus: six short sentences used to demonstrate the pipeline
# before moving on to a real dataset.
documents_example = [
    'I like cats and dogs',
    'Cats are furry animals',
    'Dogs are good friends',
    'Apples and carrots are healthy foods',
    'Humans should maintain a healthy diet',
    'If your diet consists of burgers it is not very healthy',
]

In [None]:
# Build a bag-of-words count matrix for the toy corpus, dropping English
# stop words.
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(documents_example)

# `vocabulary[i]` is the word counted in column i of `bow_matrix`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # Convert the returned array to a list to keep the type the old API gave.
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

## Latent Dirichlet Allocation

[scikit-learn `LatentDirichletAllocation` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)

In [None]:
# Fit a two-topic LDA model on the toy bag-of-words matrix.
# No random_state is passed, so the learned topics may differ between runs.
no_topics = 2
lda = LatentDirichletAllocation(n_components=no_topics)
lda.fit(bow_matrix)

In [None]:
# Print the highest-weighted words of each topic, strongest first.
no_top_words = 3
for topic_idx, word_weights in enumerate(lda.components_):
    # argsort() is ascending, so reverse it and keep the leading indices.
    top_indices = word_weights.argsort()[::-1][:no_top_words]
    top_words = [vocabulary[i] for i in top_indices]
    print("Topic ", topic_idx)
    print(" ".join(top_words))

In [None]:
# Print the complete word/weight table of every topic.
for topic_idx, topic in enumerate(lda.components_):
    print("Topic ", topic_idx)
    # Walk the vocabulary and the topic's weights in lockstep.
    for word, weight in zip(vocabulary, topic):
        print(word, weight)

In [None]:
# Apply the fitted model to the toy corpus it was trained on. Left as a bare
# expression so the notebook displays the result (per the scikit-learn docs,
# one row of topic proportions per document).
lda.transform(bow_matrix)

## Get Reuters dataset

In [None]:
# Collect raw Reuters documents for a handful of categories, split into
# train/test by the prefix of each file id. Every document is stored together
# with the category it was listed under.
# NOTE: a file listed under more than one of these categories is appended once
# per category, so the same text can appear several times.
documents_train, labels_train = [], []
documents_test, labels_test = [], []

# To use every Reuters category instead: categories = reuters.categories()
categories = ['wheat', 'gold', 'ship', 'coffee', 'grain']

for category in categories:
    category_fileids = reuters.fileids(category)
    print(category)
    print(len(category_fileids))
    for fileid in category_fileids:
        # File ids starting with 'training' go to the train split; everything
        # else is treated as test data.
        if fileid.startswith('training'):
            docs, labels = documents_train, labels_train
        else:
            docs, labels = documents_test, labels_test
        docs.append(reuters.raw(fileid))
        labels.append(category)
  

In [None]:
# Matches runs of characters that are neither lowercase letters, digits nor
# whitespace. Whitespace (newlines and tabs included) is deliberately kept:
# deleting it would fuse words that are separated only by a line break.
_NON_ALNUM = re.compile(r'[^a-z0-9\s]+')


def _preprocess(document):
    """Lowercase *document* and delete punctuation/symbols, keeping whitespace."""
    return _NON_ALNUM.sub('', document.lower())


# Clean both splits with the same routine so they are tokenised identically.
documents_train_preprocessed = [_preprocess(d) for d in documents_train]
documents_test_preprocessed = [_preprocess(d) for d in documents_test]

# The vocabulary is learned from the training split only; the test split is
# mapped onto that fixed vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train_bow = vectorizer.fit_transform(documents_train_preprocessed)
X_test_bow = vectorizer.transform(documents_test_preprocessed)

# `vocabulary[i]` is the word counted in column i of the matrices above.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # Convert the returned array to a list to keep the type the old API gave.
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

## LDA

In [None]:
# Fit a five-topic LDA model on the Reuters training bag-of-words matrix.
# No random_state is passed, so the learned topics may differ between runs.
no_topics = 5
lda = LatentDirichletAllocation(n_components=no_topics)
lda.fit(X_train_bow)


In [None]:
# Print the highest-weighted words of each Reuters topic, strongest first.
no_top_words = 5
for topic_idx, word_weights in enumerate(lda.components_):
    # argsort() is ascending, so reverse it and keep the leading indices.
    top_indices = word_weights.argsort()[::-1][:no_top_words]
    top_words = [vocabulary[i] for i in top_indices]
    print("Topic ", topic_idx)
    print(" ".join(top_words))

In [None]:
# Topic mixture inferred for the first training document. Left as a bare
# expression so the notebook displays the result.
lda.transform(X_train_bow[0])

In [None]:
# Topic mixture inferred for the first test document, which was not part of
# the data the model was fitted on. Left as a bare expression so the notebook
# displays the result.
lda.transform(X_test_bow[0])