In [73]:
import pandas as pd
import numpy as np
import os
import matplotlib as plt
import re
import warnings
from wordcloud import WordCloud
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

from pprint import pprint

#SKYLEARN
from sklearn.datasets import fetch_20newsgroups 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#NTLK
import nltk
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer, WhitespaceTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn 
from nltk.corpus import stopwords

#CORPORA
import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaMulticore
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet

#PYLDAVIS
import pyLDAvis
import pyLDAvis.gensim



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dominic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/dominic/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [74]:

def preprocess( data ):
    newdata = []
    for idx, x in enumerate(data): #x -- document in data 
        newdoc = ""
        stop_words = stopwords.words('english')
        words = set(nltk.corpus.words.words())

        for token in sent_tokenize(x): #token -- sentence in doc
            seperator = " " 
            token = re.sub('[^A-Za-z0-9]+', ' ', token) #remove special characters
            token = re.sub(r'\d+', '', token) #remove numbers
            token = re.sub(r'\b\w{1,2}\b', '', token) #remove words with <= 2 characters
            #token = " ".join(w for w in nltk.wordpunct_tokenize(token) \
            #         if w.lower() in words or not w.isalpha())
            whitespace_token = WhitespaceTokenizer().tokenize( token )
            wo_stopwords_token = [x for x in whitespace_token
                                  if not x in stop_words]
            newdoc += seperator.join( (wo_stopwords_token) ).lower()
            newdoc += " " 
        #create word tokens for each document
        word_tokens = RegexpTokenizer('\s+', gaps=True).tokenize(newdoc)
        
        data[idx] = word_tokens 
        newdata.append( data[idx] )
    return newdata


In [75]:
def tf_idf( data ):
    
    dictionary = Dictionary(data)
    pprint("Dictionary Loaded")
    bow_corpus = [dictionary.doc2bow(doc) for doc in data]
    
    for idx, doc in enumerate(data):
        data[idx] = " ".join(doc)
    
  
    tfidf_vectorizer = TfidfVectorizer(max_features=4000, analyzer='word', use_idf=True)
    tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(data)
    #pprint(tfidf_vectoriz)
    #first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
    
    #df_idf = pd.DataFrame(tfidf_vectorizer.idf_, index=tfidf_vectorizer.get_feature_names(),columns=["idf_weights"])
    #df_idf.sort_values(by=['idf_weights'], ascending=False)
    #pprint(df_idf)
    
    names = tfidf_vectorizer.get_feature_names()
    tf_weights = tfidf_vectorizer.idf_
    tfidf = TfidfModel(bow_corpus)
    # Get TF-IDF weights
    weights = tfidf[bow_corpus[0]]
    
    weights = dict([(dictionary[pair[0]], pair[1]) for pair in weights])
    #pprint(list(names[:10]))
    #weights = dict(zip(names, tf_weights))
   # pprint(list(weights[:10]))
    
    wc = WordCloud(
    background_color="white",
    max_words=2000,
    width = 1024,
    height = 720,
    )

    # Generate the cloud
    wc.generate_from_frequencies(weights)

    # Save the could to a file
    wc.to_file("word_cloud.png")

In [76]:
def bag_of_words(data):
    dictionary = Dictionary(data)
    pprint("Dictionary Loaded")
    bow_corpus = [dictionary.doc2bow(doc) for doc in data]
    
    pprint("----------------NEW----------------")
    pprint(len(dictionary))
    
    vocab = list(dictionary.values()) #list of terms in the dictionary
    
    vocab_tf = [dict(i) for i in bow_corpus]
    vocab_tf = list(pd.DataFrame(vocab_tf).sum(axis=0)) #list of term frequencies
    
    bow = pd.DataFrame(vocab_tf, index=vocab, columns=["term_freq"])
    #pprint(bow)

In [92]:
def bow_lda_model(data):
    l_dictionary = Dictionary(data)
    pprint("Dictionary Loaded")
    bow_corpus = [l_dictionary.doc2bow(doc) for doc in data]
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=l_dictionary,
                                           num_topics=2, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)
    
    pprint(lda_model.print_topics())
    
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary=l_dictionary)
    # Save model to disk.
    #temp_file = datapath("model")
    #lda_model.save(temp_file)
    pyLDAvis.save_html(vis, 'lda.html')
    pyLDAvis.view(vis)


In [94]:
def tfidf_lda_model(data):
    tf_dictionary = Dictionary(data)
    pprint("Dictionary Loaded")
    tfidfcorpus = [tf_dictionary.doc2bow(doc) for doc in data]
    
    tfidf_lda_model = gensim.models.LdaMulticore(tfidfcorpus,
                            num_topics = 4, id2word=tf_dictionary)
    
    pprint(tfidf_lda_model.print_topics())
    
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(tfidf_lda_model, tfidfcorpus, dictionary=tf_dictionary)
    pyLDAvis.save_html(vis, 'tfidf_lda.html')
    pyLDAvis.display(vis)

In [82]:
print("Loading 20 newsgroups dataset for categories:")
#pprint(list(newsgroups_train.target_names))

newsgroups_train = fetch_20newsgroups(subset='train', categories=['sci.space'],
                                      remove=('headers', 'footers', 'quotes'))

#pprint(newsgroups_train.data[102])
#pprint(" ------ CLEANED DATA --------- ")

newdata = preprocess( newsgroups_train.data );
#pprint( newdata[102] )
#pprint(list(newdata[:10]))


Loading 20 newsgroups dataset for categories:


In [95]:
#tf_idf( newdata )
#bag_of_words( newdata )
#bow_lda_model( newdata )
tfidf_lda_model( newdata )

'Dictionary Loaded'
[(0,
  '0.011*"the" + 0.011*"space" + 0.004*"would" + 0.004*"one" + 0.004*"nasa" + '
  '0.003*"orbit" + 0.003*"shuttle" + 0.003*"also" + 0.003*"like" + '
  '0.003*"moon"'),
 (1,
  '0.019*"space" + 0.008*"the" + 0.004*"would" + 0.003*"launch" + 0.003*"like" '
  '+ 0.003*"nasa" + 0.003*"one" + 0.003*"time" + 0.003*"year" + 0.003*"this"'),
 (2,
  '0.009*"space" + 0.009*"the" + 0.007*"nasa" + 0.005*"launch" + 0.004*"one" + '
  '0.004*"would" + 0.004*"also" + 0.003*"data" + 0.003*"new" + 0.003*"like"'),
 (3,
  '0.015*"the" + 0.010*"space" + 0.006*"would" + 0.005*"nasa" + 0.004*"earth" '
  '+ 0.003*"this" + 0.003*"lunar" + 0.003*"moon" + 0.003*"orbit" + '
  '0.003*"first"')]
