In [1]:
import pickle
import gensim
#import pyLDAvis
#import pyLDAvis.gensim
import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import warnings
from pprint import pprint

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package stopwords to /home/amy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Note this notebook runs an HDP model to find the best number of topics. It is not strictly part of the flow and can be considered optional. It also uses a smaller sample of data to run faster.

In [2]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#### Remove new lines

In [3]:
def strip_newline(series):
    return [review.replace('\n','') for review in series]

In [4]:
#tweet_df = pd.read_csv("current-tweets_cleaned.csv")
tweet_df = pd.read_csv("crawled_83k_cleaned.csv")
# the actual preprocessed text in the tweets
tweets = tweet_df["text"]

#### Tokenize and remove punctuation

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [6]:
words_tr = list(sent_to_words(tweet_df.text))
#words_te = list(sent_to_words(tweet_df.text))

In [10]:
words_tr[21][:]

[]

In [11]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [12]:
words_tr = remove_stopwords(words_tr)

In [13]:
def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    trigram = gensim.models.Phrases(bigram[words], min_count = tri_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod

In [14]:
bigram_tr, trigram_tr = bigrams(words_tr)

#### Check some items

In [15]:
print(trigram_tr[bigram_tr[words_tr[7000]]][:200])
print(bigram_tr)

['wait', 'go', 'around', 'corner', 'apartment', 'building', 'liz', 'miss', 'seeing', 'old', 'man', 'pee', 'sidewalk']
<gensim.models.phrases.Phraser object at 0x7fbc385d15f8>


#### Remove stopwords and lemmatize

In [16]:
#nlp = spacy.load('en', disable=['parser', 'ner'])
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#spacy.load('en_core_web_sm')

In [17]:
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

#### Run test through trained model - will later run test data through trained model here

In [18]:
trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]

In [19]:
lemma_lg = lemmatization(trigrams_tr)

In [20]:
with open('tweets_lg.pkl', 'wb') as f:
    pickle.dump(lemma_lg, f)

#### Note difference to un-lemmatized un-stop-worded above

In [21]:
lemma_lg[8811][:20]

[]

## LDA

#### Dictionary and Corpus creation for LDA

In [22]:
id2word_lg = gensim.corpora.Dictionary(words_tr)
id2word_lg.filter_extremes(no_below=10, no_above=0.35)
id2word_lg.compactify()
id2word_lg.save('train_dict_lg')
corpus_lg = [id2word_lg.doc2bow(text) for text in words_tr]

In [23]:
with open('83k_tweets_lg.pkl', 'wb') as f:
    pickle.dump(corpus_lg, f)

In [24]:
corpus_lg[21][:2]

[]

In [25]:
[(id2word_lg[id], freq) for id, freq in corpus_lg[21]]

[]

## HDP Model - auto-dinals the best number of topics

In [26]:
from gensim.models import HdpModel
hdp = HdpModel(corpus_lg, id2word_lg, chunksize=5000)

In [27]:
len(hdp.print_topics())

20

In [28]:
hdp.print_topics(num_topics=20)

[(0,
  '0.001*sword + 0.001*nudes + 0.001*smooth + 0.001*verse + 0.001*made + 0.001*mots + 0.001*kat + 0.001*playing + 0.001*cheese + 0.001*screenshots'),
 (1,
  '0.002*tons + 0.002*pay + 0.001*blame + 0.001*conspiracy + 0.001*game + 0.001*dumbass + 0.001*trophy + 0.001*fear + 0.001*facts + 0.001*sue'),
 (2,
  '0.002*sword + 0.001*promise + 0.001*negative + 0.001*thank + 0.001*govt + 0.001*scratch + 0.001*past + 0.001*eyes + 0.001*protecting + 0.001*omg'),
 (3,
  '0.002*trees + 0.002*mobile + 0.001*circle + 0.001*goin + 0.001*challenge + 0.001*cross + 0.001*mercy + 0.001*history + 0.001*bright + 0.001*daughter'),
 (4,
  '0.002*partner + 0.002*articles + 0.002*habit + 0.002*reminding + 0.002*masks + 0.001*completely + 0.001*friend + 0.001*make + 0.001*cream + 0.001*ateez'),
 (5,
  '0.002*look + 0.002*mama + 0.001*receiving + 0.001*sweet + 0.001*ff + 0.001*shook + 0.001*match + 0.001*netflix + 0.001*billie + 0.001*monday'),
 (6,
  '0.002*male + 0.001*wakes + 0.001*says + 0.001*dislike + 