### This notebook assigns a topic number to each English-language tweet based on our pre-trained LDA model.
- You can find the training code in <i>train_lda.py</i>

In [1]:
import pandas as pd
import gensim

from gensim.test.utils import datapath

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Single shared WordNet lemmatizer instance; pre_process_text() calls it for every kept token.
lemmatizer = WordNetLemmatizer()

# Load the DataFrame:
main_df = pd.read_csv('./20190406_AM_Data.csv')

# Filter by english language (any language tag starting with 'en').
# `na=False` makes rows with a missing `tweetLang` count as non-English; without it
# the mask would contain NaN for those rows and could not be used for boolean indexing.
df = main_df.loc[main_df['tweetLang'].str.startswith('en', na=False)]

# Get tweets (same order as the rows of `df`):
tweets = df['tweetText'].tolist()

# Stopwords: NLTK's lists for four languages plus a handful of custom tokens.
languages = ['english', 'french', 'german', 'spanish']
custom_stopwords = ["amp", "n't", "rt", "http", "https", "migration", "migrant"]

# Collect everything in a set: words shared between languages are stored once,
# and the membership test in pre_process_text() stays cheap.
STOPWORDS = set(custom_stopwords)
for lang in languages:
    STOPWORDS.update(stopwords.words(lang))

print("4 languages + custom stopwords, we have a total of {} stopwords.".format(len(STOPWORDS)))


def pre_process_text(tweet):
    """
    Turn one raw tweet into a list of normalised tokens.

    Steps, in order: decode to unicode, word-tokenize, lowercase, drop tokens that
    do not start with a letter or that appear in STOPWORDS, then lemmatize what is left.
    Note that the stopword check runs on the lowercased token *before* lemmatization.

    :param tweet: the raw tweet text (a byte string, as passed to `unicode`).
    :return: list of lemmatized tokens, in their original order.
    """
    # `unicode` is the Python 2 builtin; errors='ignore' drops bytes that cannot be decoded.
    tokens = word_tokenize(unicode(tweet, errors='ignore'))

    lowered_tokens = (token.lower() for token in tokens)

    return [
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        # Keep only tokens that begin with a letter and are not stopwords.
        if token[0].isalpha() and token not in STOPWORDS
    ]



# Pre-process every English tweet; processed_tweets[i] is the token list for tweets[i].
processed_tweets = [pre_process_text(tweet) for tweet in tweets]

# Use str.format (as for the stopword count above) so the message is printed as plain
# text instead of as a tuple, which is what a two-argument print(...) yields under Python 2.
print("Number of tweets: {}".format(len(processed_tweets)))

4 languages + custom stopwords, we have a total of 847 stopwords.
('Number of tweets: ', 111785)




In [2]:
# The tokenised tweets are the texts of our corpus.
texts = processed_tweets

# Build the token <-> integer-id mapping from those texts.
# NOTE(review): this dictionary is rebuilt from the data here instead of being taken
# from the trained model; the ids only match the model's if train_lda.py built its
# dictionary from the same texts in the same way -- TODO confirm.
id2word = gensim.corpora.Dictionary(texts)

# Bag-of-words form of every text: a list of (token_id, count) pairs per tweet.
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document:
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]]


In [3]:
# Resolve the location of the saved model. `datapath` comes from gensim.test.utils;
# it is assumed to point at the place where train_lda.py stored the model -- TODO confirm.
model_path = datapath("5_topic_LDA_only_english_extra_stopwords")

# Load our trained LDA model:
lda_model = gensim.models.LdaModel.load(model_path)

# Display the highest-weighted words of each topic:
lda_model.print_topics()

[(0,
  u'0.016*"group" + 0.016*"get" + 0.015*"watch" + 0.014*"american" + 0.012*"new" + 0.011*"refugee" + 0.011*"crisis" + 0.011*"people" + 0.011*"right" + 0.010*"guess"'),
 (1,
  u'0.029*"caravan" + 0.025*"know" + 0.025*"trump" + 0.016*"family" + 0.015*"u" + 0.015*"prevent" + 0.014*"implement" + 0.012*"vote" + 0.012*"need" + 0.011*"today"'),
 (2,
  u'0.040*"europe" + 0.036*"pact" + 0.032*"hungary" + 0.032*"voice" + 0.031*"brussels" + 0.030*"v_of_europe" + 0.016*"illegal" + 0.016*"life" + 0.015*"police" + 0.013*"street"'),
 (3,
  u'0.060*"child" + 0.019*"gang" + 0.016*"free" + 0.015*"part" + 0.014*"say" + 0.014*"government" + 0.013*"u.s." + 0.013*"youth" + 0.013*"worker" + 0.013*"salvadoran"'),
 (4,
  u'0.056*"border" + 0.042*"law" + 0.031*"cross" + 0.028*"arizona" + 0.028*"via" + 0.028*"bus" + 0.027*"section" + 0.027*"unsecured" + 0.027*"unloads" + 0.021*"stop"')]

#### The model represents each tweet as a distribution over topics. We assign each tweet the topic with the highest proportion:

In [4]:
# Maps tweet text -> number of its dominant topic.
# Tweets with identical text share one entry (the last occurrence wins).
map_tweetText_topicNumber = {}

# tweets and corpus are parallel lists: corpus[i] is the bag-of-words of tweets[i].
for tweet_text, bow in zip(tweets, corpus):
    # List of (topic number, proportion) tuples for this tweet.
    topic_proportions = lda_model.get_document_topics(bow)

    # Take the tuple with the largest proportion (the first one on ties) and keep its topic number.
    dominant = max(topic_proportions, key=lambda pair: pair[1])
    map_tweetText_topicNumber[tweet_text] = dominant[0]

In [5]:
# Look the topic up for the English-language rows only (`df`). Assigning the resulting
# Series back aligns on the row index, so every other row of main_df gets NaN.
# Mapping over all of main_df would also label non-English rows whose text happens
# to be identical to an English tweet.
main_df['topicNumber'] = df['tweetText'].map(map_tweetText_topicNumber)

In [6]:
# Write the annotated data to a new CSV file.
# index=False (the documented boolean form) keeps the row index out of the file.
main_df.to_csv('./20190412_AM_Data.csv', index=False)

## fin.