In [1]:
import os
import argparse
import re

import logging
#import yaml
import time
import random
import pandas as pd
import numpy as np
import string 
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.wrappers import DtmModel
from gensim import matutils
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import logging.config

# configure logging
logger = logging.getLogger(__name__)

In [2]:
# set working directory
os.chdir("/home/mrh1996/LDA_COVID_Tweets")

In [3]:
# function to load data
def load_tweet_data(data_path = 'data/constructs.csv'):
    """Load data from S3 Bucket. 

    Args:
        s3path_str: str - name and path of the S3 bucket.

    Returns:
        tweet_data: dataframe - dataframe of all the tweets.
    """    
    
    logger.debug("Load data from path.")
    
    tweet_data = pd.read_csv(data_path)
    
    if len(tweet_data) > 0:
        logger.info("Dataset was loaded with %s rows", len(tweet_data))
    else: 
        logger.warning("Dataset is empty or did not load correctly!")
    
    return tweet_data

# function to remove duplicates
def remove_duplicates(df):
    """Remove any rows with duplicate text.
    
    Args: 
        df: dataframe of tweet data.
        
    Returns: 
        tweet_data: dataframe - dataframe without duplicate text entries.
    """
    
    logger.debug("Drop duplicate rows.")
    
    tweet_data = df.drop_duplicates(subset=['read_text_clean2'], keep='first')
    
    logger.info("%s rows were dropped", len(df) - len(tweet_data))
    
    return(tweet_data)

# function to format dates
def format_dates(df):
    """Format the 'create_at' data column to contain the month, day, and year only.
    
    Args: 
        df: dataframe - dataframe of the tweet_data.
        
    Returns: 
        df: dataframe - tweet data with a revised date column.
    """
    
    logger.debug("Format date column.")
    
    df['date'] = df['created_at'].str.split(' ').str[1:3]
    df['date'] = df['date'].str.join(' ')
    df['date'] = df['date'].astype(str)
    
    df['date'] = pd.to_datetime(df['date'] + ' 2020', format='%b %d %Y', errors='coerce')
    
    logger.info("New column created.")
    
    return df

# function to clean text 
def clean_text(tweets, stop_words_list, exclude, lemma):
    """Clean text data by removing punctuation and implementing lemmatization.
    
    Args:
        tweets: dataframe - dataframe subset by time.
        stop_words_list: list - list of words to remove from the analysis.
        exclude: set - set of non-alphanumeric characters to remove.
        lemma: nltk method to lemmatize text.
    
    Returns:
        doc_clean: dataframe - dataframe with processed text.
    """

    logger.debug("Begin text processing.")
    
    logger.info("Tokenize and Remove stop words")
    stop_free = " ".join([i for i in word_tokenize(tweets) if i not in stop_words_list])
    
    logger.info("Remove punctuation.")
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    
    logger.info("Lemmatize words.")
    doc_clean = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    
    return doc_clean

# function to create document term matrix and dictionary corpus 
def create_dictionary(df):
    """Create dictionary and a matrix of the terms per document.
    
    Args: 
        df: dataframe - processed dataframe.
    
    Returns:
        dictionary: corpora.dictionary - dictionary mapping each term to it's integer id.
        doc_term_matrix: list - bag of words matrix with frequency of each term mapped to dictionary id.
    
    """
    
    logger.debug("Create dictionary.")
    
    dictionary = corpora.Dictionary(df)
    
    logger.info("Dictionary created.")

    logger.debug("Create document term matrix.")
    
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in df]
    
    logger.info("Document term matrix created.")
    
    return dictionary, doc_term_matrix

In [None]:
phenotypes = phecode_vector_clean.groupby('patient_num')['phenotype'].apply(', '.join).reset_index()
phenotypes

In [4]:
# download nltk sets
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrh1996/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/mrh1996/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/mrh1996/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Prepare methods for text processing
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# create list of stop words
stop_words = set(stopwords.words('english'))
alphabet_remove = list(string.ascii_lowercase)
number_remove = list(range(0, 9999))
number_remove = map(str, number_remove) 
number_remove = list(number_remove)

stop_words_list = stop_words.union(number_remove, alphabet_remove)

In [6]:
start = time.time()

# Process data
tweet_data = pd.read_csv('data/constructs.csv')
tweet_data_train, test_data = train_test_split(tweet_data, test_size=0.20, random_state=82121)

end = time.time()
print((end - start)/60)

print('Train data:', len(tweet_data))
print('Test data:', len(test_data))

0.5472170790036519
Train data: 5585780
Test data: 1117156


In [7]:
print('Train data:', len(tweet_data_train))
print('Test data:', len(tweet_data_train))

Train data: 4468624
Test data: 4468624


## **Subset data temporality to speed up preliminary analysis**

In [8]:
tweet_data_train = tweet_data#.head(1000000)

In [9]:
tweet_data_train = remove_duplicates(tweet_data_train)
tweet_data_formatted = format_dates(tweet_data_train)

print(len(tweet_data_formatted))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


4309366


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
tweet_data_formatted.head(50)

Unnamed: 0,2,predicted,created_at,read_user_id,read_tweet_id,user_location,coordinates,place,read_text_clean2,Perceived_susceptibility,Perceived_severity,Perceived_benefits,Perceived_barriers,date
0,23,1,Fri Mar 27 18:03:15 +0000 2020,443692189,1243599480177471489,"Hyderabad, India",,,"yet no , federal lockdown \? \? make america g...",1,1,0,0,2020-03-27
1,25,1,Fri May 01 02:04:45 +0000 2020,529055116,1256041840807206914,United States,,,cases of coronavirus has been climbing the las...,1,0,0,0,2020-05-01
2,28,1,Wed Apr 15 19:33:55 +0000 2020,139283160,1250507666956443651,,,,tennessee reports 256 new cases and 11 new dea...,1,1,0,0,2020-04-15
3,50,1,Tue Jun 16 01:25:11 +0000 2020,999766907468308481,1272701725036679168,,,,well let s let oklahoma decide they don t have...,1,0,0,0,2020-06-16
4,67,1,Fri May 22 09:34:30 +0000 2020,4835434534,1263765169055858688,"Abuja, Nigeria",,,please don t be that dismissive of facts coron...,1,0,0,0,2020-05-22
5,102,1,Mon Jun 08 21:01:38 +0000 2020,48769276,1270098685628682242,,,,nyt breaking news new daily coronavirus cases ...,1,1,0,0,2020-06-08
6,107,1,Tue May 19 21:04:44 +0000 2020,1240761596894416900,1262851706037194754,,,,"ohio usa northamerica cases 28 , 956 \( 1 \) d...",1,1,0,0,2020-05-19
7,117,1,Thu Apr 23 09:17:13 +0000 2020,417553124,1253251569253888000,West Coast,,,roberts failed to understand there are far mor...,1,1,0,0,2020-04-23
8,140,1,Sat May 30 20:50:36 +0000 2020,88661178,1266834417600933895,Nigeria,,"{'id': '011a942e0a0e8fb2', 'url': 'https://api...",america coming from a lockdown to utter chaos ...,1,1,0,0,2020-05-30
9,177,1,Sun Apr 26 03:25:02 +0000 2020,85035396,1254250103113691145,🅱🅷🅰🆁🅰🆃,,,"total number of covid19 cases rise to 26 , 496...",1,0,0,0,2020-04-26


In [14]:
start = time.time()

doc_clean = [clean_text(tweets, stop_words_list, exclude, lemma).split() for tweets in tweet_data_formatted['read_text_clean2']]

end = time.time()
print((end - start)/60)

7.675488789876302e-05


In [16]:
tweet_data_formatted

Unnamed: 0,2,predicted,created_at,read_user_id,read_tweet_id,user_location,coordinates,place,read_text_clean2,Perceived_susceptibility,Perceived_severity,Perceived_benefits,Perceived_barriers,date
0,23,1,Fri Mar 27 18:03:15 +0000 2020,443692189,1243599480177471489,"Hyderabad, India",,,"yet no , federal lockdown \? \? make america g...",1,1,0,0,2020-03-27
1,25,1,Fri May 01 02:04:45 +0000 2020,529055116,1256041840807206914,United States,,,cases of coronavirus has been climbing the las...,1,0,0,0,2020-05-01
2,28,1,Wed Apr 15 19:33:55 +0000 2020,139283160,1250507666956443651,,,,tennessee reports 256 new cases and 11 new dea...,1,1,0,0,2020-04-15
3,50,1,Tue Jun 16 01:25:11 +0000 2020,999766907468308481,1272701725036679168,,,,well let s let oklahoma decide they don t have...,1,0,0,0,2020-06-16
4,67,1,Fri May 22 09:34:30 +0000 2020,4835434534,1263765169055858688,"Abuja, Nigeria",,,please don t be that dismissive of facts coron...,1,0,0,0,2020-05-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5585773,30611160,1,Mon Feb 03 22:17:07 +0000 2020,25539652,1224456809420709890,,,,china's hubei province reports 64 coronavirus ...,0,1,0,0,2020-02-03
5585774,30611162,1,Sun May 03 06:24:24 +0000 2020,1158447564036988928,1256831959902433282,"Dontmesswith, Texas",,,oh yee of such great intellect does this seem ...,1,1,0,0,2020-05-03
5585776,30611186,1,Fri Apr 24 18:27:26 +0000 2020,456541283,1253752426714234880,Great USA.,,,if you died in the hospital from say a heart a...,0,1,0,0,2020-04-24
5585777,30611187,1,Tue May 26 18:24:37 +0000 2020,402409431,1265348129055838208,United States,,,its wrong and risky for a president as a leade...,1,1,0,0,2020-05-26


In [28]:
doc_clean

[['yet', 'federal', 'lockdown', 'make', 'america', 'great', 'death'],
 ['case',
  'coronavirus',
  'climbing',
  'last',
  'three',
  'day',
  'across',
  'country',
  'ready',
  'reopen',
  'anything']]

In [13]:
dictionary, doc_term_matrix = create_dictionary(doc_clean)

**```time_seq``` should split the data by time (days, months, years)**

For this analysis, we will split by month

In [14]:
tweet_data_formatted = tweet_data_formatted.sort_values('date')
tweet_data_formatted.head()

Unnamed: 0,2,predicted,created_at,read_user_id,read_tweet_id,user_location,coordinates,place,read_text_clean2,Perceived_susceptibility,Perceived_severity,Perceived_benefits,Perceived_barriers,date
5350479,26701110,1,Mon Jan 06 14:54:46 +0000 2020,382391949,1214198629553950721,,,,"tldr not sars , possibly new coronavirus diffi...",0,0,0,0,2020-01-06
3465576,26465365,1,Fri Jan 10 13:25:20 +0000 2020,733491445798162432,1215625674376937473,"Boston, MA",,,cdc health advisory about the chinese coronavi...,0,1,0,0,2020-01-10
2881234,16746947,1,Sat Jan 11 08:09:14 +0000 2020,4134073822,1215908513064607745,,,,china reports 1st death from 'new type of coro...,0,1,0,0,2020-01-11
360846,5987839,1,Sat Jan 11 14:00:11 +0000 2020,100986964,1215996832515530752,USA,,,china reports first death from new coronavirus...,0,1,0,0,2020-01-11
2206380,5541744,1,Mon Jan 13 17:02:28 +0000 2020,1270238612,1216767481383137280,"Washington, DC",,,good news on wuhan coronavirus rpts outbreak h...,1,0,0,0,2020-01-13


In [29]:
# sort by day
#time_seq = tweet_data_formatted[['read_text_clean2', 'date']].groupby(['date'], as_index=False).agg(['count']).reset_index()

# sort by month
time_seq = tweet_data_formatted[['read_text_clean2', 'date']].groupby(pd.Grouper(key='date', freq='1M')).agg('count') # groupby each 1 month
time_seq = time_seq.reset_index()
time_seq = time_seq['read_text_clean2']

In [30]:
time_seq

0      46843
1     200078
2     679424
3    1231986
4    1359403
5     791632
Name: read_text_clean2, dtype: int64

In [16]:
def validate(topic_term, top_k):
    topic_term = np.exp(topic_term)
    topic_term = topic_term / topic_term.sum()
    topic_term = topic_term * top_k
    return topic_term

def get_topics(topic_terms, topic_number):
    topic_terms = topic_terms[topic_number]
    bestn = matutils.argsort(topic_terms, 20, reverse=True)
    beststr = [dictionary[id_] for id_ in bestn]
    return beststr

# next is the vocabulary, which we already have
vocab = []
for i in range(0, len(dictionary)):
    vocab.append(dictionary[i])

# we now need term-frequency and doc_lengths

def term_frequency(doc_term_matrix, dictionary):
    term_frequency = [0] * len(dictionary)
    doc_lengths = []
    for doc in doc_term_matrix:
        doc_lengths.append(len(doc))
        for pair in doc:
            term_frequency[pair[0]] += pair[1]
    return term_frequency, doc_lengths

In [17]:
!chmod 755 -R '/home/mrh1996/LDA_COVID_Tweets/dtm-linux64'

In [18]:
start = time.time()

cov_model = DtmModel('/home/mrh1996/LDA_COVID_Tweets/dtm-linux64', doc_term_matrix, time_seq, num_topics=4,
                             id2word=dictionary, initialize_lda=True, rng_seed = 82121)

end = time.time()
print((end - start)/60)

554.4876468539238


In [19]:
results = []

for i in range(0, (len(time_seq)-1)):
    coherence_topics = cov_model.dtm_coherence(time=i)
    cm_wrapper_cv = CoherenceModel(topics=coherence_topics, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    score = cm_wrapper_cv.get_coherence()
    print(score)
    tup = 11, i, score
    results.append(tup)

lda_results = pd.DataFrame(results, columns=['topic', 'time', 'score'])



0.36806897761694946
0.36806897761694946
0.36806897761694946
0.3728567665167569
0.37074662496564886


In [21]:
import pickle

cov_model.save(fname_or_handle = '4_topics_dups_removed') #separately=None, sep_limit=10485760, ignore=frozenset([]), pickle_protocol=2)

In [None]:
with open("doc_term_matrix_500000.txt", "wb") as fp:
  pickle.dump(doc_term_matrix, fp)