# Build Data Representations 
### text only, text + hashtags, and hybrid

It is assumed that a master csv file (named "master_individual.csv") exists containing the following information for each individual tweet:
- tweet ID, full text, sentiment score, date, hashtags
- location of origin data (city, state, place type, zip code, metropolitan area)
- data for the zip code from which the tweet originates (average Zillow Home Value Index (ZHVI), number of establishments in educational services, number of establishments in healthcare and social assistance, number of establishments in professional, scientific, and technical services, ground truth vaccine hesitancy, binarized ground truth vaccine hesitancy)

Note: When binarizing the ground truth vaccine hesitancy for each zip code, we used 0.70 as the cut-off (i.e., a continuous ground truth vaccine hesitancy of >= 0.70 corresponds to 1 and all other values correspond to 0). However, the binarized ground truth vaccine hesitancy is ultimately not used in the study.

In [71]:
import functools
import re

import fasttext
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

The full_text attribute contains the entire tweet: text + hashtags.

In [72]:
# Read the master table of individual tweets and report its (rows, columns) size.
master_csv_path = 'master_individual.csv'
master_individual = pd.read_csv(master_csv_path)
print(master_individual.shape)

(29458, 16)


# 1) Count number of tweets and hashtags before initial text processing

### 1.1) Number of tweets in each metropolitan area

In [74]:
# Working frame with only the columns needed for the pre-processing counts.
# `.values` copies the data positionally, so the new frame gets a fresh 0..n-1 index.
num_hashtags_df = pd.DataFrame({
    column: master_individual[column].values
    for column in ('full_text', 'hashtags', 'metropolitan_area')
})

# One re-indexed sub-frame per metropolitan area, keyed by the label used in the
# 'metropolitan_area' column. Built in a single pass instead of nine copy-pasted
# filter / reset_index(inplace=True) pairs, and without mutating a filtered slice.
area_frames = {
    area: num_hashtags_df[num_hashtags_df['metropolitan_area'] == area].reset_index(drop=True)
    for area in ('NewYork', 'LosAngeles', 'Chicago', 'Houston', 'SanDiego',
                 'Philadelphia', 'Dallas', 'Phoenix', 'SanAntonio')
}

# Later cells refer to the per-area frames by these names.
newyork = area_frames['NewYork']
losangeles = area_frames['LosAngeles']
chicago = area_frames['Chicago']
houston = area_frames['Houston']
sandiego = area_frames['SanDiego']
philadelphia = area_frames['Philadelphia']
dallas = area_frames['Dallas']
phoenix = area_frames['Phoenix']
sanantonio = area_frames['SanAntonio']

print('Number of tweets:')
for label, area_df in (
    ('New York:', newyork),
    ('Los Angeles:', losangeles),
    ('Chicago:', chicago),
    ('Houston:', houston),
    ('San Diego:', sandiego),
    ('Philadelphia:', philadelphia),
    ('Dallas:', dallas),
    ('Phoenix:', phoenix),
    ('San Antonio:', sanantonio),
):
    print(label, area_df.shape)

Number of tweets:
New York: (12612, 3)
Los Angeles: (10532, 3)
Chicago: (1544, 3)
Houston: (1529, 3)
San Diego: (1061, 3)
Philadelphia: (817, 3)
Dallas: (664, 3)
Phoenix: (496, 3)
San Antonio: (203, 3)


### 1.2) Number of hashtags in each metropolitan area 

In [75]:
# count hashtags by looking at how many '#' appear in full text
def count_hashtags(df):
    """Return the number of '#' characters in one tweet's raw text.

    Despite its name, ``df`` is a single string (the tweet's full text), not a
    DataFrame. Every '#' is counted wherever it sits inside a
    whitespace-separated chunk, so a hashtag glued to a preceding emoji or
    another hashtag is still counted.

    NOTE: a later cell in this notebook defines another function with the same
    name but a different signature; once that cell has run, this version is
    shadowed.
    """
    return sum(
        1
        for chunk in df.split()
        for symbol in chunk
        if symbol == '#'
    )

In [76]:
# Total '#' occurrences in the raw tweet text, reported per metropolitan area.
for heading, area_df in (
    ('total num hashtags New York:', newyork),
    ('total num hashtags Los Angeles:', losangeles),
    ('total num hashtags Chicago:', chicago),
    ('total num hashtags Houston:', houston),
    ('total num hashtags San Diego:', sandiego),
    ('total num hashtags Philadelphia:', philadelphia),
    ('total num hashtags Dallas:', dallas),
    ('total num hashtags Phoenix:', phoenix),
    ('total num hashtags San Antonio:', sanantonio),
):
    print(heading, area_df['full_text'].apply(count_hashtags).sum())

total num hashtags New York: 41232
total num hashtags Los Angeles: 37030
total num hashtags Chicago: 3857
total num hashtags Houston: 5557
total num hashtags San Diego: 3019
total num hashtags Philadelphia: 2753
total num hashtags Dallas: 2250
total num hashtags Phoenix: 1576
total num hashtags San Antonio: 765


# 2) Initial text processing

Check if any tweets are retweets.

In [77]:
# A tweet is treated as a retweet when its full text begins with the literal 'RT'.
starts_with_rt = master_individual['full_text'].str.startswith('RT')
num_retweets = len(master_individual[starts_with_rt])
print('Number of retweets:', num_retweets)

Number of retweets: 0


Tokenize full text, make lower case, and remove mentions.

In [78]:
# Tokenizer configured to lower-case tokens and strip @handles; repeated
# characters are left as written (reduce_len=False).
tweet_tokenizer = TweetTokenizer(
    strip_handles=True,
    reduce_len=False,
    preserve_case=False,
)
tokenize_tweet = tweet_tokenizer.tokenize
master_individual['processed_full_text'] = master_individual['full_text'].apply(tokenize_tweet)

Remove URLs, stop words, tokens of length <= 1, and any characters other than letters (the `#` symbol is kept so that hashtags can still be identified).

In [79]:
@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words minus the words this notebook keeps.

    'not', 'no', 'nor', 'very' and 'most' are deliberately NOT treated as stop
    words. The result is cached so the corpus is read once rather than once
    per tweet.
    """
    kept_words = {'not', 'no', 'nor', 'very', 'most'}
    return frozenset(stopwords.words('english')) - kept_words


def remove_extra_information(df):
    """Clean one tokenized tweet.

    Parameters
    ----------
    df : iterable of str
        The tokens of a single tweet (despite the name, not a DataFrame).
        Stop-word matching is case-sensitive, so tokens are expected to be
        lower-cased already (the tokenizer in this notebook does that).

    Returns
    -------
    list of str
        The tokens that survive, in their original order, after:
        dropping tokens that start with 'http', dropping stop words, dropping
        tokens of length <= 1, stripping every character other than ASCII
        letters and '#', and then dropping the stripped token if it became a
        stop word or has length <= 1.
    """
    stop_words = _english_stop_words()
    new_tweet = []

    for token in df:
        # Checks on the raw token.
        if token[0:4] == 'http':
            continue
        if token in stop_words:
            continue
        if len(token) <= 1:
            continue

        # Keep only letters and '#'. The result cannot contain whitespace, so it
        # is at most one token and needs no further splitting.
        token = re.sub(r'[^a-zA-Z#]', '', token)

        # Stripping may have reduced the token to a stop word or to <= 1 character.
        if len(token) <= 1 or token in stop_words:
            continue

        new_tweet.append(token)

    return new_tweet

In [80]:
# Replace each tweet's token list with its cleaned version.
tokenized_tweets = master_individual['processed_full_text']
master_individual['processed_full_text'] = tokenized_tweets.apply(remove_extra_information)

# 3) Count number of tweets and hashtags after initial text processing

### 3.1) Number of tweets in each metropolitan area

In [81]:
# Working frame with the raw text, the processed token lists and the area label.
# `.values` copies the data positionally, so the new frame gets a fresh 0..n-1 index.
num_hashtags_df = pd.DataFrame({
    column: master_individual[column].values
    for column in ('full_text', 'processed_full_text', 'metropolitan_area')
})

# One re-indexed sub-frame per metropolitan area, keyed by the label used in the
# 'metropolitan_area' column. Built in a single pass instead of nine copy-pasted
# filter / reset_index(inplace=True) pairs, and without mutating a filtered slice.
area_frames = {
    area: num_hashtags_df[num_hashtags_df['metropolitan_area'] == area].reset_index(drop=True)
    for area in ('NewYork', 'LosAngeles', 'Chicago', 'Houston', 'SanDiego',
                 'Philadelphia', 'Dallas', 'Phoenix', 'SanAntonio')
}

# Later cells refer to the per-area frames by these names.
newyork = area_frames['NewYork']
losangeles = area_frames['LosAngeles']
chicago = area_frames['Chicago']
houston = area_frames['Houston']
sandiego = area_frames['SanDiego']
philadelphia = area_frames['Philadelphia']
dallas = area_frames['Dallas']
phoenix = area_frames['Phoenix']
sanantonio = area_frames['SanAntonio']

print('Number of tweets:')
for label, area_df in (
    ('New York:', newyork),
    ('Los Angeles:', losangeles),
    ('Chicago:', chicago),
    ('Houston:', houston),
    ('San Diego:', sandiego),
    ('Philadelphia:', philadelphia),
    ('Dallas:', dallas),
    ('Phoenix:', phoenix),
    ('San Antonio:', sanantonio),
):
    print(label, area_df.shape)

Number of tweets:
New York: (12612, 3)
Los Angeles: (10532, 3)
Chicago: (1544, 3)
Houston: (1529, 3)
San Diego: (1061, 3)
Philadelphia: (817, 3)
Dallas: (664, 3)
Phoenix: (496, 3)
San Antonio: (203, 3)


### 3.2) Number of hashtags in each metropolitan area

In [82]:
# function to count number of hashtags
def get_num_hashtags(df):
    """Count hashtag tokens across every tweet of a DataFrame.

    Reads the 'processed_full_text' column (one token list per row) and feeds
    each list to the two-argument ``count_hashtags``, sharing a single set
    across rows so distinct hashtags are only counted once.

    Returns a tuple ``(total number of hashtag tokens, number of distinct
    hashtag tokens)``.
    """
    total_hashtags = 0
    seen_hashtags = set()
    for tokens in df['processed_full_text']:
        tweet_count, seen_hashtags = count_hashtags(tokens, seen_hashtags)
        total_hashtags += tweet_count
    return total_hashtags, len(seen_hashtags)

In [83]:
# function to count number of hashtags and add unique hashtags to set
def count_hashtags(full_text, unique_hashtag_set):
    """Count the hashtag tokens of one tweet and record them in a shared set.

    Parameters
    ----------
    full_text : iterable of str
        The tokens of a single processed tweet.
    unique_hashtag_set : set of str
        Accumulator of hashtags seen so far. It is updated in place.

    Returns
    -------
    tuple (int, set)
        The number of tokens starting with '#' in this tweet (duplicates
        included) and the same set object that was passed in.

    Empty tokens are ignored rather than raising IndexError.

    NOTE: this definition replaces the earlier one-argument ``count_hashtags``
    defined in this notebook (which counts '#' characters in a raw string).
    """
    # startswith() is safe on '' whereas token[0] is not.
    hashtags = [token for token in full_text if token.startswith('#')]
    unique_hashtag_set.update(hashtags)
    return len(hashtags), unique_hashtag_set

In [84]:
# (total, distinct) hashtag counts after text processing, per metropolitan area.
for heading, area_df in (
    ('(total num hashtags, num unique hashtags) New York:', newyork),
    ('(total num hashtags, num unique hashtags) Los Angeles:', losangeles),
    ('(total num hashtags, num unique hashtags) Chicago:', chicago),
    ('(total num hashtags, num unique hashtags) Houston:', houston),
    ('(total num hashtags, num unique hashtags) San Diego:', sandiego),
    ('(total num hashtags, num unique hashtags) Philadelphia:', philadelphia),
    ('(total num hashtags, num unique hashtags) Dallas:', dallas),
    ('(total num hashtags, num unique hashtags) Phoenix:', phoenix),
    ('(total num hashtags, num unique hashtags) San Antonio:', sanantonio),
):
    print(heading, get_num_hashtags(area_df))

(total num hashtags, num unique hashtags) New York: (40419, 10764)
(total num hashtags, num unique hashtags) Los Angeles: (36507, 12422)
(total num hashtags, num unique hashtags) Chicago: (3792, 1891)
(total num hashtags, num unique hashtags) Houston: (5505, 2526)
(total num hashtags, num unique hashtags) San Diego: (2980, 1769)
(total num hashtags, num unique hashtags) Philadelphia: (2727, 1349)
(total num hashtags, num unique hashtags) Dallas: (2231, 1211)
(total num hashtags, num unique hashtags) Phoenix: (1563, 1003)
(total num hashtags, num unique hashtags) San Antonio: (756, 340)


# 4) Build data representations

### 4.1) Text only

In [85]:
def remove_hashtags(df):
    """Return the tokens of one tweet with every hashtag token dropped.

    ``df`` is an iterable of string tokens (not a DataFrame). A token counts
    as a hashtag when it starts with '#'. Order is preserved. Empty tokens are
    kept as-is instead of raising IndexError.
    """
    # startswith() is safe on '' whereas token[0] is not.
    return [token for token in df if not token.startswith('#')]

In [86]:
def count_words(df):
    """Return the number of items in ``df`` (used here on a tweet's token list)."""
    num_tokens = len(df)
    return num_tokens

In [87]:
def lemmatize_text(df):
    """Lemmatize each token of one tweet with NLTK's WordNetLemmatizer.

    ``df`` is an iterable of string tokens; a new list of the lemmatized
    tokens is returned in the same order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in df]

In [88]:
def create_full_string(df):
    """Join a tweet's tokens into one string separated by single spaces."""
    separator = ' '
    return separator.join(df)

In [89]:
pd.options.display.max_colwidth = 200
master_individual_copy = master_individual.copy()

# Start from the three metadata columns, then add the text-only representation.
text_only_df = master_individual_copy[['metropolitan_area', 'zip_code', 'vac_hes_bin']].copy()

# Token lists with hashtag tokens removed; the word count is taken from these.
tokens_without_hashtags = master_individual_copy['processed_full_text'].apply(remove_hashtags)

text_only_df['processed_tweet'] = (
    tokens_without_hashtags
    .apply(lemmatize_text)
    .apply(create_full_string)
)
text_only_df['num_words'] = tokens_without_hashtags.apply(count_words)

print(text_only_df.shape)
print(text_only_df['zip_code'].nunique())

(29458, 5)
493


In [90]:
# Average number of words per tweet (text-only representation), by binarized
# vaccine hesitancy and by metropolitan area.
words_by_vac_hes = text_only_df.groupby('vac_hes_bin')['num_words']
group_by_vac_hes = (
    words_by_vac_hes
    .apply(lambda counts: np.mean(counts))
    .reset_index(name='avg_len_tweet')
)

words_by_met_area = text_only_df.groupby('metropolitan_area')['num_words']
group_by_met_area = (
    words_by_met_area
    .apply(lambda counts: np.mean(counts))
    .reset_index(name='avg_len_tweet')
)

### 4.2) Text + hashtags

In [91]:
def remove_pound_symbol(df):
    """Return one tweet's tokens with the leading '#' stripped from hashtags.

    ``df`` is an iterable of string tokens (not a DataFrame). Only the first
    character is removed, and only when it is '#'; all other tokens are passed
    through unchanged. Order is preserved. Empty tokens are kept as-is instead
    of raising IndexError.
    """
    # startswith() is safe on '' whereas token[0] is not.
    return [token[1:] if token.startswith('#') else token for token in df]

In [92]:
pd.options.display.max_colwidth = 200
master_individual_copy = master_individual.copy()

# Start from the three metadata columns, then add the text + hashtags representation.
text_and_hashtags_df = master_individual_copy[['metropolitan_area', 'zip_code', 'vac_hes_bin']].copy()

# Hashtags are kept as ordinary words: strip the '#' and lemmatize everything.
lemmatized_tokens = (
    master_individual_copy['processed_full_text']
    .apply(remove_pound_symbol)
    .apply(lemmatize_text)
)

text_and_hashtags_df['processed_tweet'] = lemmatized_tokens.apply(create_full_string)
text_and_hashtags_df['num_words'] = lemmatized_tokens.apply(count_words)

print(text_and_hashtags_df.shape)
print(text_and_hashtags_df['zip_code'].nunique())

(29458, 5)
493


In [93]:
# Average number of words per tweet (text + hashtags representation), by
# binarized vaccine hesitancy and by metropolitan area.
words_by_vac_hes = text_and_hashtags_df.groupby('vac_hes_bin')['num_words']
group_by_vac_hes = (
    words_by_vac_hes
    .apply(lambda counts: np.mean(counts))
    .reset_index(name='avg_len_tweet')
)

words_by_met_area = text_and_hashtags_df.groupby('metropolitan_area')['num_words']
group_by_met_area = (
    words_by_met_area
    .apply(lambda counts: np.mean(counts))
    .reset_index(name='avg_len_tweet')
)

### 4.3) Hybrid

In [94]:
def count_hashtags_and_words(df):
    """Return the number of whitespace-separated tokens in the string ``df``."""
    return len(df.split())

In [95]:
def lemmatize_hashtags_and_words(df):
    """Lemmatize every whitespace-separated token of a string.

    ``df`` is a single string (not a DataFrame). It is split on whitespace,
    each piece is passed through NLTK's WordNetLemmatizer, and the results are
    joined back together with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in df.split())

In [96]:
master_individual_copy = master_individual.copy()

# Hybrid representation: when the 'hashtags' field is the literal string '[]'
# (no hashtags recorded), use all processed tokens; otherwise use only the
# hashtag tokens, with their leading '#' removed.
tweet_list = []
for hashtags, full_text_list in zip(master_individual_copy['hashtags'],
                                    master_individual_copy['processed_full_text']):
    if hashtags == '[]':
        # use full text
        tweet_list.append(' '.join(full_text_list))
    else:
        # use hashtags; startswith() is safe on '' whereas token[0] is not
        hashtags_list = [token[1:] for token in full_text_list if token.startswith('#')]
        tweet_list.append(' '.join(hashtags_list))

hybrid_df = pd.DataFrame(tweet_list, columns=['processed_tweet'])
hybrid_df['processed_tweet'] = hybrid_df['processed_tweet'].apply(lemmatize_hashtags_and_words)
hybrid_df['num_tokens'] = hybrid_df['processed_tweet'].apply(count_hashtags_and_words)

# These assignments align on the index: hybrid_df has a fresh 0..n-1 index, so
# master_individual_copy must be indexed 0..n-1 as well for the rows to line up.
hybrid_df['metropolitan_area'] = master_individual_copy['metropolitan_area']
hybrid_df['zip_code'] = master_individual_copy['zip_code']
hybrid_df['vac_hes_bin'] = master_individual_copy['vac_hes_bin']

print(hybrid_df.shape)
print(hybrid_df['zip_code'].nunique())

(29458, 5)
493


In [97]:
# Average number of tokens per tweet (hybrid representation), by binarized
# vaccine hesitancy and by metropolitan area.
tokens_by_vac_hes = hybrid_df.groupby('vac_hes_bin')['num_tokens']
group_by_vac_hes = (
    tokens_by_vac_hes
    .apply(lambda counts: np.mean(counts))
    .reset_index(name='avg_len_tweet')
)

tokens_by_met_area = hybrid_df.groupby('metropolitan_area')['num_tokens']
group_by_met_area = (
    tokens_by_met_area
    .apply(lambda counts: np.mean(counts))
    .reset_index(name='avg_len_tweet')
)

# 5) Embed tweet text

### 5.1) Load pre-trained model from fastText

In [98]:
def load_pretrained_model(pretrained_model_path='/Volumes/More Memory/Covid-19 Project Data/wiki/wiki.en.bin'):
    """Load a fastText model from ``pretrained_model_path`` and return it.

    The default is an absolute path on the original author's machine; pass the
    location of the model file explicitly when running elsewhere.
    """
    loaded_model = fasttext.load_model(pretrained_model_path)
    return loaded_model

In [99]:
# Module-level fastText model; vectorize_tweets() reads this global directly.
model = load_pretrained_model()

In [100]:
# function to vectorize tweets using fastText pre-trained model
def vectorize_tweets(tweets_list):
    """Embed each tweet string with the module-level fastText ``model``.

    ``tweets_list`` is an iterable of strings. Each one is passed to
    ``model.get_sentence_vector`` and the resulting vectors are stacked, in
    input order, into a single NumPy array (one row per tweet).
    """
    sentence_vectors = [model.get_sentence_vector(text) for text in tweets_list]
    return np.asarray(sentence_vectors)

In [101]:
# function to create headers of dimensions in dataframe
def create_dimensions_array(num_dimensions=300):
    """Return the column headers for the embedding dimensions.

    Parameters
    ----------
    num_dimensions : int, optional
        Number of headers to generate. Defaults to 300, the value this
        notebook previously hard-coded.

    Returns
    -------
    list of str
        ``['dim_0', 'dim_1', ..., 'dim_<num_dimensions - 1>']``.
    """
    return ['dim_' + str(dim) for dim in range(num_dimensions)]

In [102]:
# function to add additional information (tweet-level and zip code-level) to dataframe 
def add_additional_information(df):
    """Attach tweet-level and zip-code-level columns from ``master_individual``.

    The columns are copied by position (via ``.values``), not by index, so
    ``df`` must have the same number of rows, in the same order, as the
    module-level ``master_individual`` frame. ``df`` is modified in place and
    also returned.
    """
    carried_columns = (
        'id',
        'sentiment',
        'zip_code',
        'metropolitan_area',
        'avg_zhvi',
        'num_est_educ_serv',
        'num_est_healthcare_social_assist',
        'num_est_prof_sci_tech_serv',
        'vac_hes',
        'vac_hes_bin',
    )
    for column in carried_columns:
        df[column] = master_individual[column].values
    return df

### 5.2) Embed text only representation

In [103]:
# Embed the text-only strings. `tweets_list` and `vectors_array` are reused by
# the later embedding sections, so the cell that builds the vector frame must be
# run right after this one.
tweets_list = text_only_df['processed_tweet'].values
vectors_array = vectorize_tweets(tweets_list)
# Displayed as the cell output: (number of tweets, embedding size).
vectors_array.shape

(29458, 300)

In [104]:
# Wrap the current `vectors_array` in a frame with dim_* headers, then attach
# the tweet-level and zip-code-level columns.
dimensions = create_dimensions_array()
text_only_vectors = add_additional_information(
    pd.DataFrame(vectors_array, columns=dimensions)
)
print(text_only_vectors.shape)
print(text_only_vectors['zip_code'].nunique())

(29458, 310)
493


### 5.3) Embed text + hashtags representation

In [105]:
# Embed the text + hashtags strings. This overwrites `tweets_list` and
# `vectors_array` from the previous section, so the cell that builds the vector
# frame must be run right after this one.
tweets_list = text_and_hashtags_df['processed_tweet'].values
vectors_array = vectorize_tweets(tweets_list)
# Displayed as the cell output: (number of tweets, embedding size).
vectors_array.shape

(29458, 300)

In [106]:
# Wrap the current `vectors_array` in a frame with dim_* headers, then attach
# the tweet-level and zip-code-level columns.
dimensions = create_dimensions_array()
text_and_hashtags_vectors = add_additional_information(
    pd.DataFrame(vectors_array, columns=dimensions)
)
print(text_and_hashtags_vectors.shape)
print(text_and_hashtags_vectors['zip_code'].nunique())

(29458, 310)
493


### 5.4) Embed hybrid representation

In [107]:
# Embed the hybrid strings. This overwrites `tweets_list` and `vectors_array`
# from the previous section, so the cell that builds the vector frame must be
# run right after this one.
tweets_list = hybrid_df['processed_tweet'].values
vectors_array = vectorize_tweets(tweets_list)
# Displayed as the cell output: (number of tweets, embedding size).
vectors_array.shape

(29458, 300)

In [108]:
# Wrap the current `vectors_array` in a frame with dim_* headers, then attach
# the tweet-level and zip-code-level columns.
dimensions = create_dimensions_array()
hybrid_vectors = add_additional_information(
    pd.DataFrame(vectors_array, columns=dimensions)
)
print(hybrid_vectors.shape)
print(hybrid_vectors['zip_code'].nunique())

(29458, 310)
493


# 6) Save dataframes as CSV files

In [109]:
# Write each representation's vector frame to its own CSV (row index omitted).
for csv_name, vectors_df in (
    ('text_only_tweets.csv', text_only_vectors),
    ('text_and_hashtags_tweets.csv', text_and_hashtags_vectors),
    ('hybrid_tweets.csv', hybrid_vectors),
):
    vectors_df.to_csv(csv_name, index=False)