In [None]:
# Liveness check: confirms the kernel is executing cells.
print("kernel is live")

In [2]:
import pandas as pd
import numpy as np
import os
import json

In [3]:
# Allow up to 400 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 400)

In [4]:
# Folder holding the tweet data.
data_path = '/data_volume/antivax/'
tweets = pd.read_parquet(data_path + 'all_geotagged_tweets.parquet')  # nonpublic data

# Keep rows whose retweeted_tweet_id equals -1 (presumably the marker for
# "not a retweet" -- TODO confirm), keep one row per distinct text, and
# renumber the index from 0.
not_retweet = tweets['retweeted_tweet_id'] == -1
df = (
    tweets[not_retweet]
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [5]:
# Print the frame summary: index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6357813 entries, 0 to 17464208
Data columns (total 8 columns):
tweet_id              int64
text                  object
tweet_time            datetime64[ns]
user_id               int64
lang                  object
retweeted_tweet_id    int64
FIPS                  int64
is_antivax            int64
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 436.6+ MB


In [6]:
# Draw 50,000 tweets at random and keep only their text column.
# random_state pins the draw so the same sample can be regenerated on a re-run
# (the unseeded original produced a different sample every time).
df2 = (
    df.sample(50000, random_state=0)
    .drop_duplicates(subset='text')
    .reset_index()[['text']]
)
len(df2)

50000

In [7]:
# Keep an independent copy of the sample under df3; df2 is reassigned in a later cell.
df3 = df2.copy(deep=True)

In [33]:
# select 5000 tweets to manually label (row positions 45000 through 49999 of df3)
df2 = df3.iloc[45000:50000]

In [34]:
# dump to jsonl file: one JSON object per row, newline-separated, no trailing newline.
# Joining the serialized rows once replaces the repeated string concatenation
# (and the trim of the final newline) and yields the same file content.
json_str = '\n'.join(json.dumps(t) for t in df2.to_dict('records'))

# The file is only written, never read back, so plain 'w' is sufficient.
with open('/data_volume/antivax/random_sample_5000j.jsonl', 'w') as f:
    f.write(json_str)

In [None]:
# Use a keyword search to find tweets that are more likely to be antivax related,
# to get a more balanced training dataset.
# These are NOT used for test data. Test data is a pure random sample of
# all_geotagged_tweets.parquet, also labeled using Prodigy.
antivax_keywords = [
    'experiment',
    'palsy',
    'plandemic',
    'survival rate',
    'unprov',
    'against humanity',
    'not take',
    'msm',
    'hoax',
    'rush',
    'poison',
    'injur',
    'untest',
    'chemical',
    'not a vaccine',
    'fertil',
    'miscar',
    'autism',
    'not get',
    "health defense",
    'nuremberg',
    'guinea',
    'depopulat',
    'censor',
    'bioweapon',
    '"vaccine"',
    "'vaccine'",
    'gene',
    'vaer',
]

def _mentions_keyword(text):
    """Return True when the lower-cased text contains any entry of antivax_keywords as a substring."""
    lowered = text.lower()
    return any(k in lowered for k in antivax_keywords)

# NOTE(review): this reads whatever df2 currently holds and overwrites df3, which an
# earlier cell uses as the source of df2 -- the result depends on cell execution order.
keyword_hits = df2['text'].apply(_mentions_keyword)
df3 = df2[keyword_hits].reset_index(drop=True)
print(len(df3))

# dump to jsonl file, rows in shuffled order.
# random_state fixes the shuffle so the written file is identical on a re-run.
shuffled = df3.sample(frac=1, replace=False, random_state=0)

# One JSON object per row, newline-separated, no trailing newline; a single join
# replaces the repeated string concatenation and yields the same layout.
json_str = '\n'.join(json.dumps(t) for t in shuffled.to_dict('records'))

# The file is only written, never read back, so plain 'w' is sufficient.
with open('/data_volume/antivax/contain_antivax_keyword.jsonl', 'w') as f:
    f.write(json_str)

df3

In [None]:
#Conduct manual labeling using Prodigy

# label using:
# python3 -m prodigy textcat.manual antivax /data_volume/antivax/tweets_to_label.jsonl --label Antivax

#then:
# python3 -m prodigy db-out antivax >> /data_volume/home/jmbollen/antivax_paper/prodigy_labels.jsonl


In [None]:
#Clean up the Prodigy output to a usable csv

def _read_jsonl(path):
    """Parse a JSON-lines file into a list of records.

    The file is streamed line by line. Blank or whitespace-only lines are
    skipped instead of being passed to json.loads, which would raise on them.
    The file is decoded as UTF-8, the standard encoding for JSON text.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# load jsonl
records = _read_jsonl('/data_volume/home/jmbollen/antivax_paper/prodigy_labels.jsonl')
# records = records[:2400]
records += _read_jsonl('/data_volume/antivax/random_train_labeled.jsonl')

#convert to df
labels = pd.DataFrame(records)[['text', 'answer']]
labels = labels[labels['answer'] != 'ignore'].reset_index(drop=True)  #drop ignored tweets
# 'accept' -> True, 'reject' -> False; any other answer becomes NaN and is removed by dropna below.
# assign() builds a new frame rather than writing a column into the filtered one.
labels = labels.assign(answer=labels['answer'].map({'accept': True, 'reject': False}))
labels = labels.rename(columns={'answer': 'is_antivax'})
labels = labels.drop_duplicates().dropna().reset_index(drop=True)
labels

In [None]:
# Fraction of the labeled tweets that are marked antivax.
n_antivax = labels['is_antivax'].sum()
n_antivax / len(labels)

In [None]:
# Write the cleaned labels to csv (without the index column).
# The output folder is created first so the write cannot fail on a missing directory.
os.makedirs('./labeled_data', exist_ok=True)
labels.to_csv('./labeled_data/antivax_labels.csv',index=False)

# rebalance labeled data by word frequency

Some training data was selected using keywords associated with antivax content, which biased the training data toward those keywords. This rebalancing attempts to correct for that bias (and does so quite successfully, judging by the improvement in classifier results on the unbiased test data).


In [None]:
# Try to correct for the oversampling of certain words introduced by the keyword search above.

# Liveness check before the heavier work in this cell.
print("live")

import pandas as pd
import itertools
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords

# Labeled tweets (the csv written by the label clean-up cell) and the full tweet
# table from which the word-frequency baseline is drawn.
df = pd.read_csv('./labeled_data/antivax_labels.csv')
df2 = pd.read_parquet('/data_volume/antivax/'+'all_geotagged_tweets.parquet')

# Baseline = 50,000 randomly drawn tweets. random_state pins the draw so the
# baseline (and everything computed from it) is the same on every run.
df2 = df2.sample(50000, random_state=0)

# Characters stripped from both ends of each token. Raw string: the backslash before
# ']' is a literal character, not an (invalid) escape sequence; the value is unchanged.
punct = r"""!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~"""
def flatten_list_of_lists(l):
    """Return one flat list holding the items of every iterable in `l`, in order."""
    return [item for sub in l for item in sub]

def find_oversampled_words(df, baseline_words, threshold=0.02, min_count=5,
                           ignored_words=("i'm", "doesn't", "san", "antonio")):
    """Return the words that are over-represented in `df` relative to a baseline.

    Side effect: (re)writes the column ``df['words']`` with the token list of
    each row's ``text``. Code that calls this function reads that column
    afterwards, so the mutation is part of the contract.

    Tokenization: the text is lower-cased and split on whitespace; a token is
    kept when its raw form is not in `stopwords` and is longer than 2
    characters, and kept tokens are stripped of `punct` characters at both ends.
    NOTE(review): the stop-word and length checks run before stripping, so a
    token such as "the," survives as "the" -- confirm this is intended.

    Score per word: (train share - baseline share) / sqrt(baseline share),
    where shares are relative frequencies computed over the words present in
    both the training counts and `baseline_words`.

    Parameters
    ----------
    df : DataFrame with a ``text`` column of strings.
    baseline_words : Series of word counts indexed by word.
    threshold : words whose score is strictly greater than this are reported.
    min_count : a word must occur strictly more than this many times in `df`.
    ignored_words : words that are never reported.

    Returns
    -------
    set of str
    """
    df['words'] = df['text'].apply(
        lambda x: [i.strip(punct) for i in x.lower().split()
                   if (i not in stopwords and len(i) > 2)])
    train_counts = pd.Series(flatten_list_of_lists(list(df['words']))).value_counts()
    train_counts = train_counts[train_counts > min_count]

    # Only words present on both sides can be compared.
    shared = list(set(train_counts.index).intersection(set(baseline_words.index)))
    if not shared:
        # Nothing to compare: report no oversampled words.
        return set()

    # Relative frequencies restricted to the shared vocabulary.
    baseline_share = baseline_words[shared]
    baseline_share = baseline_share / baseline_share.sum()
    train_share = train_counts[shared]
    train_share = train_share / train_share.sum()

    # The subtraction aligns on the word index, so no sorting is needed.
    dif = (train_share - baseline_share) / baseline_share ** (1 / 2)
    oversampled_words = set(dif[dif > threshold].index).difference(set(ignored_words))

    return oversampled_words

def _baseline_tokens(text):
    """Tokenize one baseline tweet.

    Lower-cases and splits on whitespace; keeps tokens whose raw form is not a
    stop word and is longer than 2 characters; strips `punct` from both ends of
    each kept token.
    """
    return [tok.strip(punct) for tok in text.lower().split()
            if (tok not in stopwords and len(tok) > 2)]

# Word counts over the baseline sample; only words seen more than 5 times are kept.
df2['words'] = df2['text'].apply(_baseline_tokens)
baseline_tokens = flatten_list_of_lists(list(df2['words']))
baseline_words = pd.Series(baseline_tokens).value_counts()
baseline_words = baseline_words[baseline_words > 5]

# Initial set of over-represented words in the labeled data.
oversampled_words = find_oversampled_words(df, baseline_words)

# Remove labeled tweets one at a time until no word is flagged as oversampled.
# Each pass: mark the rows containing a flagged word, drop one of them at random,
# then recompute the flagged set on the remaining rows (find_oversampled_words
# also refreshes df['words'], which the next pass reads).
# The seed changes on every pass, so a different pseudo-random row is picked each
# time, yet the whole sequence of removals is repeatable for a given input.
REBALANCE_SEED = 0
n_dropped = 0
while len(oversampled_words)>0:
    df['contains_oversampled'] = df['words'].apply(lambda words: any(word in oversampled_words for word in words))
    row_to_drop = df[df['contains_oversampled']].sample(1, random_state=REBALANCE_SEED + n_dropped).index
    df = df.drop(row_to_drop)
    n_dropped += 1
    oversampled_words = find_oversampled_words(df, baseline_words)

# Persist the rebalanced labels. The frame is written as-is, so it still carries the
# helper column 'words' (and 'contains_oversampled' if the loop above ran at least once).
df.to_parquet('./labeled_data/antivax_labels_rebalanced.parquet')


