# Load, filter COVID Twitter data
By keywords and media attachments

In [6]:
import re

terms_path = '../resources/antisemitic_terms.txt'
with open(terms_path) as f:
    search_terms = f.read().splitlines()
pats = [re.compile(r'\b{}\b'.format(re.escape(term.lower()))) for term in search_terms]

import pdb

def select_tweet(tweet):
    """ See if a tweet is worth keeping (matches enough criteria) """
    select = False
    
    # Basic cleaning
    if len(tweet) == 1 and 'limit' in tweet:
        return select
    
    # Language is English
    if tweet['lang'] != 'en':
        return select
    
    # Has media
    if 'media' not in tweet['entities']:
        return select
    
    # Contains possibly antisemitic terms
    if 'extended_tweet' in tweet:
        text = tweet['extended_tweet']['full_text'].lower()
    else:
        text = tweet['text'].lower()
    for p in pats:
        m = re.search(p, text)
        # if any([re.search(p, tweet['extended_tweet']['full_text'].lower()) for p in pats]):
        if m is not None:
            select = m.group()
            # tqdm.write('one selected')
            
    return select

# Load COVID Twitter data (Carley lab)
import os
import gzip
import json
from tqdm.notebook import tqdm

basepath = '/storage3/coronavirus/'

# Older data
dirname = 'json_keyword_stream'
# Newer data
dirname = 'json_keyword_stream_mike'
selected = []
n_selected = 0
for fname in sorted([fname for fname in os.listdir(os.path.join(basepath, dirname)) if fname.endswith('.json.gz')])[:1]:
    print(fname)
    fpath = os.path.join(basepath, dirname, fname)
    with gzip.open(fpath, 'rb') as f:
        for i, line in tqdm(enumerate(f), total=9560357):
            if len(line) == 1:
                continue
        # for i, line in tqdm(enumerate(f), total=974483, bar_format='selected: {postfix} | Elapsed: {elapsed} | {rate_fmt}', postfix=n_selected):
            tweet = json.loads(line)
            match = select_tweet(tweet)
            if match:
                tweet['search_match'] = match
                selected.append(tweet)
                n_selected += 1
            # if i > 100:
            #     break
n_selected

covid_20200602.json.gz


  0%|          | 0/974483 [00:00<?, ?it/s]

31

In [8]:
pats

[re.compile(r'\b\(\(\(\b', re.UNICODE),
 re.compile(r'\b\)\)\)\b', re.UNICODE),
 re.compile(r'\bjewish\b', re.UNICODE),
 re.compile(r'\bjew\-ish\b', re.UNICODE),
 re.compile(r'\bglobalist\b', re.UNICODE),
 re.compile(r'\bjewish\ lobby\b', re.UNICODE),
 re.compile(r'\bthe\ lobby\b', re.UNICODE),
 re.compile(r'\bgeorge\ soros\b', re.UNICODE),
 re.compile(r'\bvolodymyr\ zelenskyy\b', re.UNICODE),
 re.compile(r'\bvladimir\ zelensky\b', re.UNICODE),
 re.compile(r'\bjeffrey\ epstein\b', re.UNICODE),
 re.compile(r'\bjared\ kushner\b', re.UNICODE),
 re.compile(r'\brothschild\b', re.UNICODE),
 re.compile(r'\binternationalist\b', re.UNICODE),
 re.compile(r'\bbanker\b', re.UNICODE),
 re.compile(r'\bzio\b', re.UNICODE),
 re.compile(r'\bzionist\b', re.UNICODE),
 re.compile(r'\bzionazi\b', re.UNICODE),
 re.compile(r'\bzog\b', re.UNICODE),
 re.compile(r'\bcultural\ marxism\b', re.UNICODE),
 re.compile(r'\bmarxist\b', re.UNICODE),
 re.compile(r'\bjudeo\-bolshevism\b', re.UNICODE),
 re.compile(r'\bjude

In [11]:
pat = re.compile(r'\b88\b', re.UNICODE)
print(re.search(pat, 'https://t.co/FC88v4dWnc'))

None


In [7]:
n_selected

31

In [4]:
from collections import Counter

Counter([select['search_match'] for select in selected]).most_common()

[('israel', 26)]

In [8]:
# Save out selected
outpath = os.path.join('../output', f'{fname.split(".")[0]}.jsonl')
with open(outpath, 'w') as f:
    f.write('\n'.join([json.dumps(tweet) for tweet in selected]))

In [13]:
no_media = [select for select in selected if 'media' not in select['entities']]
len(no_media)

648

In [15]:
no_media[0]['entities']

{'hashtags': [],
 'urls': [{'url': 'https://t.co/7oecmaPjY2',
   'expanded_url': 'https://twitter.com/i/web/status/1222538649788649472',
   'display_url': 'twitter.com/i/web/status/1…',
   'indices': [117, 140]}],
 'user_mentions': [{'screen_name': 'ScottAdamsSays',
   'name': 'Scott Adams',
   'id': 2853461537,
   'id_str': '2853461537',
   'indices': [6, 21]}],
 'symbols': []}

In [23]:
test_tweet = {'lang': 'en', 'text': 'goyim'}
select_tweet(test_tweet)

one selected


True

## Process selected output tweets to CSV

In [13]:
# Load filtered tweets
import os
import json
from tqdm.notebook import tqdm

dirpath = '../output/'
dfs = []
for fname in tqdm(sorted(os.listdir(dirpath))):
    fpath = os.path.join(dirpath, fname)
    with open(fpath) as f:
        dfs.append(pd.json_normalize([json.loads(line) for line in f.read().splitlines()]))
out_df = pd.concat(dfs)
out_df.shape

# TODO: Don't concatenate, just save out CSVs for every file

  0%|          | 0/47 [00:00<?, ?it/s]

(4347, 349)

In [14]:
outpath = os.path.join(dirpath, 'filtered_covid_tweets.csv')
out_df.to_csv(outpath)