Also see extract_identities.py

# Check output from extracting NetMapper identities from tweet texts

## Check output of merging tweets with bios

In [8]:
import pandas as pd

test_fpath = '../output/tweets_identities/covid_20220618.jsonl'
test = pd.read_json(test_fpath, lines=True)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71017 entries, 0 to 71016
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   id_str            71017 non-null  int64              
 1   created_at        71017 non-null  datetime64[ns, UTC]
 2   user.id_str       71017 non-null  int64              
 3   user.name         71017 non-null  object             
 4   user.description  71017 non-null  object             
 5   text              71017 non-null  object             
 6   bio               71017 non-null  object             
 7   identities        71017 non-null  object             
 8   identity_spans    71017 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(2), object(6)
memory usage: 4.9+ MB


In [9]:
test.text.count()

71017

In [11]:
char_counts = test.text.str.len().value_counts()
char_counts[:20]

140    29900
139     4910
144     2631
143     1349
33       795
113      540
111      525
116      455
37       446
64       428
23       422
136      418
137      409
100      377
138      346
125      316
129      296
135      287
122      261
75       251
Name: text, dtype: int64

## Check for duplicates in tweets_bios_identities

In [6]:
import os
import pandas as pd

tweet_output_dirpath = '../output/tweets_bios_identities/'
bio_output_dirpath = '../output/tweets_identities/'

for tweet_fpath in ['../output/tweets_bios_identities/virus_2020_1_29.jsonl']:
    fname = os.path.basename(tweet_fpath)

    # Find matching original document
    matching_bio_fpath = os.path.join(bio_output_dirpath, fname)
    
    # Load bio and tweet-identified files
    # tweets_output = pd.read_json(tweet_fpath, lines=True)
    
    bio_output = pd.read_json(matching_bio_fpath, lines=True)
    
bio_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232560 entries, 0 to 232559
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   id_str            232560 non-null  int64              
 1   created_at        232560 non-null  datetime64[ns, UTC]
 2   user.id_str       232560 non-null  int64              
 3   user.name         232560 non-null  object             
 4   user.description  232560 non-null  object             
 5   bio               232560 non-null  object             
 6   identities        232560 non-null  object             
 7   identity_spans    232560 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(2), object(5)
memory usage: 14.2+ MB


In [7]:
bio_output['id_str'].duplicated(keep='first').sum()

80

## Look for discrepancies between tweets_bios_identities and tweets_identities

In [15]:
# Load output
import os
from glob import glob
import pandas as pd
from tqdm.auto import tqdm
import pdb

diff_lines = []

tweet_output_dirpath = '../output/tweets_bios_identities/'
bio_output_dirpath = '../output/tweets_identities/'
tweet_fpaths = sorted(glob(os.path.join(tweet_output_dirpath, '*')))

for tweet_fpath in tqdm(tweet_fpaths):
# for tweet_fpath in ['../output/tweets_bios_identities/virus_2020_1_29.jsonl']:
    fname = os.path.basename(tweet_fpath)

    # Find matching original document
    matching_bio_fpath = os.path.join(bio_output_dirpath, fname)
    
    # Load bio and tweet-identified files
    tweets_output = pd.read_json(tweet_fpath, lines=True)
    bio_output = pd.read_json(matching_bio_fpath, lines=True)
    
    if len(tweets_output) != len(bio_output):
        print(f'{fname} Mismatch: {len(tweets_output)} tweets (new), {len(bio_output)} tweets with bios (old)')
        diff_lines.append(fname)
        
print(len(diff_lines))
print(len(tweet_fpaths))

  0%|          | 0/785 [00:00<?, ?it/s]

covid_20200602.jsonl Mismatch: 369604 tweets (new), 924634 tweets with bios (old)



KeyboardInterrupt



In [12]:
bio_output.columns

Index(['id_str', 'created_at', 'user.id_str', 'user.name', 'user.description',
       'bio', 'identities', 'identity_spans'],
      dtype='object')

In [13]:
len(bio_output)

232560

In [16]:
tweet_id_str in bio_output.id_str

False

In [18]:
bio_output.id_str

0         1222538561456689152
1         1222539193664069632
2         1222539388124639232
3         1222544175645761536
4         1222545010543013888
                 ...         
232555    1222670733072973824
232556    1222670734134108160
232557    1222670734138466304
232558    1222670734201425920
232559    1222670736403222528
Name: id_str, Length: 232560, dtype: int64

In [22]:
tweets_output.id_str

0         1222544175645761536
1         1222651791839506432
2         1222538562521968640
3         1222538562647793664
4         1222538563213983744
                 ...         
109725    1222670733072973824
109726    1222670734134108160
109727    1222670734138466304
109728    1222670734201425920
109729    1222670736403222528
Name: id_str, Length: 109730, dtype: int64

In [19]:
missing = set(bio_output.id_str) - set(tweets_output.id_str)
len(missing)

122807

In [24]:
test = str(list(missing)[0])
test

'1222570743139045376'

In [15]:
# Look for tweets in dump file
import gzip
import json
import pdb
from tqdm.auto import tqdm

dump_fpath = '/storage3/coronavirus/json_keyword_stream/virus_2020_1_29.json.gz'
tweet_id_str = '1222584347917791232'

In [26]:
ids = []
with gzip.open(dump_fpath, 'rb') as f:
    for line in tqdm(f):
        if len(line) == 1: continue
        tweet = json.loads(line)
        if not 'user' in tweet: continue
        if tweet['id_str'] == test:
            pdb.set_trace()
        ids.append(tweet['id_str'])
len(ids)

0it [00:00, ?it/s]

974128

In [27]:
test in ids

False

In [36]:
print(len(bio_output))
len(set(bio_output.id_str.astype(str)).intersection(set(ids)))

232560


109673

## Calculate most frequent extracted identities

In [2]:
# Load output
import os
from glob import glob
import pandas as pd
from tqdm.auto import tqdm

output_dirpath = '../output/tweets_bios_identities/'
fpaths = glob(os.path.join(output_dirpath, '*'))
combined = []
for path in tqdm(fpaths):
    dump_data = pd.read_json(path, lines=True)
    # Filter to just matches
    selected = dump_data[dump_data.tweet_identities.str.len() > 2]
    combined.append(selected)
data = pd.concat(combined)
data.info()

  0%|          | 0/32 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1434859 entries, 6 to 383313
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   id_str                1434859 non-null  int64         
 1   created_at            1434859 non-null  datetime64[ns]
 2   user.id_str           1434859 non-null  int64         
 3   user.name             1434859 non-null  object        
 4   user.description      1434859 non-null  object        
 5   bio                   1434859 non-null  object        
 6   identities            1434859 non-null  object        
 7   identity_spans        1434859 non-null  object        
 8   text                  1434859 non-null  object        
 9   tweet_identities      1434859 non-null  object        
 10  tweet_identity_spans  1434859 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 131.4+ MB


In [3]:
data.tweet_identities.sample(10)

57023                [congressman, our, governor]
3427                           [we, they, people]
28791                [chinese, doctors, new york]
1312                     [woman, her, prefecture]
20637       [parents, who, their, kids, mom, dad]
28596     [you, bro, president, you, hunters, we]
132916                        [teacher, he, i, i]
51744            [parents, them, their, daughter]
25583             [you, your, elderly, neighbors]
148536                         [we, those, those]
Name: tweet_identities, dtype: object

In [4]:
from collections import Counter
from tqdm.auto import tqdm
tqdm.pandas()

ctr = Counter()
data.tweet_identities.progress_map(ctr.update)

  0%|          | 0/1434859 [00:00<?, ?it/s]

6         None
7         None
20        None
21        None
31        None
          ... 
383302    None
383304    None
383305    None
383310    None
383313    None
Name: tweet_identities, Length: 1434859, dtype: object

In [5]:
ctr.most_common(50)

[('i', 472957),
 ('it', 421776),
 ('you', 359549),
 ('we', 253162),
 ('people', 243642),
 ('who', 236056),
 ('chinese', 207667),
 ('they', 197620),
 ('he', 152823),
 ('us', 124743),
 ('my', 113771),
 ('their', 112178),
 ('your', 105023),
 ('don', 101979),
 ('our', 94851),
 ('his', 86890),
 ('me', 81263),
 ('them', 49492),
 ('president', 48984),
 ('its', 47095),
 ('other', 46752),
 ('doctor', 45500),
 ('americans', 43743),
 ('man', 41100),
 ('she', 40444),
 ('her', 39808),
 ('dr', 38577),
 ('person', 32583),
 ('racist', 32465),
 ('those', 32369),
 ('patients', 31861),
 ('american', 30944),
 ('white', 29272),
 ('everyone', 27662),
 ('woman', 26730),
 ('him', 26557),
 ('officials', 25689),
 ('asian', 24464),
 ('police', 23773),
 ('africans', 23372),
 ('someone', 21397),
 ('human', 20779),
 ('patient', 19737),
 ('citizens', 19441),
 ('doctors', 19233),
 ('girl', 18900),
 ('god', 18786),
 ('anyone', 18336),
 ('communist', 17644),
 ('official', 15476)]

# Load identities list

In [2]:
# Load current NetMapper identity list
import pandas as pd

path = '../resources/generic_agents-identity_v26_Netanomics.xlsx'
identities = pd.read_excel(path)
identities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20778 entries, 0 to 20777
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   English       20778 non-null  object
 1   conceptTo     20778 non-null  object
 2   metaOntology  20778 non-null  object
 3   nodeType      20778 non-null  object
 4   Category 1    10955 non-null  object
dtypes: object(5)
memory usage: 811.8+ KB


In [4]:
identities = identities.drop_duplicates('English')
identities.info()

19897

# Try to filter identities list with dataset vocab

In [6]:
# Build dataset vocab
import os

# Load COVID Twitter data (Carley lab)
basepath = '/storage3/coronavirus'

# Older data
print("Searching older data...")
dirname = 'json_keyword_stream'
paths = [os.path.join(basepath, dirname, fname) for fname in sorted(os.listdir(os.path.join(basepath, dirname)))]
# with Pool(n_cores) as p:
#     list(tqdm(p.imap(self.process_dump, paths), ncols=80, total=len(paths)))
# list(map(self.process_dump, paths)) # debugging

# Newer data
# print("Filtering newer data...")
# dirname = 'json_keyword_stream_mike'
# paths = [os.path.join(basepath, dirname, fname) for fname in sorted(os.listdir(os.path.join(basepath, dirname))) if fname.endswith('json.gz')]
# with Pool(n_cores) as p:
#     list(tqdm(p.imap(self.process_dump, paths), ncols=80, total=len(paths)))
len(paths)

Filtering older data...


148

In [None]:
import gzip
import json
from tqdm.auto import tqdm

vocab = set()
dirname = 'json_keyword_stream'
for path in tqdm(paths):
    with gzip.open(path, 'rb') as f:
        # limit = 0
        lines = f.read().splitlines()
        for line in tqdm(lines):
        # for line in f:
            # if limit >= 100:
            #     break
            if len(line) == 1:
                continue
            try:
                tweet = json.loads(line)
            except json.decoder.JSONDecodeError:
                tqdm.write('json decode error')
                continue
            except UnicodeDecodeError:
                tqdm.write('unicode decode error')
                continue
            if 'user' in tweet and tweet['user']['description'] is not None:
                for wd in tweet['user']['description'].split():
                    vocab.add(wd)
            # limit += 1
len(vocab)

In [None]:
tweets[0]

{'created_at': 'Wed Jan 29 15:14:41 +0000 2020',
 'id': 1222538561385312263,
 'id_str': '1222538561385312263',
 'text': "RT @TWilly_951: Y'en as vous mangez les brochettes à la Gare de Saint Denis mais vous vous inquiétez quand même du coronavirus",
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 2760744963,
  'id_str': '2760744963',
  'name': 'marie',
  'screen_name': 'mariejamn',
  'location': None,
  'url': None,
  'description': None,
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 312,
  'friends_count': 117,
  'listed_count': 4,
  'favourites_count': 8251,
  'statuses_count': 20931,
  'created_at': 'Wed Sep 03 22:52:12 +0000 2014',
  'utc_offset': None,
  'time_zone': None,
  'geo_enabled

In [None]:
# Combine vocabs
from tqdm.auto import tqdm
from glob import glob
import json

vocab = set()
vocab_dirpath = '../output/vocab/*'
for path in tqdm(glob(vocab_dirpath)):
    with open(path) as f:
        vocab |= set(json.load(f))
len(vocab)

In [6]:
# Save out vocab
with open('../tmp/vocab.json', 'w') as f:
    json.dump(list(vocab), f, indent=4)

In [11]:
%%timeit -n 1 -r 1

# Load vocab
with open('../tmp/vocab.json') as f:
    vocab = json.load(f)

13.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [8]:
# Save out vocab
import pickle

with open('../tmp/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [10]:
%%timeit -n 1 -r 1

# Load vocab
with open('../tmp/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

8.35 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# 1-time

In [1]:
import pickle

path = '../tmp/pat.pkl'
with open(path, 'rb') as f:
    pat = pickle.load(f)
pat

re.compile(r"\b1\ star\b|\b1\-star\b|\bone\ star\b|\bone\-star\b|\b2\ star\b|\b2\-star\b|\btwo\ star\b|\btwo\-star\b|\b3\ star\b|\b3\-star\b|\bthree\ star\b|\bthree\-star\b|\b30\ under\ 30\b|\b4\ star\b|\b4\-star\b|\bfour\ star\b|\bfour\-star\b|\bfourstar\ general\b|\bfour\-star\ general\b|\b9\ 11\ victims\b|\babandoner\b|\babbess\b|\babbot\b|\babbott\b|\babductee\b|\babductees\b|\babductor\b|\babductors\b|\babenaki\b|\btemporarily\ able\-bodied\b|\babolitionist\b|\babolitionists\b|\baboriginal\b|\baboriginals\b|\baborigine\b|\baborigines\b|\babschieben\b|\babsentee\b|\babsentee\xa0\b|\babsolutist\b|\babsurdist\b|\babu\ ghraib\ prisoner\b|\babu\ ghraibprisoner\b|\babuser\b|\babusers\b|\babyan\ governorate\b|\babyan\ governoratewill\b|\babyssinian\b|\bacademical\b|\bacademician\b|\bacademicians\b|\bacademics\b|\bacademy\ award\ win\b|\bacademy\ award\ winner\b|\bacademy\ award\-win\b|\bacadian\b|\bmember\ of\ acca\b|\baccomplice\b|\baccomplices\b|\baccountant\b|\baccountants\b|\bactuari

In [5]:
import re
matches = list(re.finditer(pat, 'adult aide'))
for match in matches:
    print(match.group())
    print(match.span())

adult
(0, 5)
aide
(6, 10)


In [19]:
import re
pat = re.compile(r'|'.join([p.pattern for p in pats]))

In [20]:
with open(path, 'wb') as f:
    pickle.dump(pat, f)