See bio_tweet_identity_pairs.py

# Group identity terms into categories for bios and tweets
Groupings from Josh's table

In [23]:
# Load groupings from Josh's table
import pandas as pd

cats_path = '../resources/identities_tags_v2_2022_5_26.xslx'
cats = pd.read_excel(cats_path)
cats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7166 entries, 0 to 7165
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   IDENTITY          7166 non-null   object 
 1   FAMILY            7166 non-null   int64  
 2   POLITICAL         7165 non-null   object 
 3   GENDER            7165 non-null   float64
 4   RELIGION          7166 non-null   int64  
 5   RACE/NATIONALITY  7166 non-null   int64  
 6   JOB               7166 non-null   int64  
 7   OTHER             7166 non-null   int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 448.0+ KB


In [24]:
cats['identity'] = cats.IDENTITY.str.replace('_', ' ').str.lower()

In [30]:
# Check for any terms annotated for multiple categories
indicators = cats[['FAMILY', 'POLITICAL', 'GENDER', 'RELIGION', 'RACE/NATIONALITY', 'JOB', 'OTHER']]
indicators.sum(axis=1).value_counts()

  indicators.sum(axis=1).value_counts()


1.0    6407
2.0     678
0.0      66
3.0      14
4.0       1
dtype: int64

In [28]:
cats.head()

Unnamed: 0,IDENTITY,FAMILY,POLITICAL,GENDER,RELIGION,RACE/NATIONALITY,JOB,OTHER,identity
0,Zuni,0,0,0.0,0,1,0,0,zuni
1,zoologist,0,0,0.0,0,0,1,0,zoologist
2,zookeeper,0,0,0.0,0,0,1,0,zookeeper
3,zoogeographer,0,0,0.0,0,0,1,0,zoogeographer
4,zombie,0,0,0.0,0,0,0,1,zombie


In [32]:
pd.from_dummies(indicators.fillna(0))

TypeError: Passed DataFrame contains non-dummy data

In [None]:
# Build dictionary of identity_term: category
term_categories = {}
for category in ['FAMILY', 'POLITICAL', 'GENDER', 'RELIGION', 'RACE/NATIONALITY', 'JOB', 'OTHER']:
    category_terms = cats.loc[cats[category] == 1, 'identity']
    term_categories.update({term: category.lower() for term in category_terms})
len(term_categories)

# Check output from pairing script

In [38]:
import pandas as pd
from glob import glob
from tqdm.auto import tqdm

dirpath = '../output/bio_tweet_identity_pairs_1000/'
fpaths = glob(os.path.join(dirpath, '*'))
dfs = []

for fpath in tqdm(fpaths):
    dfs.append(pd.read_json(fpath, lines=True))
len(dfs)

data = pd.concat(dfs).reset_index()
data.info()

  0%|          | 0/729 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76168883 entries, 0 to 76168882
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   index           int64         
 1   bio_identity    object        
 2   tweet_identity  object        
 3   user_count      int64         
 4   date            datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 2.8+ GB


In [39]:
# Calculate top terms by month
gped = pd.DataFrame(data.groupby(['bio_identity', 'tweet_identity', pd.Grouper(key='date', freq='MS')])['user_count'].sum()).reset_index()

In [40]:
gped.to_pickle('../tmp/bio_tweet_identities_monthly.pkl')

In [1]:
# Load from pickle
import pandas as pd
gped = pd.read_pickle('../tmp/bio_tweet_identities_monthly.pkl')
gped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16797078 entries, 0 to 16797077
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   bio_identity    object        
 1   tweet_identity  object        
 2   date            datetime64[ns]
 3   user_count      int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 512.6+ MB


In [3]:
gped = gped.sort_values(['date', 'user_count'], ascending=False)
gped.info()
gped.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16797078 entries, 13917959 to 16797077
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   bio_identity    object        
 1   tweet_identity  object        
 2   date            datetime64[ns]
 3   user_count      int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 640.8+ MB


Unnamed: 0,bio_identity,tweet_identity,date,user_count
13917959,she,children,2022-06-01,1431
7196896,her,children,2022-06-01,1377
10079084,mom,children,2022-06-01,1343
10093827,mom,kids,2022-06-01,1240
13933727,she,kids,2022-06-01,1133


In [20]:
pd.set_option('display.max_rows', None)
from itertools import cycle

top_monthly = {} # by number of top terms to keep
for nterms in [10, 20, 100, 1000]:
    top_monthly[nterms] = gped.groupby('date').head(nterms).sort_values('date')
    top_monthly[nterms] = top_monthly[nterms][['date', 'bio_identity', 'tweet_identity', 'user_count']]
    seq = cycle(range(1,nterms+1))
    top_monthly[nterms]['rank'] = [next(seq) for count in range(top_monthly[nterms].shape[0])]

    # Save out
    top_monthly[nterms].to_csv(f'../output/analysis/top{nterms}_bio_tweet_identities.csv', index=False)

len(top_monthly)

4

In [21]:
top_monthly[10]

Unnamed: 0,date,bio_identity,tweet_identity,user_count,rank
9272112,2020-01-01,maga,others,4545,1
13918015,2020-01-01,she,chinese,8260,2
13945993,2020-01-01,she,racist,7368,3
7196949,2020-01-01,her,chinese,7290,4
13912325,2020-01-01,she,americans,7129,5
15291791,2020-01-01,they,chinese,4587,6
7191198,2020-01-01,her,americans,6118,7
13913130,2020-01-01,she,asian,5093,8
7134050,2020-01-01,he,chinese,4934,9
7225141,2020-01-01,her,racist,6408,10


# Calculate which bios identities mention which tweet identities

In [4]:
import os
from glob import glob
import pandas as pd
from tqdm.auto import tqdm
import pdb
from collections import Counter
import itertools
tqdm.pandas()

dfs = []

tweet_output_dirpath = '../output/tweets_bios_identities/'
bio_output_dirpath = '../output/tweets_identities/'
tweet_fpaths = sorted(glob(os.path.join(tweet_output_dirpath, '*')))

bio_stops = ['i', 'you', 'us', 'we', 'my', 'me', 'it', 'your', 'our', 'who', 'its', 'those', 'other', 'everyone', 
            'people', 'don']
tweet_stops = bio_stops + ['they', 'he', 'his', 'their', 'she', 'her', 'hers', 'theirs',]

# To make efficient, probably just save the top 1000 terms and save out
for tweet_fpath in tqdm(tweet_fpaths[:1]):
# for tweet_fpath in ['../output/tweets_bios_identities/virus_2020_1_29.jsonl']:
    fname = os.path.basename(tweet_fpath)

    # Find matching original document
    matching_bio_fpath = os.path.join(bio_output_dirpath, fname)
    
    # Load bio and tweet-identified files
    tweets_output = pd.read_json(tweet_fpath, lines=True)
    # bio_output = pd.read_json(matching_bio_fpath, lines=True)
    
    # Calculate most frequent 1000 identities in bios and most frequent in 1000 tweets
    tweets_output['identities'] = tweets_output['identities'].map(lambda x: [w for w in x if w not in bio_stops])
    tweets_output['tweet_identities'] = tweets_output['tweet_identities'].map(lambda x: [w for w in x if w not in tweet_stops])
    bio_ctr = Counter()
    tweets_ctr = Counter()
    tweets_output.identities.map(bio_ctr.update)
    tweets_output.tweet_identities.map(tweets_ctr.update)
    top_bio_identities, _ = zip(*bio_ctr.most_common(1000))
    top_tweet_identities, _ = zip(*tweets_ctr.most_common(1000))
    
    tweets_output['identities'] = tweets_output['identities'].map(lambda x: sorted(set([w for w in x if w in top_bio_identities]))
    tweets_output['tweet_identities'] = tweets_output['tweet_identities'].map(lambda x: [w for w in x if w in top_tweet_identities])
    
    # Remove rows if don't have at least one top identity in bios and in tweet
    tweets_output = tweets_output[(tweets_output['identities'].map(lambda x: len(x) > 0)) & (tweets_output['tweet_identities'].map(lambda x: len(x) > 0))]
len(tweets_output)

  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
fname

'covid_20200602.jsonl'

In [5]:
len(tweets_output)

140444

In [12]:
tweets_output.columns

Index(['id_str', 'created_at', 'user.id_str', 'user.name', 'user.description',
       'bio', 'identities', 'identity_spans', 'text', 'tweet_identities',
       'tweet_identity_spans'],
      dtype='object')

In [17]:
# Get all unique user combinations of tweet-bio identity pairs
processed = tweets_output.groupby('user.id_str').agg({
    'identities': 'first',
    'tweet_identities': lambda x: sorted(set([identity for identities in x for identity in identities]))})

Unnamed: 0_level_0,identities,tweet_identities
user.id_str,Unnamed: 1_level_1,Unnamed: 2_level_1
577,[herald],"[asian, black]"
614,"[man, lover]","[homeowner, police]"
767,[writer],[presidential]
885,"[ceo, cofounder, entrepreneur, husband, father]",[taxpayers]
1338,[director],"[americans, president]"
...,...,...
1267931711058309120,"[victims, his]","[blacks, chef, cook, victims]"
1267934630478372864,[black],[protesters]
1267937624460845056,[queen],[president]
1267947762957660160,"[doctor, surgeon]","[artisans, parent, parents, refugee, someone, ..."


In [10]:
tweet_bio_ctr = Counter()
tweets_output.progress_apply(lambda row: tweet_bio_ctr.update(itertools.product(row['identities'], row['tweet_identities'])), axis=1)

# bio_identities, tweet_identities = pd.core.reshape.util.cartesian_product([tweets_output['identities'], tweets_output['tweet_identities']])
# takes a ton of RAM, around 300 GB). Could reduce by constructing dict iteratively
# identity_pairs = pd.DataFrame(dict(bio_identity=bio_identities, tweet_identity=tweet_identities)).groupby(['bio_identity', 'tweet_identity']).count()
# dfs.append(identity_pairs)

tweet_bio_ctr.most_common(50)

  0%|          | 0/140444 [00:00<?, ?it/s]

[(('she', 'black'), 2580),
 (('her', 'black'), 2211),
 (('he', 'black'), 1164),
 (('they', 'black'), 1164),
 (('him', 'black'), 948),
 (('lover', 'president'), 834),
 (('black', 'black'), 810),
 (('writer', 'black'), 773),
 (('she', 'president'), 754),
 (('she', 'police'), 679),
 (('her', 'president'), 676),
 (('mom', 'president'), 648),
 (('them', 'black'), 625),
 (('her', 'police'), 611),
 (('artist', 'black'), 587),
 (('god', 'black'), 572),
 (('lover', 'black'), 540),
 (('writer', 'president'), 539),
 (('he', 'president'), 521),
 (('wife', 'president'), 515),
 (('she', 'detainees'), 503),
 (('retired', 'president'), 492),
 (('mother', 'president'), 439),
 (('him', 'president'), 435),
 (('student', 'black'), 432),
 (('maga', 'governor'), 428),
 (('her', 'detainees'), 425),
 (('maga', 'president'), 406),
 (('he', 'police'), 390),
 (('maga', 'rioters'), 381),
 (('god', 'president'), 381),
 (('lover', 'americans'), 381),
 (('enthusiast', 'black'), 380),
 (('artist', 'president'), 374),

In [10]:
# Get all combinations of tweet-bio identity pairs
tweet_bio_ctr = Counter()
tweets_output.progress_apply(lambda row: tweet_bio_ctr.update(itertools.product(row['identities'], row['tweet_identities'])), axis=1)

# bio_identities, tweet_identities = pd.core.reshape.util.cartesian_product([tweets_output['identities'], tweets_output['tweet_identities']])
# takes a ton of RAM, around 300 GB). Could reduce by constructing dict iteratively
# identity_pairs = pd.DataFrame(dict(bio_identity=bio_identities, tweet_identity=tweet_identities)).groupby(['bio_identity', 'tweet_identity']).count()
# dfs.append(identity_pairs)

tweet_bio_ctr.most_common(50)

  0%|          | 0/140444 [00:00<?, ?it/s]

[(('she', 'black'), 2580),
 (('her', 'black'), 2211),
 (('he', 'black'), 1164),
 (('they', 'black'), 1164),
 (('him', 'black'), 948),
 (('lover', 'president'), 834),
 (('black', 'black'), 810),
 (('writer', 'black'), 773),
 (('she', 'president'), 754),
 (('she', 'police'), 679),
 (('her', 'president'), 676),
 (('mom', 'president'), 648),
 (('them', 'black'), 625),
 (('her', 'police'), 611),
 (('artist', 'black'), 587),
 (('god', 'black'), 572),
 (('lover', 'black'), 540),
 (('writer', 'president'), 539),
 (('he', 'president'), 521),
 (('wife', 'president'), 515),
 (('she', 'detainees'), 503),
 (('retired', 'president'), 492),
 (('mother', 'president'), 439),
 (('him', 'president'), 435),
 (('student', 'black'), 432),
 (('maga', 'governor'), 428),
 (('her', 'detainees'), 425),
 (('maga', 'president'), 406),
 (('he', 'police'), 390),
 (('maga', 'rioters'), 381),
 (('god', 'president'), 381),
 (('lover', 'americans'), 381),
 (('enthusiast', 'black'), 380),
 (('artist', 'president'), 374),

In [20]:
most_common = tweet_bio_ctr.most_common(10)
pd.DataFrame([{'bio_identity': el[0][0], 'tweet_identity': el[0][1],
               'user_count': el[1]} for el in most_common])

Unnamed: 0,bio_identity,tweet_identity,user_count
0,she,black,2580
1,her,black,2211
2,he,black,1164
3,they,black,1164
4,him,black,948
5,lover,president,834
6,black,black,810
7,writer,black,773
8,she,president,754
9,she,police,679


# Compare top bio and tweet identities over time

## Top bio identities over time

In [None]:
import os
from glob import glob
import pandas as pd
from tqdm.auto import tqdm
import pdb

dfs = []

tweet_output_dirpath = '../output/tweets_bios_identities/'
bio_output_dirpath = '../output/tweets_identities/'
tweet_fpaths = sorted(glob(os.path.join(tweet_output_dirpath, '*')))

bio_stops = ['i', 'you', 'us', 'we', 'my', 'me', 'it', 'your', 'our', 'who', 'its', 'those', 'other', 'everyone', 
            'people', 'don']
tweet_stops = bio_stops + ['they', 'he', 'his', 'their', 'she', 'her', 'hers', 'theirs',]


# To make efficient, probably just save the top 1000 terms and save out
for tweet_fpath in tqdm(tweet_fpaths[:1]):
# for tweet_fpath in ['../output/tweets_bios_identities/virus_2020_1_29.jsonl']:
    fname = os.path.basename(tweet_fpath)

    # Find matching original document
    matching_bio_fpath = os.path.join(bio_output_dirpath, fname)
    
    # Load bio and tweet-identified files
    # tweets_output = pd.read_json(tweet_fpath, lines=True)
    bio_output = pd.read_json(matching_bio_fpath, lines=True)
    expanded = bio_output[['identities', 'created_at']].explode('identities')
    expanded = expanded[~expanded.identities.isin(stops)]
    dfs.append(expanded)

len(dfs)

  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Bin by month, calculate top identities per month
pd.set_option('display.max_rows', None)

test = dfs[0]
monthly_counts = test.groupby([pd.Grouper(key='created_at', freq='MS'), 'identities']).size().sort_values(ascending=False)
monthly_counts[:100]

created_at                 identities  
2020-06-01 00:00:00+00:00  she             38464
                           who             37786
                           lover           36742
                           her             35426
                           don             35070
                           writer          32292
                           people          31456
                           maga            30085
                           god             28377
                           he              28149
                           they            24481
                           him             20996
                           retired         20642
                           wife            20064
                           human           19728
                           member          18966
                           mom             18645
                           director        18088
                           mother          18005
                           st

In [22]:
# Investigate particular terms
pd.set_option('display.max_colwidth', None)

bio_output[bio_output['identities'].str.contains('don', regex=False)].drop_duplicates(subset='user.id_str').head()

Unnamed: 0,id_str,created_at,user.id_str,user.name,user.description,text,bio,identities,identity_spans
0,1267676555872002048,2020-06-02 04:36:57+00:00,3345626650,Danielle *essential services worker* #TeamJOE,RT's don't always mean I endorse. *Happily married 3 kids*. Dyslexic. \n#OneVoice1 #Z14 #DemCastOK,"RT @RepValDemings: This president, who wouldn’t lift a finger to help Americans dying of COVID-19, will gladly impose martial law. Resist.",RT's don't always mean I endorse. *Happily married 3 kids*. Dyslexic. \n#OneVoice1 #Z14 #DemCastOK,"[don, i, kids]","[[5, 8], [23, 24], [53, 57]]"
271,1267676578428968960,2020-06-02 04:37:03+00:00,4848203716,Melvin Nudelman,"VOTER I.D.Retired. 😊Married. Animal Lover. Vegetarian,TRUMP 2nd Amendment http://Advocate.Pro Israel A right 2 Free Speech just\ndon't force me 2 believe U","RT @antoniaiadi: AMERICA while you work, play, sleep, pray the NWO Globalist MAFIA plans their next move! Before 2018 Midterms Parkland FL…","VOTER I.D.Retired. 😊Married. Animal Lover. Vegetarian,TRUMP 2nd Amendment http://Advocate.Pro Israel A right 2 Free Speech just\ndon't force me 2 believe U","[voter, i, retired, lover, vegetarian, advocate, don, me]","[[0, 5], [6, 7], [10, 17], [36, 41], [43, 53], [81, 89], [128, 131], [140, 142]]"
283,1267676579959918592,2020-06-02 04:37:03+00:00,111267949,J-Man Rider says 'Punch Fascists!' BLM 🔞,Crazy badass nerdy Texas boy exploring the strange and much cooler realm of Ohio. Don't follow me if you're under 18.,RT @GrahamBrookie: For those keeping score at home:\n\nThe President refused to invoke the Defense Production Act to mobilize and save lives…,Crazy badass nerdy Texas boy exploring the strange and much cooler realm of Ohio. Don't follow me if you're under 18.,"[badass, boy, don, me, you]","[[6, 12], [25, 28], [82, 85], [95, 97], [101, 104]]"
446,1267676604047609856,2020-06-02 04:37:09+00:00,176202134,BELLA,"Publicist, writer, producer. . I enjoy carrots. I don't like beets. If you like beets DO NOT even think of following me. Belong to team #carrotsnotbeets","RT @BW: ""Superforecasters"" say there’s only a 9% chance that there will be a widely available vaccine for Covid-19 before next April https:…","Publicist, writer, producer. . I enjoy carrots. I don't like beets. If you like beets DO NOT even think of following me. Belong to team #carrotsnotbeets","[writer, producer, i, i, don, you, me]","[[11, 17], [19, 27], [31, 32], [48, 49], [50, 53], [71, 74], [117, 119]]"
490,1267676612822212608,2020-06-02 04:37:11+00:00,2744695457,Mars🌹Ⓥ🏳️‍🌈 Bob Kroll is a White Supremacist,Lurking in the left. They/Them. Vegan. NB. Fight for someone you don't know. #TotalLiberation #NotMeUs #BlackLivesMatter,RT @pant_leg: WHAT THE FUCK,Lurking in the left. They/Them. Vegan. NB. Fight for someone you don't know. #TotalLiberation #NotMeUs #BlackLivesMatter,"[they, them, vegan, someone, you, don]","[[21, 25], [26, 30], [32, 37], [53, 60], [61, 64], [65, 68]]"


## Top identities in tweets

In [19]:
import os
from glob import glob
import pandas as pd
from tqdm.auto import tqdm
import pdb

dfs = []

tweet_output_dirpath = '../output/tweets_bios_identities/'
bio_output_dirpath = '../output/tweets_identities/'
tweet_fpaths = sorted(glob(os.path.join(tweet_output_dirpath, '*')))

# To make efficient, probably just save the top 1000 terms and save out
for tweet_fpath in tqdm(tweet_fpaths[:1]):
# for tweet_fpath in ['../output/tweets_bios_identities/virus_2020_1_29.jsonl']:
    fname = os.path.basename(tweet_fpath)

    # Load bio and tweet-identified files
    tweets_output = pd.read_json(tweet_fpath, lines=True)
    expanded = tweets_output[['tweet_identities', 'created_at']].explode('tweet_identities')
    expanded = expanded[~expanded.tweet_identities.isin(tweet_stops)]
    dfs.append(expanded)

len(dfs)

  0%|          | 0/1 [00:00<?, ?it/s]

1

In [20]:
# Bin by month, calculate top identities per month
pd.set_option('display.max_rows', None)

test = dfs[0]
monthly_counts = test.groupby([pd.Grouper(key='created_at', freq='MS'), 'tweet_identities']).size().sort_values(ascending=False)
monthly_counts[:100]

created_at  tweet_identities
2020-06-01  people              29300
            black               18539
            president           12602
            police               6339
            don                  5816
            americans            5378
            them                 5191
            dr                   5076
            patients             3612
            senators             3287
            workers              3116
            white                2843
            mr                   2661
            pm                   2351
            anyone               2342
            him                  2324
            detainees            2126
            protesters           2119
            citizens             2078
            asian                2051
            official             2012
            secretary            1951
            children             1887
            man                  1847
            staff                1822
            experts  