# Get tweets for neutral (non-white supremacist) data
See get_tweets_by_query.py

In [44]:
# Load white supremacist dataset to count tweets over time
import datetime
import pandas as pd

path = '../tmp/white_supremacist_corpus.pkl'
ws_data = pd.read_pickle(path)
ws_data.domain.unique()

# Select tweet data, Group by year
yearly = ws_data.query('domain=="tweet/short propaganda"').groupby(by=ws_data.timestamp.dt.year)['text'].count()
lookup = pd.DataFrame(yearly)
lookup['begin'] = pd.to_datetime(yearly.index.astype(int).astype(str), format='%Y')
lookup['end'] = [x.replace(year=x.year + 1) for x in lookup['begin']]
# lookup['end'] = lookup.begin.shift(-1, fill_value = datetime.datetime(2023,1,1))
lookup.index.name = 'year'
lookup.index = lookup.index.astype(int)
lookup.rename(columns={'text': 'post_count'}, inplace=True)
lookup

Unnamed: 0_level_0,post_count,begin,end
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009,256,2009-01-01,2010-01-01
2010,698,2010-01-01,2011-01-01
2011,722,2011-01-01,2012-01-01
2012,1212,2012-01-01,2013-01-01
2013,2807,2013-01-01,2014-01-01
2014,13958,2014-01-01,2015-01-01
2015,15995,2015-01-01,2016-01-01
2016,21132,2016-01-01,2017-01-01
2017,36666,2017-01-01,2018-01-01
2020,354,2020-01-01,2021-01-01


In [None]:
ws_data.query('domain=="tweet/short propaganda"').groupby(by=ws_data.timestamp.dt.year).source.value_counts()

timestamp  source                
2009.0     twitter                      3
2010.0     twitter                     23
2011.0     twitter                     36
2012.0     twitter                    109
2013.0     twitter                    161
2014.0     twitter                    692
2015.0     twitter                    886
2016.0     twitter                    971
2017.0     twitter                   2496
2020.0     offline_flyers_banners     354
2021.0     offline_flyers_banners     368
2022.0     offline_flyers_banners     307
Name: source, dtype: int64

## Sample query words from white supremacist tweet data 
To get tweets that share terms but aren't likely white supremacist

In [46]:
import nltk
import string
from collections import Counter
import random

random.seed(9)
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation) + ['...', '…', '’', ';-)', 'rt', '<url>', '“', '”', "n't"] 
slurs_fpath = '/storage2/mamille3/data/hate_speech/hatebase_slurs.txt'
with open(slurs_fpath) as f:
    slurs = f.read().splitlines() + ['mudshark', 'illegals', 'anti-white']
def check_word(word):
    """ See if word is ok to be a query """
    return not (word in stopwords or word in slurs or word.startswith('#') or word.startswith('http') or word.startswith('http') or '.' in word)

words_by_year = ws_data.query('domain=="tweet/short propaganda"').groupby(by=ws_data.timestamp.dt.year).agg(
    {'text': lambda x: {w: count for w, count in Counter([w for w in ' '.join(x).split() if check_word(w)]).items() if count > 1}})
words_by_year['text'] = words_by_year
words_by_year.index

lookup['total_words'] = words_by_year
lookup['sampled_words'] = [Counter(random.choices(list(ctr.keys()), weights=list(ctr.values()), k=int(n/3))).most_common() for ctr, n in zip(lookup.total_words, lookup.post_count)]
lookup.drop(columns='total_words', inplace=True)
pd.DataFrame(lookup.sampled_words.str[:20])

Unnamed: 0_level_0,sampled_words
year,Unnamed: 1_level_1
2009,"[(illegal, 8), (aliens, 5), (new, 2), (report, 2), (border, 2), (system, 2), (obama, 2), (rep, 2), (video, 2), (join, 2), (generation, 2), (yr, 2), (program, 2), (arpaio, 2), (immigrants, 2), (immigrant, 2), (3-5, 2), (4, 1), (dixie, 1), (works, 1)]"
2010,"[(illegal, 14), (support, 6), (amnesty, 5), (aliens, 5), (immigration, 5), (new, 4), (hc, 3), (update, 3), (arizona, 3), (dream, 3), (shock, 2), (thank, 2), (senator, 2), (u, 2), (state, 2), (4, 2), (fair, 2), (wound, 2), (merzbow, 2), (killed, 2)]"
2011,"[(illegal, 6), (today, 4), (border, 4), (new, 4), (top, 3), (american, 3), (amnesty, 3), (icymi, 3), (law, 3), (soon, 3), (aliens, 2), (voters, 2), (bill, 2), (gov, 2), (enforcement, 2), (election, 2), (tune, 2), (woman, 2), (soleilmoon, 2), (tour, 2)]"
2012,"[(illegal, 8), (new, 6), (via, 6), (aliens, 5), (like, 5), (american, 4), (amnesty, 4), (white, 4), (7, 4), (mt, 3), (black, 3), (men, 3), (military, 3), (criminal, 3), (support, 3), (us, 3), (enforcement, 3), (workers, 3), (know, 3), (please, 3)]"
2013,"[(amnesty, 11), (illegal, 10), (white, 9), (via, 9), (us, 9), (bill, 7), (new, 7), (dixie, 6), (today, 6), (people, 6), (blog, 5), (immigration, 5), (american, 5), (u, 5), (realist, 4), (america, 4), (part, 4), (law, 4), (get, 4), (w, 4)]"
2014,"[(via, 39), (white, 26), (u, 24), (people, 21), (new, 19), (2, 18), (illegal, 17), (aap, 17), (amnesty, 17), (part, 16), (also, 16), (don't, 15), (bjp, 15), (us, 15), (pm, 15), (india, 15), (know, 14), (pl, 14), (party, 14), (get, 14)]"
2015,"[(via, 76), (white, 42), (people, 32), (us, 30), (media, 29), (don't, 25), (u, 24), (account, 23), (twitter, 22), (like, 20), (learn, 20), (policy, 20), (must, 19), (india, 19), (temporarily, 17), (immigration, 17), (new, 16), (racist, 15), (say, 15), (black, 14)]"
2016,"[(trump, 85), (via, 69), (u, 45), (white, 39), (like, 38), (illegal, 35), (us, 34), (people, 30), (immigration, 28), (america, 28), (jews, 27), (don't, 23), (obama, 22), (‘, 22), (go, 20), (country, 20), (must, 20), (media, 20), (need, 20), (i'm, 20)]"
2017,"[(white, 118), (people, 97), (trump, 94), (like, 85), (via, 73), (wall, 70), (don't, 60), (get, 59), (us, 57), (it's, 55), (one, 46), (would, 46), (illegal, 42), (good, 40), (need, 39), (hate, 38), (i'm, 36), (think, 36), (make, 35), (time, 34)]"
2020,"[(jews, 4), (white, 4), (victory, 3), (american, 3), (men, 3), (people, 3), (england, 2), (standing, 2), (immigration, 2), (2020, 2), (part, 2), (starts, 2), (want, 2), (antifa, 2), (mayor, 2), (stand, 2), (replace, 2), (terrorists, 2), (reclaim, 2), (131, 2)]"


## Get tweets from sampled words

In [54]:
import tweepy
import pandas as pd

# Load authentication
keys = pd.read_csv('/storage2/mamille3/tweepy_oauth_academic.csv', index_col='name').to_dict()['key']
client = tweepy.Client(keys['bearer_token'], wait_on_rate_limit=True)

In [55]:
from datetime import datetime
from tqdm.notebook import tqdm

tweet_fields = [
    'id', 'created_at', 'text', 'author_id', 'conversation_id', 'entities', 'public_metrics', 'geo', 'lang', 'referenced_tweets'
]
place_fields = [
    'full_name', 'id', 'contained_within'
]

tweets_by_year = {}
for i, row in tqdm(lookup.iterrows(), total=len(lookup)):
    fetched = []
    for word, count in row.sampled_words:
        try:
            fetched.append(client.search_all_tweets(word, expansions='geo.place_id', place_fields=place_fields, tweet_fields=tweet_fields, start_time=row.begin, end_time=row.end, max_results=count*3).data)
        except tweepy.BadRequest:
            tqdm.write(f'Bad request: {word}')
    tweets_by_year[row.begin.year] = [tweet for response in fetched for tweet in response]

  0%|          | 0/12 [00:00<?, ?it/s]

Rate limit exceeded. Sleeping for 653 seconds.


KeyboardInterrupt: 

# Get tweets using the API from tweet IDs
Also see get_tweets_by_id.py

In [41]:
import tweepy
import pandas as pd

# Load authentication
keys = pd.read_csv('/storage2/mamille3/tweepy_oauth_academic.csv', index_col='name').to_dict()['key']
client = tweepy.Client(keys['bearer_token'], wait_on_rate_limit=True)

In [43]:
# Load ElSherief+2021 tweet IDs
stg1_meta = pd.read_csv('/storage2/mamille3/data/hate_speech/elsherief2021/implicit_hate_v1_stg1.tsv', sep='\t')
stg1_meta_tweets = stg1_meta[~stg1_meta['ID'].str.contains('_')].copy()
tweet_ids = stg1_meta_tweets['ID']
tweet_ids

# Load Qian+2018 tweet IDs
# fpath = '/storage2/mamille3/data/hate_speech/qian2018/white_supremacist_tweets.csv'
# tweet_ids = pd.read_csv(fpath)['tweet id']
# tweet_ids

0        399886440588247041
1        929901925100937216
2        728678509497954304
3        625688620444180481
4        441089979322597376
                ...        
20804    892542828475285504
20805    900565598358425600
20806    762438713397813120
20807    828761621170421760
20808    850115546746146816
Name: ID, Length: 20809, dtype: object

In [44]:
# Break up into lists of 100 (max for Tweepy)
chunks = [tweet_ids[x:min(x+100, len(tweet_ids))].values.tolist() for x in range(0, len(tweet_ids), 100)]
print(len(chunks))
responses = []

209


In [46]:
# Do query and save results out
import os
from tqdm.notebook import tqdm
import json
# dataset = 'qian2018'
dataset = 'elsherief2021'

tweet_fields = [
    'id', 'created_at', 'text', 'author_id', 'conversation_id', 'entities', 'public_metrics', 'geo', 'lang', 'referenced_tweets'
]
user_fields = [
    'id', 'name', 'username', 'description'
]

for chunk in tqdm(chunks[:5]):
    response = client.get_tweets(chunk, expansions=['author_id', 'entities.mentions.username'], tweet_fields=tweet_fields, user_fields=user_fields)

    out_dirpath = os.path.join('/storage2/mamille3/white_supremacist_lang/data', dataset)
    if not os.path.exists(out_dirpath):
        os.mkdir(out_dirpath)

    # Save out tweet data
    outpath = os.path.join(out_dirpath, 'data.jsonl')
    tweets = [tweet.data for tweet in response.data]
    with open(outpath, 'a') as f:
        f.write('\n'.join([json.dumps(tweet) for tweet in tweets]) + '\n')

    # Save out users
    outpath = os.path.join(out_dirpath, 'users.jsonl')
    users = [user.data for user in response.includes['users']]
    with open(outpath, 'a') as f:
        f.write('\n'.join([json.dumps(user) for user in users]) + '\n')

    # Save out errors
    outpath = os.path.join(out_dirpath, 'errors.jsonl')
    with open(outpath, 'a') as f:
        f.write('\n'.join([json.dumps(error) for error in response.errors]) + '\n')

  0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
dir(responses[0])

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'count',
 'data',
 'errors',
 'includes',
 'index',
 'meta']

In [33]:
responses[0].errors

[{'value': '598730121197924352',
  'detail': 'Could not find tweet with ids: [598730121197924352].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'ids',
  'resource_id': '598730121197924352',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'resource_id': '925132136595701761',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '925132136595701761',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [925132136595701761].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id': '566720469412610049',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '566720469412610049',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [566720469412610049].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id':

In [14]:
len(responses)

1963

In [15]:
len(chunks)

1963

In [20]:
type(responses[0].data)

list

In [22]:
len([tweet.data for r in responses for tweet in r.data])

240

In [24]:
# Save out tweet data
import json

outpath = '/storage2/mamille3/white_supremacist_lang/data/qian2018/data.jsonl'
tweets = [tweet.data for r in responses for tweet in r.data]
with open(outpath, 'a') as f:
    f.write('\n'.join([json.dumps(tweet) for tweet in tweets]))

In [28]:
responses[0].includes['users'][0].data

{'username': 'DixieRepublic',
 'name': 'Dixie Republic',
 'id': '935587117',
 'description': 'Hard Edge Southern Rock! Our new album Redneck way now available on iTunes. http://t.co/QiKgYUZ8ps'}

In [29]:
# Save out users
import json

outpath = '/storage2/mamille3/white_supremacist_lang/data/qian2018/users.jsonl'
users = [user.data for r in responses for user in r.includes['users']]
with open(outpath, 'a') as f:
    f.write('\n'.join([json.dumps(tweet) for tweet in tweets]))

In [30]:
# Load saved out users
import json

path = '/storage2/mamille3/white_supremacist_lang/data/qian2018/users.jsonl'
with open(path) as f:
    users = [json.loads(line) for line in f.read().splitlines()]
len(users)

240

In [36]:
fetched.includes

{'users': [<User id=935587117 name=Dixie Republic username=DixieRepublic>,
  <User id=96464367 name=Sloane Spencer Talks username=CountryFriedROK>,
  <User id=42877116 name=Ric Geyer username=RicGeyer>,
  <User id=790680583 name=Stanley D. Wyatt username=StanleyDWyatt>,
  <User id=285754520 name=Dylan Mathis username=DylanPaulMathis>,
  <User id=50872927 name=Lisa Hayes username=lisahayes4>,
  <User id=388823538 name=IG: MRDJNY username=DJNY1>,
  <User id=278806606 name=The Write Hook username=IAMTHEWRITEHOOK>,
  <User id=181720822 name=Christopher Shawn Shaw 🎥 🎬 username=directorCSS>,
  <User id=23725431 name=Sharon Hope username=sharonhope2>,
  <User id=396652848 name=alan millns username=alanmillns>,
  <User id=17209867 name=Jon Franchino username=JonFranchino>,
  <User id=33352787 name=Abraham Lincoln's Log Cabin username=Mr_Lincoln>,
  <User id=2809256199 name=Bob Jackrabit username=scwhiterabbit>,
  <User id=1055892414 name=Country Soundtrack username=18country18>,
  <User id=150

In [31]:
len(tweets)

44

In [33]:
tweets[5].data

{'author_id': '396652848',
 'created_at': '2017-01-23T22:00:48.000Z',
 'text': 'Support the Cause! https://t.co/XCz6Xdsl3b via @@dixienetdotorg I support !!!',
 'conversation_id': '823651725831716866',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 1,
  'quote_count': 0},
 'entities': {'urls': [{'start': 19,
    'end': 42,
    'url': 'https://t.co/XCz6Xdsl3b',
    'expanded_url': 'http://leagueofthesouth.com/support-the-cause/',
    'display_url': 'leagueofthesouth.com/support-the-ca…'}],
  'mentions': [{'start': 48,
    'end': 63,
    'username': 'dixienetdotorg',
    'id': '36315753'}]},
 'id': '823651725831716866',
 'lang': 'en'}

In [34]:
tweets[5].includes

AttributeError: 

In [23]:
type(fetched.data[-1])

tweepy.tweet.Tweet

In [25]:
dir(fetched.data[-1])

['__abstractmethods__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_abc_impl',
 'attachments',
 'author_id',
 'context_annotations',
 'conversation_id',
 'created_at',
 'data',
 'entities',
 'geo',
 'get',
 'id',
 'in_reply_to_user_id',
 'items',
 'keys',
 'lang',
 'non_public_metrics',
 'organic_metrics',
 'possibly_sensitive',
 'promoted_metrics',
 'public_metrics',
 'referenced_tweets',
 'reply_settings',
 'source',
 'text',
 'values',
 'withheld']

In [28]:
fetched.data[0].data

{'author_id': '935587117',
 'conversation_id': '276177111490904064',
 'lang': 'en',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 0,
  'quote_count': 0},
 'created_at': '2012-12-05T04:12:09.000Z',
 'id': '276177111490904064',
 'entities': {'annotations': [{'start': 62,
    'end': 74,
    'probability': 0.8335,
    'type': 'Place',
    'normalized_text': 'Hometown Cafe'},
   {'start': 79,
    'end': 91,
    'probability': 0.801,
    'type': 'Place',
    'normalized_text': 'Hohenwald, TN'}]},
 'text': 'We had a great time playing tonight for a packed house at the Hometown Cafe in Hohenwald, TN. Great people and great local talent!!'}

In [18]:
len(fetched.data)

44

In [22]:
tweets = fetched.data

In [21]:
fetched.errors

[{'value': '598730121197924352',
  'detail': 'Could not find tweet with ids: [598730121197924352].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'ids',
  'resource_id': '598730121197924352',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'resource_id': '925132136595701761',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '925132136595701761',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [925132136595701761].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id': '566720469412610049',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '566720469412610049',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [566720469412610049].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id':