# Get random tweets for neutral (non-white supremacist) data

In [2]:
# Load white supremacist dataset to count tweets over time
import datetime
import pandas as pd

path = '../tmp/white_supremacist_corpus.pkl'
ws_data = pd.read_pickle(path)
ws_data.domain.unique()

array(['tweet/short propaganda', 'forum', 'chat', 'long-form'],
      dtype=object)

In [5]:
# Select forum data, Group by year
yearly = ws_data.query('domain=="tweet/short propaganda"').groupby(by=ws_data.timestamp.dt.year)['text'].count()
lookup = pd.DataFrame(yearly)
lookup['begin'] = pd.to_datetime(yearly.index.astype(int).astype(str), format='%Y')
lookup['end'] = lookup.begin.shift(-1, fill_value = datetime.datetime(2018,1,1))
lookup.index.name = 'year'
lookup.index = lookup.index.astype(int)
lookup.rename(columns={'text': 'post_count'}, inplace=True)
lookup

Unnamed: 0_level_0,post_count,begin,end
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009,256,2009-01-01,2010-01-01
2010,698,2010-01-01,2011-01-01
2011,722,2011-01-01,2012-01-01
2012,1212,2012-01-01,2013-01-01
2013,2807,2013-01-01,2014-01-01
2014,13958,2014-01-01,2015-01-01
2015,15995,2015-01-01,2016-01-01
2016,21132,2016-01-01,2017-01-01
2017,36666,2017-01-01,2020-01-01
2020,354,2020-01-01,2021-01-01


In [6]:
# Try to get random tweets
import tweepy
import pandas as pd

# Load authentication
keys = pd.read_csv('/storage2/mamille3/tweepy_oauth_academic.csv', index_col='name').to_dict()['key']
client = tweepy.Client(keys['bearer_token'])

In [9]:
from datetime import datetime

tweet_fields = [
    'id', 'created_at', 'text', 'author_id', 'conversation_id', 'entities', 'public_metrics', 'geo', 'lang', 'referenced_tweets'
]
place_fields = [
    'full_name', 'id', 'contained_within'
]

# Want random tweets
query = 'the'
start_time = datetime(2016,1,1)
end_time = datetime(2016,12,31)

fetched = client.search_all_tweets(query, expansions='geo.place_id', place_fields=place_fields, tweet_fields=tweet_fields, start_time=start_time, end_time=end_time, max_results=10)
fetched.data

BadRequest: 400 Bad Request
There were errors processing your request: Rules must contain at least one positive, non-stopword clause (at position 1)

# Get tweets using the API from tweet IDs
Also see get_tweets.py

In [41]:
import tweepy
import pandas as pd

# Load authentication
keys = pd.read_csv('/storage2/mamille3/tweepy_oauth_academic.csv', index_col='name').to_dict()['key']
client = tweepy.Client(keys['bearer_token'], wait_on_rate_limit=True)

In [43]:
# Load ElSherief+2021 tweet IDs
stg1_meta = pd.read_csv('/storage2/mamille3/data/hate_speech/elsherief2021/implicit_hate_v1_stg1.tsv', sep='\t')
stg1_meta_tweets = stg1_meta[~stg1_meta['ID'].str.contains('_')].copy()
tweet_ids = stg1_meta_tweets['ID']
tweet_ids

# Load Qian+2018 tweet IDs
# fpath = '/storage2/mamille3/data/hate_speech/qian2018/white_supremacist_tweets.csv'
# tweet_ids = pd.read_csv(fpath)['tweet id']
# tweet_ids

0        399886440588247041
1        929901925100937216
2        728678509497954304
3        625688620444180481
4        441089979322597376
                ...        
20804    892542828475285504
20805    900565598358425600
20806    762438713397813120
20807    828761621170421760
20808    850115546746146816
Name: ID, Length: 20809, dtype: object

In [44]:
# Break up into lists of 100 (max for Tweepy)
chunks = [tweet_ids[x:min(x+100, len(tweet_ids))].values.tolist() for x in range(0, len(tweet_ids), 100)]
print(len(chunks))
responses = []

209


In [46]:
# Do query and save results out
import os
from tqdm.notebook import tqdm
import json
# dataset = 'qian2018'
dataset = 'elsherief2021'

tweet_fields = [
    'id', 'created_at', 'text', 'author_id', 'conversation_id', 'entities', 'public_metrics', 'geo', 'lang', 'referenced_tweets'
]
user_fields = [
    'id', 'name', 'username', 'description'
]

for chunk in tqdm(chunks[:5]):
    response = client.get_tweets(chunk, expansions=['author_id', 'entities.mentions.username'], tweet_fields=tweet_fields, user_fields=user_fields)

    out_dirpath = os.path.join('/storage2/mamille3/white_supremacist_lang/data', dataset)
    if not os.path.exists(out_dirpath):
        os.mkdir(out_dirpath)

    # Save out tweet data
    outpath = os.path.join(out_dirpath, 'data.jsonl')
    tweets = [tweet.data for tweet in response.data]
    with open(outpath, 'a') as f:
        f.write('\n'.join([json.dumps(tweet) for tweet in tweets]) + '\n')

    # Save out users
    outpath = os.path.join(out_dirpath, 'users.jsonl')
    users = [user.data for user in response.includes['users']]
    with open(outpath, 'a') as f:
        f.write('\n'.join([json.dumps(user) for user in users]) + '\n')

    # Save out errors
    outpath = os.path.join(out_dirpath, 'errors.jsonl')
    with open(outpath, 'a') as f:
        f.write('\n'.join([json.dumps(error) for error in response.errors]) + '\n')

  0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
dir(responses[0])

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'count',
 'data',
 'errors',
 'includes',
 'index',
 'meta']

In [33]:
responses[0].errors

[{'value': '598730121197924352',
  'detail': 'Could not find tweet with ids: [598730121197924352].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'ids',
  'resource_id': '598730121197924352',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'resource_id': '925132136595701761',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '925132136595701761',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [925132136595701761].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id': '566720469412610049',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '566720469412610049',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [566720469412610049].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id':

In [14]:
len(responses)

1963

In [15]:
len(chunks)

1963

In [20]:
type(responses[0].data)

list

In [22]:
len([tweet.data for r in responses for tweet in r.data])

240

In [24]:
# Save out tweet data
import json

outpath = '/storage2/mamille3/white_supremacist_lang/data/qian2018/data.jsonl'
tweets = [tweet.data for r in responses for tweet in r.data]
with open(outpath, 'a') as f:
    f.write('\n'.join([json.dumps(tweet) for tweet in tweets]))

In [28]:
responses[0].includes['users'][0].data

{'username': 'DixieRepublic',
 'name': 'Dixie Republic',
 'id': '935587117',
 'description': 'Hard Edge Southern Rock! Our new album Redneck way now available on iTunes. http://t.co/QiKgYUZ8ps'}

In [29]:
# Save out users
import json

outpath = '/storage2/mamille3/white_supremacist_lang/data/qian2018/users.jsonl'
users = [user.data for r in responses for user in r.includes['users']]
with open(outpath, 'a') as f:
    f.write('\n'.join([json.dumps(tweet) for tweet in tweets]))

In [30]:
# Load saved out users
import json

path = '/storage2/mamille3/white_supremacist_lang/data/qian2018/users.jsonl'
with open(path) as f:
    users = [json.loads(line) for line in f.read().splitlines()]
len(users)

240

In [36]:
fetched.includes

{'users': [<User id=935587117 name=Dixie Republic username=DixieRepublic>,
  <User id=96464367 name=Sloane Spencer Talks username=CountryFriedROK>,
  <User id=42877116 name=Ric Geyer username=RicGeyer>,
  <User id=790680583 name=Stanley D. Wyatt username=StanleyDWyatt>,
  <User id=285754520 name=Dylan Mathis username=DylanPaulMathis>,
  <User id=50872927 name=Lisa Hayes username=lisahayes4>,
  <User id=388823538 name=IG: MRDJNY username=DJNY1>,
  <User id=278806606 name=The Write Hook username=IAMTHEWRITEHOOK>,
  <User id=181720822 name=Christopher Shawn Shaw ðŸŽ¥ ðŸŽ¬ username=directorCSS>,
  <User id=23725431 name=Sharon Hope username=sharonhope2>,
  <User id=396652848 name=alan millns username=alanmillns>,
  <User id=17209867 name=Jon Franchino username=JonFranchino>,
  <User id=33352787 name=Abraham Lincoln's Log Cabin username=Mr_Lincoln>,
  <User id=2809256199 name=Bob Jackrabit username=scwhiterabbit>,
  <User id=1055892414 name=Country Soundtrack username=18country18>,
  <User 

In [31]:
len(tweets)

44

In [33]:
tweets[5].data

{'author_id': '396652848',
 'created_at': '2017-01-23T22:00:48.000Z',
 'text': 'Support the Cause! https://t.co/XCz6Xdsl3b via @@dixienetdotorg I support !!!',
 'conversation_id': '823651725831716866',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 1,
  'quote_count': 0},
 'entities': {'urls': [{'start': 19,
    'end': 42,
    'url': 'https://t.co/XCz6Xdsl3b',
    'expanded_url': 'http://leagueofthesouth.com/support-the-cause/',
    'display_url': 'leagueofthesouth.com/support-the-caâ€¦'}],
  'mentions': [{'start': 48,
    'end': 63,
    'username': 'dixienetdotorg',
    'id': '36315753'}]},
 'id': '823651725831716866',
 'lang': 'en'}

In [34]:
tweets[5].includes

AttributeError: 

In [23]:
type(fetched.data[-1])

tweepy.tweet.Tweet

In [25]:
dir(fetched.data[-1])

['__abstractmethods__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_abc_impl',
 'attachments',
 'author_id',
 'context_annotations',
 'conversation_id',
 'created_at',
 'data',
 'entities',
 'geo',
 'get',
 'id',
 'in_reply_to_user_id',
 'items',
 'keys',
 'lang',
 'non_public_metrics',
 'organic_metrics',
 'possibly_sensitive',
 'promoted_metrics',
 'public_metrics',
 'referenced_tweets',
 'reply_settings',
 'source',
 'text',
 'values',
 'withheld']

In [28]:
fetched.data[0].data

{'author_id': '935587117',
 'conversation_id': '276177111490904064',
 'lang': 'en',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 0,
  'quote_count': 0},
 'created_at': '2012-12-05T04:12:09.000Z',
 'id': '276177111490904064',
 'entities': {'annotations': [{'start': 62,
    'end': 74,
    'probability': 0.8335,
    'type': 'Place',
    'normalized_text': 'Hometown Cafe'},
   {'start': 79,
    'end': 91,
    'probability': 0.801,
    'type': 'Place',
    'normalized_text': 'Hohenwald, TN'}]},
 'text': 'We had a great time playing tonight for a packed house at the Hometown Cafe in Hohenwald, TN. Great people and great local talent!!'}

In [18]:
len(fetched.data)

44

In [22]:
tweets = fetched.data

In [21]:
fetched.errors

[{'value': '598730121197924352',
  'detail': 'Could not find tweet with ids: [598730121197924352].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'ids',
  'resource_id': '598730121197924352',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'resource_id': '925132136595701761',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '925132136595701761',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [925132136595701761].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id': '566720469412610049',
  'parameter': 'ids',
  'resource_type': 'tweet',
  'section': 'data',
  'title': 'Authorization Error',
  'value': '566720469412610049',
  'detail': 'Sorry, you are not authorized to see the Tweet with ids: [566720469412610049].',
  'type': 'https://api.twitter.com/2/problems/not-authorized-for-resource'},
 {'resource_id':