## Twitter scraping code
Example of USA Sunrise Movement

In [1]:
import tweepy
from tweepy import OAuthHandler

import pickle
import pandas as pd
import json

In [2]:
# Set up Twitter API access
with open("../twitter_auth.json") as auth_file:
    auth_data = json.load(auth_file)

consumer_key = auth_data['consumer_key']
consumer_secret = auth_data['consumer_secret']
access_token = auth_data['access_token']
access_secret = auth_data['access_secret']

   
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify=True)

In [3]:
def scrape_tweets(accnames, df, save_name, save_every=20):
    """Scrape tweets from a list of accounts and store selected features"""
    for i,acc in enumerate(accnames):
        print(f'Going through {i} ({i}) - {acc}')
        results = tweepy.Cursor(api.user_timeline, screen_name=acc, tweet_mode="extended").items()
        try:
            json_data = [r._json for r in results]
            mini_df = pd.json_normalize(json_data)
            mini_df = mini_df[['created_at','id','full_text', 'retweet_count', 'favorite_count', \
                       'favorited', 'retweeted', 'possibly_sensitive', 'lang', 'user.id', 'user.id_str', 'user.name',\
                       'user.screen_name', 'user.location', 'user.description', 'user.url',
                        'user.followers_count', 'user.friends_count', 'user.created_at']]
            df = pd.concat([df, mini_df])
            if i%save_every==0: 
                print('Saving file')
                with open(save_name,'wb') as file:
                    pickle.dump(df,file)
        except Exception as e:
            print(e)
            pass
    return df

In [4]:
# Set the name of the movement here
movement = 'sunrise'
save_name = f'data/raw/{movement}_tweets'
accnames_path = f'data/accnames/{movement}_accnames.csv'

In [5]:
# Get list of account names
accnames = pd.read_csv(accnames_path)
accnames = list(set(accnames['accnames']))

In [6]:
df = pd.DataFrame()
df = scrape_tweets(accnames, df, save_name)

Going through 0 (0) - SunriseChico
Saving file
Going through 1 (1) - sunrisemvmtkc
Going through 2 (2) - SunriseWhb
Going through 3 (3) - sunrisemvmtgr
Going through 4 (4) - Sunrise_Davis
Going through 5 (5) - sunrisemvmtCLT
Going through 6 (6) - sunriseslvrsprg
Going through 7 (7) - sunrisemvmtSC
Going through 8 (8) - SunriseMvmtDC
Going through 9 (9) - SunriseMadison1
Going through 10 (10) - sunrisemvmtbgky
Going through 11 (11) - sunriseCWRU
Going through 12 (12) - wmsunrisemvmt
Going through 13 (13) - GreenTheDollar
Going through 14 (14) - SunriseKnox
Going through 15 (15) - SunriseMaine
Going through 16 (16) - sunriseburly
Going through 17 (17) - SunriseMvmtCLE
Going through 18 (18) - SunriseRivertwn
Going through 19 (19) - SE_WY_DSA
Going through 20 (20) - sunrise_rgv
Saving file
Going through 21 (21) - SunriseNOLA
Going through 22 (22) - SunriseWake
Going through 23 (23) - SunriseCville
Going through 24 (24) - SunriseTLH
Going through 25 (25) - sunrisemvmtFLG
Going through 26 (2

Rate limit reached. Sleeping for: 343


Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Going through 45 (45) - SacSunrise
Going through 46 (46) - sunrisenashtn
Going through 47 (47) - SunriseBerks
Going through 48 (48) - SunriseLAYouth
Going through 49 (49) - sunrisemvmtokc
Going through 50 (50) - SunriseJoliet
Going through 51 (51) - SCsunrisemvmt
Going through 52 (52) - sunrisechiarea
Going through 53 (53) - SunriseOberlin
Going through 54 (54) - sunrisebham
Going through 55 (55) - sunrisecolumbia
Going through 56 (56) - BmcSunrise
Going through 57 (57) - sunrise_slo
Going through 58 (58) - Sunrise_WA
Going through 59 (59) - Sunrise_JS
Going through 60 (60) - Lex_Sunrise
Saving file
Going through 61 (61) - sunrisemvmtnova
Going through 62 (62) - sunrisemvmt
Going through 63 (63) - sunriseneu
Going through 64 (64) - sunrisemvmtCR
Going through 65 (65) - BoulderSunrise
Going through 66 (66) - SunrisePDX
Going thr

Rate limit reached. Sleeping for: 346


Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Going through 81 (81) - SunriseMvmtLA
Going through 82 (82) - sunrisemvmtgwu
Going through 83 (83) - oc_sunrise
Going through 84 (84) - SunriseMvmtTC
Going through 85 (85) - SunriseBoise
Going through 86 (86) - sunrise_indy
Going through 87 (87) - SunriseGso
Going through 88 (88) - Sunrisemvmt_sea
Going through 89 (89) - SunriseLuc
Going through 90 (90) - SunriseNewHaven
Going through 91 (91) - SunriseMvmtHTX
Going through 92 (92) - sunriseatx
Going through 93 (93) - StlSunrise
Going through 94 (94) - SunrisemvmtCorv
Going through 95 (95) - sunrisemvmtnyc
Going through 96 (96) - sunriseannarbor
Going through 97 (97) - sunrise_PVD
Going through 98 (98) - BeavertonHub
Going through 99 (99) - SunriseLansing
Going through 100 (100) - sunrisefcma
Saving file
Going through 101 (101) - SunrisePaloAlto
Going through 102 (102) - sunrise

Rate limit reached. Sleeping for: 340


Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Going through 109 (109) - SunriseCADesert
Going through 110 (110) - SunriseMKE
Going through 111 (111) - SunriseKern
Going through 112 (112) - sunrisebayarea
Going through 113 (113) - sunrisemvmt_rb
Going through 114 (114) - SunriseCNY
Going through 115 (115) - morrissunrise
Going through 116 (116) - HubScioto
Going through 117 (117) - SunriseDanbury
Going through 118 (118) - sunriseeasthamp
Going through 119 (119) - SunriseSouthB
Going through 120 (120) - RoanokeSunrise
Saving file
Going through 121 (121) - SunriseFresno
Going through 122 (122) - sunrisecapecod


In [7]:
def check_completeness(accnames, df):
    """Check whether all accnames have been scraped and return a list of those unscraped"""
    lower_scraped_accnames = [acc.lower() for acc in list(set(df['user.screen_name'].values))]
    lower_accnames = [acc.lower() for acc in list(set(accnames))]
    rest_accnames = list(set(lower_accnames) - set(lower_scraped_accnames))
    return rest_accnames

In [8]:
# Scrape any leftover accnames
rest_accnames = check_completeness(accnames, df)
if len(rest_accnames)>0:
    df = scrape_tweets(rest_accnames, df,  save_name)

Going through 0 (0) - sunrisekzoo
Saving file
Going through 1 (1) - sunriseboston
Going through 2 (2) - sunrisemvmtrva


In [9]:
# Check df shape and number of unique accounts
print(df.shape)
print(len(set(df['user.screen_name'].values)))
# Drop duplicates and check shape/number again
df = df.drop_duplicates(subset = ['id', 'full_text'], keep='first')
print(df.shape)
print(len(set(df['user.screen_name'].values)))

(56464, 19)
123
(56464, 19)
123


In [10]:
# Save final df
with open(save_name,'wb') as file:
    pickle.dump(df,file)