Twitter scraping code: example of USA democratic socialists (case 1)

In [1]:
import tweepy
from tweepy import OAuthHandler

import pickle
import pandas as pd
import json

In [2]:
# Set up Twitter API access
with open("twitter_auth.json") as auth_file:
    auth_data = json.load(auth_file)

consumer_key = auth_data['consumer_key']
consumer_secret = auth_data['consumer_secret']
access_token = auth_data['access_token']
access_secret = auth_data['access_secret']

   
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify=True)

In [3]:
def scrape_tweets(accnames, df, save_name, save_every=20):
    """Scrape tweets from a list of accounts and store selected features"""
    for i,acc in enumerate(accnames):
        print(f'Going through {i} ({i}) - {acc}')
        results = tweepy.Cursor(api.user_timeline, screen_name=acc, tweet_mode="extended").items()
        try:
            json_data = [r._json for r in results]
            mini_df = pd.json_normalize(json_data)
            mini_df = mini_df[['created_at','id','full_text', 'retweet_count', 'favorite_count', \
                       'favorited', 'retweeted', 'possibly_sensitive', 'lang', 'user.id', 'user.id_str', 'user.name',\
                       'user.screen_name', 'user.location', 'user.description', 'user.url',
                        'user.followers_count', 'user.friends_count', 'user.created_at']]
            df = pd.concat([df, mini_df])
            if i%save_every==0: 
                print('Saving file')
                with open(save_name,'wb') as file:
                    pickle.dump(df,file)
        except Exception as e:
            print(e)
            pass
    return df

In [4]:
# Set the name of the movement here
movement = 'demsoc'
save_name = f'{movement}_tweets'
accnames_path = f'{movement}_accnames.csv'

In [5]:
# Get list of account names
accnames = pd.read_csv(accnames_path)
accnames = list(set(accnames['accnames']))

In [6]:
df = pd.DataFrame()
df = scrape_tweets(accnames, df, save_name)

Going through 0 (0) - nycYDSA
Saving file
Going through 1 (1) - cbkDSA
Going through 2 (2) - DSA_Immigration
Going through 3 (3) - dsa_louisville
Going through 4 (4) - nbkDSA
Going through 5 (5) - pghDSA
Going through 6 (6) - DemSocialists
Going through 7 (7) - YDSA_loyola
Going through 8 (8) - HeartOfIowaDSA
Going through 9 (9) - Boston_DSA


Rate limit reached. Sleeping for: 332


Going through 10 (10) - TampaDSA
Going through 11 (11) - salem_dsa
Going through 12 (12) - DSA_Enviro
Going through 13 (13) - uclaydsa
Going through 14 (14) - DSASantaCruz
Going through 15 (15) - ChambanaDSA
Going through 16 (16) - AnchorageDSA
Going through 17 (17) - CentralInDSA
Going through 18 (18) - QuietCorner_DSA


Rate limit reached. Sleeping for: 343


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 19 (19) - Antelope_DSA
Going through 20 (20) - DSAOrangeCounty
Saving file
Going through 21 (21) - BozemanDSA
Going through 22 (22) - IthacaDSA
Going through 23 (23) - ydsa_nu
Going through 24 (24) - NorthNJDSA
Going through 25 (25) - DSAPress
Going through 26 (26) - NorthShoreDSA
Going through 27 (27) - ydsfresnostate
Going through 28 (28) - lawrenceksdsa
Going through 29 (29) - CharlestonDSA
Twitter error response: status code = 401
Going through 30 (30) - ChicagoCityDSA
Going through 31 (31) - DSALongBeach


Rate limit reached. Sleeping for: 359


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 32 (32) - whatcomdsa
Going through 33 (33) - ydsumn
Going through 34 (34) - DSA_LosAngeles
Going through 35 (35) - dsaiowacity
Going through 36 (36) - DSA_SanDiego
Going through 37 (37) - GSO_DSA
Going through 38 (38) - dsantxcc
Going through 39 (39) - Billings_DSA
Going through 40 (40) - austin_DSA


Rate limit reached. Sleeping for: 352


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 41 (41) - AuburnDSA
Going through 42 (42) - okcdsausa
Going through 43 (43) - BoiseDSA
Going through 44 (44) - DSASnoCo
Going through 45 (45) - JerseyShoreDSA
Going through 46 (46) - BuffaloDSA
Going through 47 (47) - bernieDSAPDX
Going through 48 (48) - DSA_Cleveland
Going through 49 (49) - wes_ydsa
Going through 50 (50) - AshevilleDSA
Going through 51 (51) - VT_YDSA
Going through 52 (52) - LakeMcHenryDSA


Rate limit reached. Sleeping for: 356


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 53 (53) - HuronValleyDSA
Going through 54 (54) - SV_DSA
Going through 55 (55) - DSA_WesternCT
Going through 56 (56) - CapDistrictDSA
Going through 57 (57) - BuxMontDSA
Going through 58 (58) - YaleYDSA
Going through 59 (59) - YooperDSA


Rate limit reached. Sleeping for: 366


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 60 (60) - DSALaPlataCo
Saving file
Going through 61 (61) - nyuydsa
Going through 62 (62) - SDSA4Bernie
Going through 63 (63) - DSA_NorthTexas
Going through 64 (64) - Wichita_DSA
Going through 65 (65) - ncwvdsa
Going through 66 (66) - PortlandDSA
Going through 67 (67) - smithcollegeyds
Going through 68 (68) - NoMiDSA
Going through 69 (69) - dsasouthjersey
Going through 70 (70) - LasVegasDSA


Rate limit reached. Sleeping for: 366


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 71 (71) - DSM_DSA
Going through 72 (72) - DSA_Dayton
Going through 73 (73) - losalydsa
Going through 74 (74) - DSAolympia
Going through 75 (75) - SEO_DSA
Going through 76 (76) - N_Indiana_DSA
Going through 77 (77) - DSAMadison
Going through 78 (78) - ydsancsu
Going through 79 (79) - KVDSA
Going through 80 (80) - WorcDSA
Saving file
Going through 81 (81) - SEAlaska_DSA
Going through 82 (82) - HudCoDSA
Going through 83 (83) - ctdsa
Twitter error response: status code = 404
Going through 84 (84) - BloNo_DSA
Going through 85 (85) - Orlando_DSA


Rate limit reached. Sleeping for: 333


Going through 86 (86) - YDSA_WM
Going through 87 (87) - SeattleDSA
Going through 88 (88) - PalmBeachDSA
Going through 89 (89) - Columbus_DSA
Going through 90 (90) - Montana_DSA
Going through 91 (91) - PhillyDSA
Going through 92 (92) - FoothillsDSA
Going through 93 (93) - BrazosValleyDSA
Going through 94 (94) - YDSA_OU
Going through 95 (95) - SangamonDSA
Going through 96 (96) - DSA_KC
Going through 97 (97) - snhdsa


Rate limit reached. Sleeping for: 357


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 98 (98) - SEUYDSA
Going through 99 (99) - hunter_ydsa
Going through 100 (100) - LansingDSA
Saving file
Going through 101 (101) - MilwaukeeDSA
Going through 102 (102) - ColumbiaDSA
Going through 103 (103) - BU_YDSA
Going through 104 (104) - CentralFLDSA
Going through 105 (105) - N_NevadaDSA
Going through 106 (106) - DSAVeterans
Going through 107 (107) - PinellasDSA
Going through 108 (108) - DBQSocialists
Going through 109 (109) - rrvdsa
Going through 110 (110) - DSA_Jax
Going through 111 (111) - DSARichmond


Rate limit reached. Sleeping for: 340


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 112 (112) - epwvdsa
Going through 113 (113) - YDS_BC
Going through 114 (114) - InlandEmpireDSA
Going through 115 (115) - DSAdenver
Going through 116 (116) - BhamDSA
Going through 117 (117) - DSA_Phoenix
Going through 118 (118) - dsa_rva
Going through 119 (119) - DSANorthernIL
Going through 120 (120) - YDSAosu
Saving file
Going through 121 (121) - nycDSA


Rate limit reached. Sleeping for: 326


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 122 (122) - ProvDSA
Going through 123 (123) - SV_DSA_SJ
Going through 124 (124) - dsam4a
Going through 125 (125) - PVDemSoc
Going through 126 (126) - CapeCodDSA
Going through 127 (127) - slcDSA
Going through 128 (128) - QueerDSA


Rate limit reached. Sleeping for: 345


Going through 129 (129) - abqdsa
Going through 130 (130) - DSADurham
Going through 131 (131) - kansascitydsa
Going through 132 (132) - NewOrleansDSA
Going through 133 (133) - cltdsa
Going through 134 (134) - DSAarkansas
Going through 135 (135) - Pensasocialists
Going through 136 (136) - middletndsa
"['possibly_sensitive'] not in index"
Going through 137 (137) - VCU_YDSA
Going through 138 (138) - BatonRougeDSA
Going through 139 (139) - DSALincolnNE
Going through 140 (140) - YDSAGT
Saving file
Going through 141 (141) - MiamiDSA


Rate limit reached. Sleeping for: 349


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 142 (142) - stlouisdsa
Going through 143 (143) - MSU_YDSA
Going through 144 (144) - SeattleDSA_East
Going through 145 (145) - SoKYDSA
Going through 146 (146) - SouthernUtahDSA
Going through 147 (147) - SWMIDSA
Going through 148 (148) - SuffolkDSA
Going through 149 (149) - DSAEastBay
Going through 150 (150) - DSAoftheRGV
Going through 151 (151) - YDSAArcadia
Twitter error response: status code = 404
Going through 152 (152) - NassauDSA
Going through 153 (153) - OmahaDSA


Rate limit reached. Sleeping for: 293


Going through 154 (154) - DSA_SF
Going through 155 (155) - BloomingtonDSA
Going through 156 (156) - PortageDSA
Going through 157 (157) - ydsa_sdsu
Going through 158 (158) - rpiYDSA
Going through 159 (159) - CDSAForBernie
Going through 160 (160) - DSAVentura
Saving file
Going through 161 (161) - bmoreDSA
Going through 162 (162) - DaytonaBeachDSA
Going through 163 (163) - DSA_AugustaCSRA
Going through 164 (164) - SoInd_DSA
Going through 165 (165) - TacomaDSA
Going through 166 (166) - KnoxvilleDSA


Rate limit reached. Sleeping for: 351


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 167 (167) - WestSoundDSA
Going through 168 (168) - akrondsa
Going through 169 (169) - DSA_SouthernME
Going through 170 (170) - dsa_chatt
Going through 171 (171) - CentralNJDSA
Going through 172 (172) - southern_il_dsa
Going through 173 (173) - UptownBronxDSA
Going through 174 (174) - TallahasseeDSA
Going through 175 (175) - QuadCitiesDSA
Going through 176 (176) - rocDSA


Rate limit reached. Sleeping for: 332


Going through 177 (177) - DSAFundathon
Going through 178 (178) - MemphisDSA
Going through 179 (179) - MobileDSA
Going through 180 (180) - DSALehighValley
Saving file
Going through 181 (181) - DSASantaFe
Going through 182 (182) - DSADesignCmte
Going through 183 (183) - chicodsa
Going through 184 (184) - BrowardDSA
Going through 185 (185) - DSA_Eugene
Going through 186 (186) - unc_ydsa


Rate limit reached. Sleeping for: 331


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 187 (187) - dsa_lsc
Going through 188 (188) - SV_DSA_LP
Going through 189 (189) - TidewaterDSA
Going through 190 (190) - BoulderDSA
Going through 191 (191) - sbkDSA
Going through 192 (192) - YDSABloomington
Going through 193 (193) - mdc_dsa
Going through 194 (194) - mhvdsa
Going through 195 (195) - DSACincy
Going through 196 (196) - greencountrydsa
Going through 197 (197) - WesternMT_DSA


Rate limit reached. Sleeping for: 336


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 198 (198) - champvalleydsa
Going through 199 (199) - PBDSA
Going through 200 (200) - detroitdsa
Saving file
Going through 201 (201) - GrandRapidsDSA
Going through 202 (202) - harrisburg_dsa
Going through 203 (203) - lhvdsa
Going through 204 (204) - SE_WY_DSA
Going through 205 (205) - QueensDSA
Going through 206 (206) - eriedsa
Going through 207 (207) - CvilleDSA
Going through 208 (208) - littlerockDSA
Going through 209 (209) - dsanorthbay
Going through 210 (210) - Lancaster_DSA


Rate limit reached. Sleeping for: 343


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 211 (211) - syrDSA
Going through 212 (212) - HoustonDSA
Going through 213 (213) - DSA_law
Going through 214 (214) - LowManDSA
Going through 215 (215) - TwinCitiesDSA
Going through 216 (216) - BrookingsDSA
"['possibly_sensitive'] not in index"
Going through 217 (217) - DSASac
Going through 218 (218) - palousedsa
Going through 219 (219) - sb_dsa
Going through 220 (220) - fxbgdsa
Saving file
Going through 221 (221) - centrecodsa


Rate limit reached. Sleeping for: 344


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 222 (222) - YDSA_
Going through 223 (223) - DSA_Tucson
Going through 224 (224) - MetroATLDSA
Going through 225 (225) - dsa_nova
Going through 226 (226) - YDS_Princeton
Going through 227 (227) - Yallidarity
Going through 228 (228) - SpringfieldDSA
Twitter error response: status code = 404
Going through 229 (229) - MidMoDSA
Going through 230 (230) - MidTNDSA
Going through 231 (231) - WestSuburbILDSA
Going through 232 (232) - DSAEmerge
Going through 233 (233) - YDSAumd
Going through 234 (234) - DSA_Labor


Rate limit reached. Sleeping for: 348


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 235 (235) - dsa_fresno
Going through 236 (236) - NC_DSA
Going through 237 (237) - spokanedsa
Going through 238 (238) - LexingtonDSA
Going through 239 (239) - northeasttndsa
Going through 240 (240) - nycDSAantiwar
Saving file
Going through 241 (241) - HaysSocialists
Going through 242 (242) - PomonaValleyDSA
Going through 243 (243) - SpaceCoastDSA
Going through 244 (244) - SanAntonioDSA
Going through 245 (245) - nwohiodsa
Going through 246 (246) - DSA_tuscaloosa
Going through 247 (247) - NYCDSA_Climate
Going through 248 (248) - PeoriaDSA


Rate limit reached. Sleeping for: 315


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))


In [7]:
def check_completeness(accnames, df):
    """Check whether all accnames have been scraped and return a list of those unscraped"""
    lower_scraped_accnames = [acc.lower() for acc in list(set(df['user.screen_name'].values))]
    lower_accnames = [acc.lower() for acc in list(set(accnames))]
    rest_accnames = list(set(lower_accnames) - set(lower_scraped_accnames))
    return rest_accnames

In [8]:
# Scrape any leftover accnames
rest_accnames = check_completeness(accnames, df)
if len(rest_accnames)>0:
    df = scrape_tweets(rest_accnames, df,  save_name)

Going through 0 (0) - springfielddsa
Twitter error response: status code = 404
Going through 1 (1) - peoriadsa
Going through 2 (2) - nycdsa
Going through 3 (3) - charlestondsa
Twitter error response: status code = 401
Going through 4 (4) - ctdsa
Twitter error response: status code = 404
Going through 5 (5) - knoxvilledsa
Going through 6 (6) - brookingsdsa
"['possibly_sensitive'] not in index"
Going through 7 (7) - quietcorner_dsa
Going through 8 (8) - westernmt_dsa
Going through 9 (9) - snhdsa
Going through 10 (10) - miamidsa
Going through 11 (11) - dsarichmond
Going through 12 (12) - lakemchenrydsa
Going through 13 (13) - yooperdsa


Rate limit reached. Sleeping for: 322


Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Going through 14 (14) - centrecodsa
Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out.
Going through 15 (15) - dsalongbeach
Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /1.1/statuses/user_timeline.json?screen_name=dsalongbeach&tweet_mode=extended (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000019C396DA148>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
Going through 16 (16) - ydsaarcadia
Twitter error response: status code = 404
Going through 17 (17) - dsa_labor
Going through 18 (18) - lasvegasdsa
Going through 19 (19) - austin_dsa
Going through 20 (20) - lanca

In [9]:
# Check df shape and number of unique accounts
print(df.shape)
print(len(set(df['user.screen_name'].values)))
# Drop duplicates and check shape/number again
df = df.drop_duplicates(subset = ['id', 'full_text'], keep='first')
print(df.shape)
print(len(set(df['user.screen_name'].values)))

(387854, 19)
240
(387854, 19)
240


In [11]:
# Save final df
with open(save_name,'wb') as file:
    pickle.dump(df,file)