# Download Twitter followers for a set of users

### Notebook Author: Nikhil Utane


In [1]:
# Pip install GetOldTweets3 if you don't already have the package
# !pip install GetOldTweets3

# Imports
import GetOldTweets3 as got
import pandas as pd
import time
import tweepy
import csv
import sys
from collections import Counter 
import requests
import http.client, urllib
import re
import glob
import os

#### Read all the security tokens from a keys.py file

In [2]:
from keys import keys #keep keys in separate file, keys.py

consumer_key = keys['consumer_key']
consumer_secret = keys['consumer_secret']
access_token = keys['access_token']
access_token_secret = keys['access_token_secret']
pushover_token = keys['pushover_token']
pushover_user = keys['pushover_user']

#### I am using Pushover to notify me if any cell stops running

In [3]:
def pushoverNotify():
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    conn.request("POST", "/1/messages.json",
      urllib.parse.urlencode({
        "token": pushover_token,
        "user": pushover_user,
        "message": "Cell finished execution",
      }), { "Content-type": "application/x-www-form-urlencoded" })
    r=conn.getresponse()
    print(r.status, r.reason)

In [4]:
from IPython.core.magic import register_cell_magic

# Register the magic so that later any cell that we want to be notified on exception can be done
@register_cell_magic('handle')
def handle(line, cell):
    try:
        exec(cell)        
    except Exception as e:
        pushoverNotify()
        raise # if you want the full trace-back in the notebook


### Get List of Followers. 
#### We are getting the IDs since the rate limit for that is quite high ~45000 per 15 mins vs ~3000 for usernames
#### Then we'll convert ID to username and using GetOldTweets3 to download in bulk going as far back as 2014

In [5]:
# Define all the user configuration here
side = "left"

In [6]:
followers_folder = "../data/" + side + "/followers/"
tweets_folder = "../data/" + side + "/tweets/"
handles_file = "../data/" + side + "_handles.txt"
followers_id_file = followers_folder + "all_followers_id.txt"
followers_id_dedup_file = followers_folder + "all_followers_id_dedup.txt"
followers_username_file = followers_folder + "all_followers_username.txt"
fetched_username_files = tweets_folder + "fetched_list.txt"
GetOldTweets3_bin = "/home/nikhil/packages/GetOldTweets3/bin/GetOldTweets3"
processed_path=tweets_folder + "/processed"
processed_tweets_file=processed_path + "/all_tweets.txt"
cleaned_tweets_file=processed_path + "/all_tweets_cleaned.txt"

In [11]:
# Below source code credit: https://gist.github.com/PandaWhoCodes/46f58fdead71f4c71453d9ed1e21adf8
# Credentials
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)

def get_and_save_followers(user_name):
    """
    get a list of all followers of a twitter account
    :param user_name: twitter username without '@' symbol
    :return: list of usernames without '@' symbol
    """
    followers = []
    with open(followers_folder + user_name + "_followers_id.csv", 'w',encoding="utf-8") as output:
        for page in tweepy.Cursor(api.followers_ids, screen_name=user_name, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True).pages():
            sys.stdout.write(".")
            sys.stdout.flush()
            try:
                #followers.extend(page)
                for user_id in page:
                    output.write('%s\n' % user_id)
            except tweepy.TweepError as e:
                print("Going to sleep:", e)
                # Sleeping to slow down. Else we hit rate limit often
                time.sleep(60)
    return followers

### Read the initial list of handles

In [None]:
%%handle

with open(handles_file) as f:
    handles = [line.rstrip() for line in f]
    
for handle in handles:
    print("Getting followers for " + handle)
    followers = get_and_save_followers(handle)    
    print("Done.")
    

### Merge, de-duplicate and sort the followers list

In [7]:
print(followers_folder)
print(followers_id_file)
!echo $followers_folder/*.csv | xargs cat > $followers_id_file

../data/left/followers/
../data/left/followers/all_followers_id.txt


In [16]:
# initializing list 
with open(followers_id_file) as f:
    id_list = [line.rstrip() for line in f]

# printing original list 
print("Number of ids before dedup: %d" % len(id_list)) 

# using Counter.most_common() + list comprehension 
# sorting and removal of duplicates 
id_dedup = [key for key, value in Counter(id_list).most_common()] 

# print result 
print("Number of ids after dedup: {}. Percent reduced: {}".format(len(id_dedup), int((len(id_dedup)*100)/len(id_list))) )

with open(followers_id_dedup_file, "w") as output:
    for user_id in id_dedup:
        output.write('%s\n' % user_id)

Number of ids before dedup: 15360681
Number of ids after dedup: 8174646. Percent reduced: 53


### Convert IDs to usernames for GetOldTweets3 to fetch in bulk

In [None]:
%%handle
# We are doing a GET on a twitter link and parsing our the username, fastest way with no rate limiting
found = not_found = last_index = 0
user_list = []

# If you are resuming from somewhere in the middle, then uncomment below lines 
# and specify the last converted ID
print("Reading ID file");
with open(followers_id_dedup_file) as f:
    id_dedup = [line.rstrip() for line in f]

last_index = id_dedup.index("1108670208607174656")
del id_dedup[0:last_index+1]

with open(followers_username_file, "w") as output:
    count = last_index + 1
    for user_id in id_dedup:
        print("[%d] Converting %s" % (count, user_id) , end=' ');
        r = requests.get('https://twitter.com/intent/user?user_id=' + user_id)
        user_search=re.search('<title>.*\(@(.*)\).*</title>', r.content.decode('utf-8'), re.IGNORECASE)
        if user_search:
            username = user_search.group(1)            
            user_list.append(username)
            output.write('%s\n' % username)
            found += 1
            print("=> %s" % username);
        else:
            not_found += 1
            print("ID %s not found" % user_id);
        count += 1
        
    print("%d usernames found. %d not found." % found, not_found)   

Reading ID file
[5239] Converting 1676066965 => _satya2013
[5240] Converting 56391919 => Dr_Aqsa_Shaikh
[5241] Converting 855320152534949888 => MUSICALRAVI
[5242] Converting 331139323 => AlamKhursheed16
[5243] Converting 943273381 => genocidalwatch
[5244] Converting 2483123348 => basha_gbd
[5245] Converting 114375529 => alafroz
[5246] Converting 3042638899 => tjbamal
[5247] Converting 363672233 => hi2deva
[5248] Converting 545827473 => CaAjmalali
[5249] Converting 2480149844 => aqshaan
[5250] Converting 1210800139 => rabindradanre
[5251] Converting 708897650 => VishnuJerome
[5252] Converting 3270224028 => RakshithPradum1
[5253] Converting 2878159046 => Shamim_asif20
[5254] Converting 2534852109 => mnkhafil
[5255] Converting 1081939701815234560 => sachasid70
[5256] Converting 1316152746 => DrishtiWahi2
[5257] Converting 120282286 => Sham_Ash
[5258] Converting 20322596 => rpsingh15
[5259] Converting 525416536 => k_saifudheen
[5260] Converting 4883298308 => Nadeem_Dilli
[5261] Converting 

[5399] Converting 1233237047873298433 => junedkhanmewati
[5400] Converting 875952621898649600 => khillare_nagesh
[5401] Converting 1233061030185570304 => Karco44926885
[5402] Converting 1211972462251872256 => ms22951
[5403] Converting 1232521069216452608 => Twiteroo8
[5404] Converting 1187066512630149120 => SayeedSarwarKh1
[5405] Converting 166450788 => WantedImram
[5406] Converting 1229459859604672512 => pandeyji_abhi
[5407] Converting 713252669792456704 => SakruddeenQadri
[5408] Converting 1232730971935559680 => ManirAl24530980
[5409] Converting 1232904038716297217 => Namenotrequire
[5410] Converting 1217729177954082817 => GausMohamad2
[5411] Converting 1025673244844601344 => HumaMirza19
[5412] Converting 1230856054604353537 => SameerBoss20
[5413] Converting 2466726272 => kabir_quraishi
[5414] Converting 1229726602071769089 => TweeticianVRG
[5415] Converting 757588764151558145 => pritamsaha758
[5416] Converting 1232728509543518208 => IAMJnehal
[5417] Converting 1217696470213656576 =>

[5550] Converting 976503418087923712 => iamfsid
[5551] Converting 1216453248107696128 ID 1216453248107696128 not found
[5552] Converting 1058377071674634240 => JeeneKaAdhikar
[5553] Converting 836585448105512960 => Azizshekh14
[5554] Converting 1226913410069123072 => HarvijaySingh13
[5555] Converting 1216388204967776262 => SatyaPr90260478
[5556] Converting 1178720980509675520 => AngryAazam
[5557] Converting 729260246502391808 => OfficeOfShadab
[5558] Converting 1226853735520927744 => abdulkh81655612
[5559] Converting 1226706233429610497 => rajukumarkann12
[5560] Converting 1226770331190190081 => JawedAk94120808
[5561] Converting 1212301802655240192 => Pradeep87864398
[5562] Converting 1226730864773664774 => Afeef_16
[5563] Converting 1223649368944889861 => MushtsqSuhail
[5564] Converting 1226615521119850497 => MohdFai66208837
[5565] Converting 1215205031453941766 => rajeevp83579110
[5566] Converting 1106614003084734464 => MazharA28330496
[5567] Converting 1222176369070596096 => Lateefk

[5702] Converting 1199370819735998464 => Shahnaw66655085
[5703] Converting 1210864613782196229 => nobadi_mr
[5704] Converting 1207724462709149697 => lier2345
[5705] Converting 344687605 => shaikhrafi007
[5706] Converting 1217013887725273088 => PrinceK82411663
[5707] Converting 4358470634 => HasnatK56034256
[5708] Converting 844237760957485057 => imparvez_21
[5709] Converting 1212644227693215744 => MDLITON45428790
[5710] Converting 811560566577721344 => Rajesh01814
[5711] Converting 1222599706385272832 => nirvair44975966
[5712] Converting 1222610521137926144 => BagwanRuman
[5713] Converting 1029419670682497025 => ZainyAli3
[5714] Converting 251931158 => iamsairabano
[5715] Converting 1216625091489890306 => manpree57639334
[5716] Converting 1222542409466175490 => india_two
[5717] Converting 1215934799187038208 => imaseemit
[5718] Converting 1222520089196384256 => Hari62269785
[5719] Converting 1222521534629085185 => RashidManan2
[5720] Converting 1222159042375602176 => Asif64829605
[5721

[5854] Converting 1219141229511938048 => RashidA84473957
[5855] Converting 1149206739197153280 => MohdAti80606034
[5856] Converting 1139954441715474433 => skmdnijamuddin2
[5857] Converting 1214589310172827649 => SayedAliAlam1
[5858] Converting 1213090459301466114 => shafiqr19280376
[5859] Converting 1181540881540640768 => Amit78166089
[5860] Converting 1218924616703344640 => MaheshriSagar
[5861] Converting 1218894550099353600 => Shaukat33923881
[5862] Converting 1218554248364883968 => AzhanMomin2
[5863] Converting 1215108410770722816 => IrshadS65853807
[5864] Converting 2204470717 => rijvanmansuri1
[5865] Converting 1218731432262041600 => SranVicki
[5866] Converting 190275188 => pavi2051
[5867] Converting 1218608655802568704 => HussainGafur
[5868] Converting 1193903764664512512 => RavindraKumR3
[5869] Converting 1218563483781844998 => Zahir34497448
[5870] Converting 1212587665381445632 => AnwalaMangera
[5871] Converting 1218545714931744768 => SanjayG62402769
[5872] Converting 121853534

[6008] Converting 1214873767819784193 => MohdSha25907990
[6009] Converting 1215780935267844096 => Randhir60297381
[6010] Converting 3055029294 => devendra1920
[6011] Converting 1213378480517201920 => MUSTAFI26616740
[6012] Converting 1175058129127690241 => Ehatesham_s
[6013] Converting 1215518215171977217 => Rafique_Shaikh7
[6014] Converting 1164116047193972738 => AmirHamza432
[6015] Converting 1197504794081951749 => fakendrbahubali
[6016] Converting 217033323 => sarathsasi84
[6017] Converting 1114056526002413568 => Faiz95076113
[6018] Converting 1182537923519889408 => hussain09217336
[6019] Converting 1121327804396834816 => ma_bidr
[6020] Converting 1188811845617868803 => mirajulhoq
[6021] Converting 1215616680816041985 => Mahabat43288728
[6022] Converting 1215568725455167490 => UncleLash
[6023] Converting 1211959224877469698 => AnsaTextiles
[6024] Converting 1215506006983098368 => ShubhamBhuyat
[6025] Converting 1213707211752402944 => MdSNawaz8
[6026] Converting 1129318260757766145 =

[6163] Converting 1205590218348224512 => jamiamillia1
[6164] Converting 1198574051276406786 => wahmodiwah
[6165] Converting 1208237634138427392 => Bia58592180
[6166] Converting 1211473296618688513 => RaffanMd
[6167] Converting 3148024229 => ahmad_ansari008
[6168] Converting 1181817481863680001 => Mozammi10518186
[6169] Converting 900691482469728258 => AmishBadwaDlaal
[6170] Converting 1213482500195930115 => Liberty60632052
[6171] Converting 1123126291199168512 => tothepo27823768
[6172] Converting 1213776309437906944 => Jhasaram1
[6173] Converting 1212140813825667072 => ScamTd
[6174] Converting 1174217039516504064 => Azhan__wwe
[6175] Converting 4582409774 => AyeshaArabid
[6176] Converting 4710812466 => EmmZedd_
[6177] Converting 1213452442609733632 => Sayana_70
[6178] Converting 1094536592620802048 => Sarfara75837689
[6179] Converting 413537673 => Azadelahi
[6180] Converting 1206165425093898240 => SinchanSaibal
[6181] Converting 1211145463564398593 => HIRALAL77195518
[6182] Converting 

[6315] Converting 1181509694138880000 => ShahidA54406080
[6316] Converting 1208750356080910336 => MarathiMuggle
[6317] Converting 1172167688371634178 => Amer36840671
[6318] Converting 1206630383296450560 => AbdulRa59283989
[6319] Converting 816811812 => vicky_maninder
[6320] Converting 1210411173260095489 => Mohamma59332668
[6321] Converting 1195225270266949632 => Reyaz77707601
[6322] Converting 1210655371288891393 => ahad8988
[6323] Converting 1210199920017211397 => WahidHu84414232
[6324] Converting 1210605267747930112 => FirozKh60711296
[6325] Converting 1195170473438404609 => Arvind25277409
[6326] Converting 1140703940515749889 => MdRizwa25753758
[6327] Converting 1210553303181824000 => rookie_indian
[6328] Converting 1020910885601071104 => SyedImr47338845
[6329] Converting 804724425325236224 => VinayKrNalanda
[6330] Converting 1210282661672996864 => yee__haw
[6331] Converting 4845164755 => BureaucracyT
[6332] Converting 573023078 => koustubhbhattac
[6333] Converting 959000647910744

[6469] Converting 1207355917588684807 => SUHAILN95K
[6470] Converting 1207833969527525377 => Mohamme41980432
[6471] Converting 4396602569 => JayeshDhawal
[6472] Converting 1206321902462963712 => HShahzer
[6473] Converting 1207738128984731648 => Nishu28277361
[6474] Converting 1207740291823726592 => AMJ59014035
[6475] Converting 1207269937254764546 => qtYiZpMuLKrfHzE
[6476] Converting 1205243549702410241 => ErfanAhmad21
[6477] Converting 842051207992500224 => wik_riki
[6478] Converting 1205225843745681408 => RaquibulR
[6479] Converting 1069326345388335105 => RAFIISL64093598
[6480] Converting 1207702631256166401 => irresis09129970
[6481] Converting 1193917968687591424 => Abhishe04258440
[6482] Converting 1181565462695407616 => mitrron
[6483] Converting 3126449196 => 0990wasi
[6484] Converting 1206967473183920128 => VijaySa49196149
[6485] Converting 1207637869235326976 => AKILAHM06002895
[6486] Converting 1207622176301404160 => Murtuza68065819
[6487] Converting 1197015200349581312 => NikS

[6627] Converting 1177444576161742849 => AIMIM_BokaroJH
[6628] Converting 1205556056580968448 => NK37602984
[6629] Converting 1204639176936701952 => shaikh_umerah
[6630] Converting 1201443292086337536 => DeshBhakt_1_2
[6631] Converting 1194161569585721345 => PKMBalways
[6632] Converting 1202575221716176897 => Indiaforsocial1
[6633] Converting 1204255985541345281 => MotiramRaut5
[6634] Converting 1194826137752309760 => Mayankp94605290
[6635] Converting 1186600947021053954 => PavanSi77519906
[6636] Converting 1205108522414989312 => SAKEELK40733872
[6637] Converting 1204582240203591680 => RajRohi87025747
[6638] Converting 2642526276 => nazimghori
[6639] Converting 1102161355652694016 => Azlan_Sayed19
[6640] Converting 1204934738831540224 => ProudUseless
[6641] Converting 1121301926388408320 => FarazSady
[6642] Converting 1178224697479028736 => KrrishnaDilSe
[6643] Converting 854360983103459328 => MohdAhmad201
[6644] Converting 1201741174634975232 => PrinceI05169613
[6645] Converting 12047

[6780] Converting 994567798406959104 => Jim28764601
[6781] Converting 1195583070432198656 => NileshKr_Singh
[6782] Converting 1405667844 => satishkhatri12
[6783] Converting 1198266820098027520 => AdnanKhan1_1
[6784] Converting 1179079244221018112 => RahulKu40209087
[6785] Converting 888151062 => patils_army
[6786] Converting 1197208269288165376 => ImtiazA30020876
[6787] Converting 1111195818969784321 => innocent0indian
[6788] Converting 973737556662566912 => surya12p
[6789] Converting 1186831275304161280 => TouhidAhmadkha1
[6790] Converting 1068280024258744322 => LalitRa97169007
[6791] Converting 1197810518028890113 => iamT_Rana
[6792] Converting 1166351684982980610 => Bishnup64903047
[6793] Converting 810454459306360834 => Vip_Ahiwale
[6794] Converting 1184858554311725056 => Surende41026064
[6795] Converting 1184385272282337280 => Heartlysinghan1
[6796] Converting 1192940790189719552 => MuntazirSiddiq9
[6797] Converting 1194606950031319040 => LakhaniSharwan
[6798] Converting 119383320

[6935] Converting 1187325944823836672 => WasimAk35929840
[6936] Converting 1064705275175698433 => AmolJarhad007
[6937] Converting 910765954992427008 => imparagdk
[6938] Converting 1129031965032767489 => KShahrukh2121
[6939] Converting 2512893046 => shankarraja10
[6940] Converting 1161689195993280512 => Asman955
[6941] Converting 2230906615 => KhanAysha5
[6942] Converting 1186595688601153536 => WigglySoul
[6943] Converting 1186445043344785408 => HRahman0512
[6944] Converting 1181779211251372032 => MowgliWithPants
[6945] Converting 1182927317997412354 => CaptainRangade
[6946] Converting 1092114780364492800 => Geetanj57989967
[6947] Converting 1140119939769962497 => AbhiramMatkar
[6948] Converting 1185212797585637376 => TayyibReza
[6949] Converting 2913077863 => sparsh4rays
[6950] Converting 1186119819755581440 => AshwinnRN
[6951] Converting 1180864322722861056 => paradox_zone
[6952] Converting 1154846445079388160 => TheBeautyOfNat9
[6953] Converting 1185631930705944577 => infofreak4
[695

[7089] Converting 1166060054379933696 => beigzakir1
[7090] Converting 1172036630418350081 => Appuraje2
[7091] Converting 1172007699464122368 => Krishna67292065
[7092] Converting 1171819820934811648 => VinodVe59682445
[7093] Converting 1166919041446842368 => ArshuBaba1
[7094] Converting 149791083 => vickypiedpiper
[7095] Converting 1138433752772665345 => Atif85592724
[7096] Converting 835189226933104640 => shakibansaristg
[7097] Converting 1162648768195338240 => GauravS77609188
[7098] Converting 1171441863175200768 => ChunnuMunnu8
[7099] Converting 1127520053077745664 => Draksha38897300
[7100] Converting 2837436894 => SkmdIrfan
[7101] Converting 1077824544105066496 => baggy03022000
[7102] Converting 1096076246902099968 => Arsi_memon
[7103] Converting 1171023407116238848 => KartiksinhJade9
[7104] Converting 4613925552 => Jalaj22
[7105] Converting 1070954088 => DevenderRawat88
[7106] Converting 1171002502990503936 => SuhailSayyed20
[7107] Converting 1164204963192459264 => MdShahi191920
[7

[7244] Converting 1159645265969631232 => Prayag30621282
[7245] Converting 4520266394 => MajidPbh
[7246] Converting 1120563355964461056 => Mariada12042385
[7247] Converting 1131963318048968705 => ABHIJEE04495560
[7248] Converting 2941128116 => sameesamee354
[7249] Converting 373825396 => _iamindersingh_
[7250] Converting 2482278774 => MudassarMulla91
[7251] Converting 1000939672355225600 => Utkarsh29136745
[7252] Converting 1157335788465430528 => imranLo78335423
[7253] Converting 3096941318 => Badusha127
[7254] Converting 912700802971471872 => MrASHaider
[7255] Converting 1157743614119698432 => Shubham47717677
[7256] Converting 1151782190914494469 => PrabhuDhanushk1
[7257] Converting 1158268588572082176 => RiyazAh02910468
[7258] Converting 1153621745397452805 => JAnouk
[7259] Converting 1158238168014970881 => Dyogi17
[7260] Converting 1153579504486379520 => Hamza_MalcolmX
[7261] Converting 1157936971081113600 => Shahana23837189
[7262] Converting 1143451298320306177 => Nadeem_khan326
[72

[7398] Converting 4839883987 => KharatPradnya
[7399] Converting 1113020110896164865 => SangramSatpath3
[7400] Converting 1140480914276982785 => Kartike81146552
[7401] Converting 1140665407289716736 => syedwaqarhasan1
[7402] Converting 1139375815290834945 => Hashtag02975577
[7403] Converting 1138612243723984897 => ShashiRanjanRo3
[7404] Converting 1139474577971544067 => BaagRafiuddin
[7405] Converting 1138306560382210048 => Shaikh_Bablu786
[7406] Converting 917281295280963584 => SamirKh78601708
[7407] Converting 1139106896822976512 => FayazNayeli
[7408] Converting 1139772343079723009 => SachinC76848785
[7409] Converting 1139155053132931073 => SwarupBhoi
[7410] Converting 1139459189942521856 => EMAPRLulTAB3V5X
[7411] Converting 1277622768 => akmalkvk
[7412] Converting 1119180435450474496 => Anurag_9911
[7413] Converting 1138387790192758784 => IMLeftist_Lalu
[7414] Converting 1138894252098777088 => sannak007
[7415] Converting 225241027 => abhaatharv
[7416] Converting 1138819657345822727 =

[7556] Converting 948194461237985280 => scitechbus
[7557] Converting 1101486750306832384 => sheldon24348182
[7558] Converting 1107267402859839488 => ShabiAbdul1
[7559] Converting 1118045643308539904 => KaranSikandar
[7560] Converting 1120045795858739201 => realmunnabhaiya
[7561] Converting 1119656083776135168 => KalluMeer
[7562] Converting 591672238 => meMrk
[7563] Converting 1119584973697064960 => Rashi52826298
[7564] Converting 117982252 => IamIndian27
[7565] Converting 1119276001005981697 => Rasheed73236643
[7566] Converting 977166517631582216 => RationalIndiatv
[7567] Converting 1100400634589659139 => Huron93190931
[7568] Converting 782270057694932992 => das_udvas
[7569] Converting 528138271 => 21janDinesh
[7570] Converting 1114129485400612865 => aditya_adhyaru
[7571] Converting 1118490845961842688 => bohemianIlych
[7572] Converting 1118038352794513408 => modernmuktibodh
[7573] Converting 897062283481759745 => FarooqueRaza5
[7574] Converting 1118197496872706048 => Tahreem64596914
[

[7716] Converting 1931267814 => akhtar7936
[7717] Converting 972100754 => balalrumy
[7718] Converting 1100709135996178434 => PRamfal
[7719] Converting 1010040969561927680 => AzmiEarm
[7720] Converting 775861914 => ArifPasha13
[7721] Converting 78770922 => shrinathshorey
[7722] Converting 231714251 => irfanaperveen
[7723] Converting 2848423590 => MehranRai
[7724] Converting 3813467112 => vaaditya60
[7725] Converting 1099611397909024769 => Rohit00290289
[7726] Converting 1099709587509440512 => hrazaa_
[7727] Converting 1059504044539043842 => Kaliwaaal
[7728] Converting 4309947200 => pimpare_balaji
[7729] Converting 1006104344691986432 => Hussainkhaninc
[7730] Converting 4602988032 => shajahanahmedma
[7731] Converting 1074027966680743938 => ThakareNupur
[7732] Converting 836650802726961158 => mypartyisNOTA
[7733] Converting 1098121976961474560 => BetterVsWorse
[7734] Converting 2427556928 => rohitkdeshpande
[7735] Converting 940492388031643648 => p_pmahato
[7736] Converting 1616520235 => 

[7879] Converting 1013760157988343808 => PramodK38637307
[7880] Converting 1071725522994884608 => WAsimSSAS
[7881] Converting 3739323198 => zubair1705
[7882] Converting 1071714037480607744 => RashidMewati5
[7883] Converting 106007199 => dawoodidudo
[7884] Converting 1005344893252898816 => rsrlives53
[7885] Converting 1046327930886881280 => ishaanAR07
[7886] Converting 150100297 => jj_joshmachine
[7887] Converting 94092252 => drsarahumer
[7888] Converting 1070780884008226819 => primepost3
[7889] Converting 2923732700 => anas_banthiya
[7890] Converting 1048881146023428096 => MDFarooqueAns17
[7891] Converting 186061127 => incshakir
[7892] Converting 883206964142108672 => avink279
[7893] Converting 60838537 => shss1990
[7894] Converting 1065535878116728838 => AbramAndrabi27
[7895] Converting 2897183292 => chakrabartysub3
[7896] Converting 1068396020696924160 => Salman87284697
[7897] Converting 2229432817 => 15Nidhish
[7898] Converting 1067435458257281024 => RighttoGet1
[7899] Converting 72

[8041] Converting 733216707804856320 => sujiths85
[8042] Converting 915248987115986946 => _ItsAshishyadav
[8043] Converting 176092552 => attarmohsin70
[8044] Converting 3009659904 => mustafaazhar93
[8045] Converting 4123427778 => AsadAbbasSial
[8046] Converting 2227901900 => AnjaliIrfan
[8047] Converting 2835413334 => Anandmha
[8048] Converting 484647645 => kashifnaushad
[8049] Converting 2937344599 => altaf3737
[8050] Converting 998617020143878144 => Basheer01838993
[8051] Converting 4103688853 => RoyMelbourne1
[8052] Converting 587311801 => shazreboot
[8053] Converting 416773785 => DrRisingsun
[8054] Converting 1170796332 => Irshad__Azam
[8055] Converting 83398696 => imsamir00
[8056] Converting 882156739 => shahbazpathan6
[8057] Converting 993190329342820353 => RADLAD9
[8058] Converting 186144818 => vvisheshanand
[8059] Converting 941328281734758400 => Mohamma19460324
[8060] Converting 934272488426561542 => Sameer__27
[8061] Converting 985114195069939717 => jarwal_prakash
[8062] Conv

[8207] Converting 836578156077322240 => 26ab039a29754f0
[8208] Converting 919424037037551619 => NilSark15700227
[8209] Converting 3304142376 => iamAHossain
[8210] Converting 1368945618 => sk_mahfuj
[8211] Converting 917985408847372290 => ModiaRawanda
[8212] Converting 911647826316165120 => KhateebRiyaz
[8213] Converting 340190586 => sharikrana
[8214] Converting 915548366775259137 => antideshbhakt
[8215] Converting 578853364 => KASHIFJILANEE
[8216] Converting 807978663920668676 => iamkk89
[8217] Converting 796658252348002305 => chunzcham
[8218] Converting 912318189479985153 => justaju4321
[8219] Converting 74093141 => senthilakumar
[8220] Converting 911859544795631616 => pritee_om
[8221] Converting 911464047299588096 => SMKHAN28597386
[8222] Converting 906709130391437312 => shahjahan123123
[8223] Converting 855365337138724864 => farhanexpert
[8224] Converting 891620621700001792 => VineetK69170548
[8225] Converting 81049556 => Moh_Tausif
[8226] Converting 307303022 => NumanNuman2
[8227] 

[8377] Converting 1375219292 => firoj125khan
[8378] Converting 817035596791103488 => bharat16072000
[8379] Converting 816702517983772673 => Muhammad0185
[8380] Converting 2510043967 => PerwezWasim
[8381] Converting 815877721175760896 => Abhishe77865116
[8382] Converting 794036366560595968 => bossofavenger
[8383] Converting 2483409470 => TheeAamAadmi
[8384] Converting 508422061 => Bubunmaaza
[8385] Converting 193601766 => asifidris
[8386] Converting 617972906 => salammd7
[8387] Converting 3575147232 => shabankhan0786
[8388] Converting 807268387843870721 => Minhajk9457
[8389] Converting 115843640 => laughloudakhil
[8390] Converting 805742887052963840 => bindass200
[8391] Converting 171933762 => zosha4
[8392] Converting 347121109 => WaqiAbbas
[8393] Converting 804233160514908160 => SarnathOm
[8394] Converting 368191887 => LoboWilbur
[8395] Converting 803229346240417792 => I_mAnnonymous
[8396] Converting 752893815783034881 => 26syeds
[8397] Converting 334654090 => bktwts
[8398] Converting 

[8544] Converting 1204653072141602816 => IqbalMa99118750
[8545] Converting 1218510793483771904 => AhmadKhan330
[8546] Converting 1105864737378717699 => MufaddalShaki11
[8547] Converting 1217882857508499456 => JantaChowk
[8548] Converting 1218028369842827265 => EPithecus
[8549] Converting 1216075766997422080 => khaniqbal6232
[8550] Converting 1217880727523454976 => Hasan_Akbar1
[8551] Converting 1217413087428399104 => parmar06081977
[8552] Converting 209930071 => Aloksamar
[8553] Converting 1217120452553953280 => MrRabot
[8554] Converting 1212010759258902529 => Abhinav39831045
[8555] Converting 2841641653 => t_events_d
[8556] Converting 1216968364398039040 => data_peace
[8557] Converting 497324992 => iamaw3sam
[8558] Converting 1216411430800392192 => voiceofindia73
[8559] Converting 1198653554031919104 => MuhammadKasif10
[8560] Converting 2547821346 => AbhiGawri
[8561] Converting 1216420883914350592 => AbdurRa72139217
[8562] Converting 2778749720 => 40727644f91a48c
[8563] Converting 121

[8707] Converting 3038634894 => proudindianyk
[8708] Converting 2409430014 => nitingaikwad077
[8709] Converting 1193158095486124032 => unikpeaceXD
[8710] Converting 2904420629 => Khalid_M_Ansari
[8711] Converting 1192755225515315200 => Komal89665115
[8712] Converting 275962630 => kazimaperfumers
[8713] Converting 1189525307189014528 => Aslan11317540
[8714] Converting 1189946610710482944 => DARK_HAXM
[8715] Converting 1186629627508342784 => ShamsTa82814774
[8716] Converting 1188913738889318400 => ActiveVoice_In
[8717] Converting 972392649666646016 => sonusaini9102
[8718] Converting 3032327280 => tejpal_verma100
[8719] Converting 1184845798908948481 => 77_panku
[8720] Converting 2435612408 => CeoTpca
[8721] Converting 1008390661408677888 => SayanjitBP
[8722] Converting 1147372558519394304 => MYahia68658306
[8723] Converting 983548188404596736 => Qatai_Liberal
[8724] Converting 1182325924739596288 => Vijay_Rathod12
[8725] Converting 81766516 => nagpalabhinav
[8726] Converting 118148403238

[8881] Converting 1949672660 => roohi21
[8882] Converting 4126405034 => firdousqasmi86
[8883] Converting 1378671139 => Adityafilmmaker
[8884] Converting 3391761739 => AkkasAliar
[8885] Converting 1874989134 => rh478
[8886] Converting 113760997 => keralahashim
[8887] Converting 913100323606085632 => SohailS09675533
[8888] Converting 176128980 => surajshete
[8889] Converting 749824488 => iamashraf05
[8890] Converting 3270712886 => NasruuShaik
[8891] Converting 4872333318 => JazZahmed16
[8892] Converting 235055744 => sirjitendrayada
[8893] Converting 456633537 => jawedkhan6
[8894] Converting 107978805 => shyam13eve
[8895] Converting 552915422 => mohdekhlaqkhan
[8896] Converting 370842431 => sithubscity
[8897] Converting 631290400 => rezwanJr
[8898] Converting 252966718 => IRFANMITS
[8899] Converting 135872499 => ergoaj
[8900] Converting 64072795 => ritzritchie
[8901] Converting 3009014419 => Masoom_Sa_Rahi
[8902] Converting 16053738 => niyaz73
[8903] Converting 570245286 => Pratikdhole
[8

[9052] Converting 2714371365 => mahipalverma83
[9053] Converting 873475614409318400 => Tejeswa76357425
[9054] Converting 829931950861217792 => AjayAjay3051
[9055] Converting 1228422679579021314 => MSAnsar84772229
[9056] Converting 1169923327579709440 => AzeemKh18124996
[9057] Converting 1106097053136936960 => AsimMallick5
[9058] Converting 1210513417519484928 => IAmRahim4
[9059] Converting 1179108652344598528 => THOINDIA
[9060] Converting 969450891525881856 => ShivChandraRamm
[9061] Converting 1180661455303004165 => Basharaali0353
[9062] Converting 1194315531894935552 => ayanchouder
[9063] Converting 939941510849728512 => WasiAhm85046507
[9064] Converting 1233632344755646466 => JeitendraS
[9065] Converting 1231193343276802049 => MohdNij38102621
[9066] Converting 1234760821173035009 => RahulHanotiya20
[9067] Converting 1234726505730588673 ID 1234726505730588673 not found
[9068] Converting 1234723018003402752 => Bhupend23839474
[9069] Converting 1234527971735588870 => Rafiqpadiyar6g1
[90

[9203] Converting 1232336992974696448 => realmaroof
[9204] Converting 1232282996042092551 => EkbalHu96283913
[9205] Converting 899921495593373696 => Amreenmirza959
[9206] Converting 1223657074011201536 => AbdulAzeemAsad3
[9207] Converting 1232286780352679939 => GulamAmer2
[9208] Converting 364150662 => parthaghosh7
[9209] Converting 1232272874104377349 => HODORIND
[9210] Converting 1232227867817365507 => NitikShukla5
[9211] Converting 1231873819284926465 => bMw983HrtiMxuQg
[9212] Converting 197670275 => sam_need_a_name
[9213] Converting 1232114243593527297 => ExEngineer15
[9214] Converting 946304070159032321 => Pravesh66106483
[9215] Converting 1230817371826188288 => VAdhiveshan
[9216] Converting 1231717891063468032 => DickySingh90
[9217] Converting 1175299630558433280 => msalman186
[9218] Converting 709706606028529664 => Saurabh19019145
[9219] Converting 1204643678905454592 => ReshmaV20
[9220] Converting 795340052276645889 => Muhammadaarshad
[9221] Converting 1231939960493821959 => Aa

[9357] Converting 1227131961434443777 => NOMAD_000
[9358] Converting 1073575273541615617 => MdSadda69561267
[9359] Converting 1215236143316254720 => BorioJhoka
[9360] Converting 1212084917846167552 => HabibSk11423355
[9361] Converting 1219878569213448192 => MB17363115
[9362] Converting 1226956278439849985 => aitzazahmad22
[9363] Converting 1182124397538078720 => GoldiKu20873693
[9364] Converting 1226826916918972416 => KhanMoh30116639
[9365] Converting 1226921714321223681 => reachshashwat
[9366] Converting 1226884763849375744 => MUMSHADAHMADSI1
[9367] Converting 1226904307640877057 => GBilimale
[9368] Converting 1210143085419610114 => Zulfika99657052
[9369] Converting 1226574845686628353 => MTasavour
[9370] Converting 1202277809298690048 => iahmsuhail
[9371] Converting 1226457260072697856 => RedPencil20
[9372] Converting 1216646488845115392 => Muhamma51796285
[9373] Converting 359217105 => Arahaman400
[9374] Converting 1226447857135296512 => sonubaba112
[9375] Converting 2980057525 => 1

[9512] Converting 1223099469400399872 => ilf99209545
[9513] Converting 1094886063837872128 => Abdul87511590
[9514] Converting 1222854024199729154 => Manu32047554
[9515] Converting 1214101385521725440 => ParvezRafat2
[9516] Converting 1223263421283491845 => Rajrohilla005g1
[9517] Converting 1222735494221451264 => Tapan24985160
[9518] Converting 1222076342533074944 => BravoSwara
[9519] Converting 1223237884712562688 => Naseer___13
[9520] Converting 117006308 ID 117006308 not found
[9521] Converting 1221267956635598849 => DilbagKharb2
[9522] Converting 1214802899865202690 => AniketA98721028
[9523] Converting 1147873089012690945 => pratikmankar92
[9524] Converting 893093478950813696 => JuSabine2
[9525] Converting 1072714731306274816 => RSinha25776487
[9526] Converting 1220375065990254592 => BaidyanathChat6
[9527] Converting 1208219866986995713 => AzamKha10352412
[9528] Converting 1727694937 => JavedAFarooqui
[9529] Converting 306597191 => 2007_anas
[9530] Converting 1222904915804090375 => 

[9665] Converting 1217041270570250240 => AGConsulting9
[9666] Converting 1216470409056382976 => Ataurra13006819
[9667] Converting 1220013246091911169 => SDMohid5
[9668] Converting 1181861313485520896 => PankajP42900533
[9669] Converting 2983182799 => A_Wajid_Khan
[9670] Converting 922054155627151360 => for_bats
[9671] Converting 1219957459785277441 => Mahmood05897341
[9672] Converting 1162209612449234944 => sultanakaisar
[9673] Converting 1218240020395347969 => ArnabCowsami
[9674] Converting 982508670960447488 => GulJaved23
[9675] Converting 1219884829904236545 => TheUnpopularO13
[9676] Converting 1219825500631138305 => RaviKum71383948
[9677] Converting 1206806943416901632 => ali08784264
[9678] Converting 821197076 => RIO_644
[9679] Converting 1216597866648653829 => BittuChopra7
[9680] Converting 1219826423734472704 => Ishwar211
[9681] Converting 1219792845474680832 ID 1219792845474680832 not found
[9682] Converting 1218836144655339520 => SohrabK15452194
[9683] Converting 1219255035030

### Run GetOldTweets3 to download tweets upto Jan 2014 if available

In [None]:
# Function the pulls tweets from a specific username and turns to csv file
# Parameters: (list of twitter usernames), (max number of most recent tweets to pull from)
def username_tweets_to_csv(username, count):
    # Creation of query object
    tweetCriteria = got.manager.TweetCriteria().setUsername(username)\
                                            .setSince("2014-01-01")\
                                            .setMaxTweets(count)\
                                            .setEmoji("unicode")
    try:
        # Creation of list that contains all tweets
        tweets = got.manager.TweetManager.getTweets(tweetCriteria)

        # Creating list of chosen tweet data
        user_tweets = [[tweet.date, tweet.text] for tweet in tweets]

        # Creation of dataframe from tweets list
        tweets_df = pd.DataFrame(user_tweets, columns = ['Datetime', 'Text'])

        # Converting dataframe to CSV
        tweets_df.to_csv(tweets_folder + '/original/{}-{}k-tweets.csv'.format(username, int(count/1000)), sep=',')
    except:
        print("Caught Rate limit Exception. Sleeping...")
        time.sleep(200)

In [None]:
print("Reading followers")
with open(followers_username_file) as f:
    user_list = [line.rstrip() for line in f] 

print("Reading already fetched usernames")
with open(fetched_username_files) as f:
    fetched_list = [line.rstrip() for line in f]

with open(fetched_username_files, "a+") as output:
    count = 0
    for username in user_list:    
        if username not in fetched_list:
            print("[%d] Fetching tweets for %s" % (count, username))
            username_tweets_to_csv(username, 0)    
            output.write('%s\n' % username)
        else:
            print("User %s already fetched. Skipping" % username)

        count += 1

### Capture all tweets into a single file

In [7]:
# Set the path one by one and run the below two cells appropriately to generate a dataframe of tweets
path = '/home/nikhil/packages/GetOldTweets3/bin/tweets/' + side 
#path = tweets_folder + "/original"

In [9]:
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

### Run one of the below - Temporary TBD

In [10]:
leaders_tweets_df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
followers_tweets_df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Tweets download using function call of GetOldTweets3 vs using the binary have different casing for column names
# So convert below one to lowercase before merging
followers_tweets_df.columns = followers_tweets_df.columns.str.lower()
followers_tweets_df['text']

In [12]:
#frames = [leaders_tweets_df, followers_tweets_df]
frames = [leaders_tweets_df]
all_tweets_df = pd.concat(frames)

In [14]:
all_tweets_df.text.to_csv(processed_tweets_file,index=False)

### Do Text Pre-processing - Cleanup tweets

In [15]:
# Pre-processing: 
# 1) Remove URL
# 2) Keep tweets greater than 30 characters

MIN_CHARS = 30
with open(processed_tweets_file) as f:
    all_tweets = [line.rstrip() for line in f]    
    
    with open(cleaned_tweets_file, 'w',encoding="utf-8") as output:
        for tweet in all_tweets:            
            tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet, flags=re.MULTILINE)
            tweet = re.sub("([^\x00-\x7F])+"," ",tweet)
            tweet = ' '.join(tweet.split()) 
            tweet = tweet.replace('&amp;', '&')            
            if len(tweet) > MIN_CHARS:
                output.write('%s\n' % tweet)

### Shuffle, De-duplicate and Split to train + test

In [29]:
# Unable to get to work in Jupyter. Run from shell
#!awk '!seen[$0]++' all_tweets_cleaned.txt > all_tweets_dedup.txt
#!shuf all_tweets_dedup.txt > all_tweets_dedup_shuf.txt
#!mv xaa train.txt
#!mv xab test.txt

/bin/sh: 1: Syntax error: word unexpected (expecting ")")


### Download tweets using the GetOldTweets3 binary (appears to be doing it faster)

In [None]:
# I used this to download tweets for all the 'leaders' (the original lef or right handles list)
#%%handle

since_list=['2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2019-01-01', '2020-01-01']
until_list=['2014-12-31', '2015-12-31', '2016-12-31', '2017-12-31', '2018-12-31', '2019-12-31', '2020-12-31']

tweets_bin_folder = tweets_folder + "original/leaders/"
    
with open(handles_file) as f:
    handles = [line.rstrip() for line in f]
    
for handle in handles:
    print("Getting tweets for " + handle)
    
    for since, until in zip(since_list, until_list):
        outfile = handle + "_" + since.split('-')[0] + ".csv"
        cmd = GetOldTweets3_bin + " --username " + handle + \
        " --since " + since + " --until " + until + \
        " --maxtweets 0 --emoji unicode --output " + tweets_bin_folder + outfile 
        print(cmd)
        os.system(cmd)
        print("%s Done." % outfile)
        time.sleep(60)        

### Some files are not downloaded properly. Re-download them 

In [None]:
# Re-download those files which have size 84 bytes

all_files = glob.glob(tweets_bin_folder + "/*.csv")

for filename in all_files:
    statinfo = os.stat(filename)
    if statinfo.st_size is 84:
        filename = os.path.basename(filename)
        #print("%s is having size 84" % filename)
        handle=filename.split('_20')[0]
        year=filename.split('.csv')[0][-4:]
        #print("Re-fetching for %s & %s" % (handle, year))
        outfile = handle + "_" + year + ".csv"
        cmd = GetOldTweets3_bin + " --username " + handle + \
        " --since " + year + "-01-01" + " --until " + year + "-12-31" + \
        " --maxtweets 0 --emoji unicode --output " + tweets_bin_folder + outfile 
        print(cmd)
        os.system(cmd)
        print("%s Done." % outfile)
        time.sleep(2)

### Unused Code