# Download Twitter followers for a set of users

### Notebook Author: Nikhil Utane


In [11]:
# Pip install GetOldTweets3 if you don't already have the package
# !pip install GetOldTweets3

# Imports
import GetOldTweets3 as got
import pandas as pd
import time
import tweepy
import csv
import sys
from collections import Counter 
import requests
import http.client, urllib
import re
import glob
import os

#### Read all the security tokens from a keys.py file

In [None]:
from keys import keys #keep keys in separate file, keys.py

consumer_key = keys['consumer_key']
consumer_secret = keys['consumer_secret']
access_token = keys['access_token']
access_token_secret = keys['access_token_secret']
pushover_token = keys['pushover_token']
pushover_user = keys['pushover_user']

#### I am using Pushover to notify me if any cell stops running

In [None]:
def pushoverNotify():
    conn = http.client.HTTPSConnection("api.pushover.net:443")
    conn.request("POST", "/1/messages.json",
      urllib.parse.urlencode({
        "token": pushover_token,
        "user": pushover_user,
        "message": "Cell finished execution",
      }), { "Content-type": "application/x-www-form-urlencoded" })
    r=conn.getresponse()
    print(r.status, r.reason)

In [None]:
from IPython.core.magic import register_cell_magic

# Register the magic so that later any cell that we want to be notified on exception can be done
@register_cell_magic('handle')
def handle(line, cell):
    try:
        exec(cell)        
    except Exception as e:
        pushoverNotify()
        raise # if you want the full trace-back in the notebook


### Get List of Followers. 
#### We are getting the IDs since the rate limit for that is quite high ~45000 per 15 mins vs ~3000 for usernames
#### Then we'll convert ID to username and using GetOldTweets3 to download in bulk going as far back as 2014

In [30]:
# Define all the user configuration here
side = "right"

In [31]:
side_folder = "../data/" + side + "/"
followers_folder = side_folder + "followers/"
tweets_folder = side_folder + "tweets/"
handles_file = side_folder + side + "_handles.txt"
followers_id_file = side_folder + "all_followers_id.txt"
followers_id_dedup_file = side_folder + "all_followers_id_dedup.txt"
followers_username_file = side_folder + "all_followers_username.txt"
fetched_username_files = tweets_folder + "fetched_list.txt"
GetOldTweets3_bin = "/home/nikhil/packages/GetOldTweets3/bin/GetOldTweets3"
processed_path = tweets_folder + "processed/"
processed_tweets_file = processed_path + "all_tweets.txt"
cleaned_tweets_file = processed_path + "all_tweets_cleaned.txt"

### Download all follower IDs for a user using tweepy

In [11]:
# Below source code credit: https://gist.github.com/PandaWhoCodes/46f58fdead71f4c71453d9ed1e21adf8
# Credentials
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)

def get_and_save_followers(user_name):
    """
    get a list of all followers of a twitter account
    :param user_name: twitter username without '@' symbol
    :return: list of usernames without '@' symbol
    """
    followers = []
    with open(followers_folder + user_name + "_followers_id.csv", 'w',encoding="utf-8") as output:
        for page in tweepy.Cursor(api.followers_ids, screen_name=user_name, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True).pages():
            sys.stdout.write(".")
            sys.stdout.flush()
            try:
                #followers.extend(page)
                for user_id in page:
                    output.write('%s\n' % user_id)
            except tweepy.TweepError as e:
                print("Going to sleep:", e)
                # Sleeping to slow down. Else we hit rate limit often
                time.sleep(60)
    return followers

### Read the initial list of handles and get their follower IDs

In [None]:
%%handle

with open(handles_file) as f:
    handles = [line.rstrip() for line in f]
    
for handle in handles:
    print("Getting followers for " + handle)
    followers = get_and_save_followers(handle)    
    print("Done.")
    

### Merge, de-duplicate and sort the followers list

In [7]:
print(followers_folder)
print(followers_id_file)
!echo $followers_folder/*.csv | xargs cat > $followers_id_file

../data/left/followers/
../data/left/followers/all_followers_id.txt


In [16]:
# initializing list 
with open(followers_id_file) as f:
    id_list = [line.rstrip() for line in f]

# printing original list 
print("Number of ids before dedup: %d" % len(id_list)) 

# using Counter.most_common() + list comprehension 
# sorting and removal of duplicates 
id_dedup = [key for key, value in Counter(id_list).most_common()] 

# print result 
print("Number of ids after dedup: {}. Percent reduced: {}".format(len(id_dedup), int((len(id_dedup)*100)/len(id_list))) )

with open(followers_id_dedup_file, "w") as output:
    for user_id in id_dedup:
        output.write('%s\n' % user_id)

Number of ids before dedup: 15360681
Number of ids after dedup: 8174646. Percent reduced: 53


### Convert IDs to usernames for GetOldTweets3 to fetch in bulk

In [None]:
%%handle
# We are doing a GET on a twitter link and parsing our the username, fastest way with no rate limiting
found = not_found = last_index = 0
user_list = []

# If you are resuming from somewhere in the middle, then uncomment below lines 
# and specify the last converted ID
print("Reading ID file");
with open(followers_id_dedup_file) as f:
    id_dedup = [line.rstrip() for line in f]

last_index = id_dedup.index("1108670208607174656")
del id_dedup[0:last_index+1]

with open(followers_username_file, "w") as output:
    count = last_index + 1
    for user_id in id_dedup:
        print("[%d] Converting %s" % (count, user_id) , end=' ');
        r = requests.get('https://twitter.com/intent/user?user_id=' + user_id)
        user_search=re.search('<title>.*\(@(.*)\).*</title>', r.content.decode('utf-8'), re.IGNORECASE)
        if user_search:
            username = user_search.group(1)            
            user_list.append(username)
            output.write('%s\n' % username)
            found += 1
            print("=> %s" % username);
        else:
            not_found += 1
            print("ID %s not found" % user_id);
        count += 1
        
    print("%d usernames found. %d not found." % found, not_found)   

### Run GetOldTweets3 to download tweets upto Jan 2014 if available

In [None]:
# Function the pulls tweets from a specific username and turns to csv file
# Parameters: (list of twitter usernames), (max number of most recent tweets to pull from)
def username_tweets_to_csv(username, count):
    # Creation of query object
    tweetCriteria = got.manager.TweetCriteria().setUsername(username)\
                                            .setSince("2014-01-01")\
                                            .setMaxTweets(count)\
                                            .setEmoji("unicode")
    try:
        # Creation of list that contains all tweets
        tweets = got.manager.TweetManager.getTweets(tweetCriteria)

        # Creating list of chosen tweet data
        user_tweets = [[tweet.date, tweet.text] for tweet in tweets]

        # Creation of dataframe from tweets list
        tweets_df = pd.DataFrame(user_tweets, columns = ['Datetime', 'Text'])

        # Converting dataframe to CSV
        tweets_df.to_csv(tweets_folder + '/original/{}-{}k-tweets.csv'.format(username, int(count/1000)), sep=',')
    except:
        print("Caught Rate limit Exception. Sleeping...")
        time.sleep(200)

In [None]:
print("Reading followers")
with open(followers_username_file) as f:
    user_list = [line.rstrip() for line in f] 

print("Reading already fetched usernames")
with open(fetched_username_files) as f:
    fetched_list = [line.rstrip() for line in f]

with open(fetched_username_files, "a+") as output:
    count = 0
    for username in user_list:    
        if username not in fetched_list:
            print("[%d] Fetching tweets for %s" % (count, username))
            username_tweets_to_csv(username, 0)    
            output.write('%s\n' % username)
        else:
            print("User %s already fetched. Skipping" % username)

        count += 1

### Capture all tweets into a single file

In [35]:
# I downloaded tweets using GetOldTweets3 binary for original handles aka leaders and then using library for all followers. 
# The CSV format is different for both so make it same before we combine all tweets into a single file

# Step 1) Set path below to appropriate folder
# Step 2) Run the cell below to populate li
# Step 3) Run the subsequent cell to generate either leaders_tweet_df or followers_tweet_df. 
# Step 4) Repeat above step if required
# Step 5) Build the frames df that has all the tweets

# Set the path one by one and run the below two cells appropriately to generate a dataframe of tweets
#path = '/home/nikhil/packages/GetOldTweets3/bin/tweets/' + side 
path = tweets_folder + "/original/followers"

In [36]:
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

### Run one of the below - Temporary

In [34]:
leaders_tweets_df = pd.concat(li, axis=0, ignore_index=True)

In [37]:
followers_tweets_df = pd.concat(li, axis=0, ignore_index=True)
# Tweets download using function call of GetOldTweets3 vs using the binary have different casing for column names
# So convert below one to lowercase before merging
followers_tweets_df.columns = followers_tweets_df.columns.str.lower()
followers_tweets_df['text']

In [39]:
frames = [leaders_tweets_df, followers_tweets_df]
#frames = [leaders_tweets_df]
all_tweets_df = pd.concat(frames)

In [40]:
all_tweets_df.text.to_csv(processed_tweets_file,index=False)

### Do Text Pre-processing - Cleanup tweets

In [41]:
# Pre-processing: 
# 1) Remove URL
# 2) Keep tweets greater than 20 characters

MIN_CHARS = 20
with open(processed_tweets_file) as f:
    all_tweets = [line.rstrip() for line in f]    
    
    with open(cleaned_tweets_file, 'w',encoding="utf-8") as output:
        for tweet in all_tweets:            
            tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet, flags=re.MULTILINE)
            tweet = re.sub("([^\x00-\x7F])+"," ",tweet)
            tweet = ' '.join(tweet.split()) 
            tweet = tweet.replace('&amp;', '&')            
            if len(tweet) > MIN_CHARS:
                output.write('%s\n' % tweet)

### Shuffle, De-duplicate and Split to train + test

In [None]:
# Unable to get to work in Jupyter. Run from shell
#!awk '!seen[$0]++' all_tweets_cleaned.txt > all_tweets_dedup.txt
#!sed 's/\"//g' all_tweets_dedup.txt > all_tweets_dedup_2.txt # Remove all quotes
#!sed 's/^ *//' all_tweets_dedup_2.txt > all_tweets_dedup_3.txt # Remove starting space

# Not doing shuffle since I feel it is better to keep in order as tweets that are next to each other are more likely to be related.
##!shuf all_tweets_dedup_3.txt > all_tweets_dedup_shuf.txt # <- Don't run

#!rm all_tweets_cleaned.txt all_tweets_dedup.txt all_tweets_dedup_2.txt
#!mv all_tweets_dedup_3.txt all_tweets_left.txt


### Download tweets using the GetOldTweets3 binary (appears to be doing it faster)

In [None]:
%%handle

# I used this to download tweets for all the 'leaders' (the original lef or right handles list)

since_list=['2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2019-01-01', '2020-01-01']
until_list=['2014-12-31', '2015-12-31', '2016-12-31', '2017-12-31', '2018-12-31', '2019-12-31', '2020-12-31']

# Set this for leaders or followers appropriately
tweets_bin_folder = tweets_folder + "original/leaders/"
#tweets_bin_folder = tweets_folder + "original/followers/"
# Temp
handles_file = side_folder + "temp_handles.txt"
input_file = handles_file
#input_file = followers_username_file
    
with open(input_file) as f:
    handles = [line.rstrip() for line in f]
    
for handle in handles:
    print("Getting tweets for " + handle)
    
    for since, until in zip(since_list, until_list):
        outfile = handle + "_" + since.split('-')[0] + ".csv"
        cmd = GetOldTweets3_bin + " --username " + handle + \
        " --since " + since + " --until " + until + \
        " --maxtweets 0 --emoji unicode --output " + tweets_bin_folder + outfile 
        print(cmd)
        os.system(cmd)
        print("%s Done." % outfile)
        time.sleep(60)        

### Some files are not downloaded properly. Re-download them 

In [None]:
# Re-download those files which have size 84 bytes
tweets_bin_folder = tweets_folder + "original/leaders/"

all_files = glob.glob(tweets_bin_folder + "/*.csv")

for filename in all_files:
    statinfo = os.stat(filename)
    if statinfo.st_size is 84:
        outfile = os.path.basename(filename)
        #print("%s is having size 84" % filename)
        handle=outfile.split('_20')[0]
        year=outfile.split('.csv')[0][-4:]
        #print("Re-fetching for %s & %s" % (handle, year))
        #outfile = handle + "_" + year + ".csv"
        cmd = GetOldTweets3_bin + " --username " + handle + \
        " --since " + year + "-01-01" + " --until " + year + "-12-31" + \
        " --maxtweets 0 --emoji unicode --output " + tweets_bin_folder + outfile
        print(cmd)
        os.system(cmd)
        #cmd = "mv " + tweets_bin_folder + outfile + " " + tweets_bin_folder + "../retried/"
        #os.system(cmd)
        #print(cmd)
        print("%s Done." % outfile)
        time.sleep(60)

### Unused Code/Scratchpad

In [32]:
import re

regex = re.compile('[^a-zA-Z\']')
#regex = re.compile('[,\.!?]:-0-9') 
#First parameter is the replacement, second parameter is your input string

#Out: 'abdE'

# initializing list 
words_file = "1.txt"
with open(words_file) as f:
    word_list = [line.rstrip() for line in f]

# printing original list 
print("Number of words before dedup: %d" % len(word_list)) 

# using Counter.most_common() + list comprehension 
# sorting and removal of duplicates 
word_dedup = [key for key, value in Counter(word_list).most_common()] 

# print result 
print("Number of words after dedup: {}. Percent reduced: {}".format(len(word_dedup), int((len(word_dedup)*100)/len(word_list))) )

with open("2.txt", "w") as output:
    for word in word_dedup:
        new_word = regex.sub('', word)
        if new_word:
            output.write('%s\n' % new_word)

Number of words before dedup: 254462
Number of words after dedup: 27099. Percent reduced: 10


In [None]:
common_file = "../data/common/common.txt"
with open(common_file) as f:
    common_list = [line.rstrip() for line in f]
    for user in common_list:
        cmd = "mv ../data/right/tweets/original/followers/" + user + "* ../data/common/right/" 
        print(cmd)
        os.system(cmd)
        