# Pulling Scientology Followers Data with Tweepy
**By:** _Mike Scheibel_ 

In [1]:
import datetime
import tweepy
from pprint import pprint

# I've put my API keys in a .py file called API_keys.py
from Mike_API_Keys import api_key, api_key_secret, access_token, access_token_secret

In [2]:
# Authenticate the Tweepy API
auth = tweepy.OAuthHandler(api_key,api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)

## Collecting Followers

In [3]:
# I'm putting the handles in a list to iterate through below
handles = ['LeahRemini','Scientology']

# Create dictionary to hold follower IDs
id_dict = dict() 

# Intialize dictionary.
for handle in handles :
    id_dict[handle] = []

# Grabs the time when we start making requests to the API
start_time = datetime.datetime.now()

for handle in id_dict :
    
    # Let's grab the first page of IDs
    for page in tweepy.Cursor(api.followers_ids,
                              wait_on_rate_limit=True,
                              wait_on_rate_limit_notify=True, 
                              compression=True,
                              screen_name=handle).pages():

        # The page variable comes back as a list, so we have to use .extend rather than .append
        id_dict[handle].extend(page)
        
        break
        
# Let's see how long it took to grab all follower IDs
end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
print(elapsed_time)

0:00:01.146087


In [19]:
# let's make sure it worked
for handle in id_dict :
    print(f"{handle} has {len(id_dict[handle])} followers in our dictionary.")

LeahRemini has 5000 followers in our dictionary.
Scientology has 5000 followers in our dictionary.


In [17]:
def get_screen_names(twitter_api, list_of_ids, list_for_screen_names):
    
    start=100 #we have feed the API 100 ID's at a time, this will iterate through them
    end=0
    followers=[]
    while end-1 <= len(list_of_ids): #quit when we get past the end of our list
        end += 1000 #update the end of our slice
        if end <= len(list_of_ids): #split into if else statement to slice correctly
            try:
                followers_temp = twitter_api.lookup_users(list_of_ids[start:end])
            except tweepy.TweepError as err:
                if err.api_code in (103,88) :
                    print('sleeping, 900 seconds')
                    time.sleep(900)
                else :
                    print(err)
        else:        
            try:
                followers_temp = twitter_api.lookup_users(list_of_ids[start:])
            except tweepy.TweepError as err:
                if err.api_code in (103,88) :
                    print('sleeping, 900 seconds')
                    time.sleep(900)
                else :
                    print(err)
                    
        followers.extend(followers_temp)
        start = end #update our starting slice index for next loop
        
    list_for_screen_names.extend(followers)

In [25]:
user_dict = dict() 

for handle in id_dict :
    user_dict[handle] = []
    
for handle in user_dict :
    get_screen_names(api,id_dict[handle],user_dict[handle])

NameError: name 'id_dict' is not defined

In [15]:
for handle in user_dict :
    print(f"{handle} has {len(user_dict[handle])} followers in our dictionary.") 

LeahRemini has 1700 followers in our dictionary.
Scientology has 1700 followers in our dictionary.


In [16]:
headers = ['screen_name','name','id','location','followers_count','friends_count','description']

for team in id_dict.keys():
    
    # Descriptions with emoji or non-Roman letters can cause trouble. Encoding your .txt file in utf-8 will help
    with open(f'{team}_followers.txt','w', encoding='utf-8') as out_file:
        out_file.write('\t'.join(headers) + '\n')

        for idx, ids in enumerate(id_dict[team]):
            
            # For accounts set to private, we won't be able to get the description unless we follow them
            # Putting in a try/except statement, we can get around this issue.
            try:
                user = api.get_user(ids)
                description = str(user.description).replace('\t',' ').replace('\n',' ')
                outline = [user.screen_name,user.name,user.id,
                    user.location,user.followers_count,user.friends_count,
                    user.description]
                
                out_file.write('\t'.join([str(item) for item in outline]) + '\n')
                
            except:
                continue
                
            if idx == 100:
                break

## Tokenize and Analyse Text

In [1]:
import nltk
import numpy as np

from string import punctuation
from collections import Counter

from pprint import pprint # get some prettier printing of objects

from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

**Text for Leah Remini**

In [19]:
leahremini = open("LeahRemini_followers.txt", encoding='utf-8').read()

leahremini_clean = [w for w in leahremini.lower().split()]
leahremini_clean = [w.lower() for w in leahremini_clean if w.isalpha() and w not in sw]

In [22]:
total_tokens = len(leahremini_clean)
unique_tokens = len(set(leahremini_clean))
lex_diversity = len(set(leahremini_clean))/len(leahremini_clean)
avg_token_len = np.mean([len(w) for w in leahremini_clean])
top_10 = Counter(leahremini_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 590,
 'unique_tokens': 516,
 'avg_token_length': 6.161016949152542,
 'lexical_diversity': 0.8745762711864407,
 'top_10': [('living', 6),
  ('life', 6),
  ('love', 4),
  ('like', 4),
  ('state', 3),
  ('san', 3),
  ('tx', 3),
  ('ca', 3),
  ('usa', 3),
  ('colorado', 3)]}

**Text for Scientology**

In [23]:
scientology = open("Scientology_followers.txt", encoding='utf-8').read()

scientology_clean = [w for w in scientology.lower().split()]
scientology_clean = [w.lower() for w in scientology_clean if w.isalpha() and w not in sw]

In [24]:
total_tokens = len(scientology_clean)
unique_tokens = len(set(scientology_clean))
lex_diversity = len(set(scientology_clean))/len(scientology_clean)
avg_token_len = np.mean([len(w) for w in scientology_clean])
top_10 = Counter(scientology_clean).most_common(10)

results = {'tokens':total_tokens,
            'unique_tokens':unique_tokens,
            'avg_token_length':avg_token_len,
            'lexical_diversity':lex_diversity,
            'top_10':top_10}

results

{'tokens': 533,
 'unique_tokens': 466,
 'avg_token_length': 6.25703564727955,
 'lexical_diversity': 0.874296435272045,
 'top_10': [('love', 8),
  ('usa', 7),
  ('name', 3),
  ('god', 3),
  ('jonathan', 3),
  ('like', 3),
  ('jesus', 3),
  ('nc', 3),
  ('england', 3),
  ('opheliaevermore', 2)]}