# Fetching Tweets for Big Dataset (100K+ Tweets) from Twitter API

## Twitter API Premium v1.1: Search Tweets 30-Day Endpoint

In [6]:
# Import dependencies
import tweepy
import pandas as pd
import numpy as np
import json

In [7]:
# Import API client token
import os
from dotenv import load_dotenv
load_dotenv()

twit_token = os.getenv('twit_token')

In [8]:
# Set up API authorization
auth = tweepy.OAuth2BearerHandler(twit_token)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [42]:
# Code reproduced from Twitter developer API documentation. Additional parsing features and customizations made where necessary for desired output.
def determine_tweet_type(tweet):
    # Check for reply indicator first
    if tweet["in_reply_to_status_id"] is not None:
        tweet_type = "Reply Tweet"
    # Check boolean quote status field but make sure it's not a Retweet (of a Quote Tweet) 
    elif tweet["is_quote_status"] is True and not tweet["text"].startswith("RT"):
        tweet_type = "Quote Tweet"
    # Check both indicators of a Retweet
    elif tweet["text"].startswith("RT") and tweet.get("retweeted_status") is not None:
        tweet_type = "Retweet"
    else:
        tweet_type = "Original Tweet"
    return tweet_type

parsedTweets = []
rawTweets = []

def parse_tweets(status):
    for tweet in status:
        
        rawTweets.append(tweet._json)
        hashtags = []
        mentions = []
        

        if 'extended_tweet' in tweet._json:
            full_text = tweet._json['extended_tweet']['full_text']

            # Collect hashtags from tweet into list
            hashtags_entity = tweet._json["extended_tweet"]["entities"]["hashtags"]
            if len(hashtags_entity) == 0:
                hashtags = None
            else:
                for hashtag in hashtags_entity:
                    hashtags.append("#" + hashtag['text'])

            # Collect mentions from tweet into list
            mentions_entity = tweet._json["extended_tweet"]["entities"]["user_mentions"]
            if len(mentions_entity) == 0:
                mentions = None
            else:
                for mention in mentions_entity:
                    mentions.append("@" + mention["screen_name"])

        else:
            full_text = tweet._json['text']

            # Collect hashtags from tweet into list
            hashtags_entity = tweet._json["entities"]["hashtags"]
            if len(hashtags_entity) == 0:
                hashtags = None
            else:
                for hashtag in hashtags_entity:
                    hashtags.append("#" + hashtag['text'])

            # Collect mentions from tweet into list
            mentions_entity = tweet._json["entities"]["user_mentions"]
            if len(mentions_entity) == 0:
                mentions = None
            else:
                for mention in mentions_entity:
                    mentions.append("@" + mention["screen_name"])
        
                
        # Get coordinates latitude and longitude into separate variables by subscripting coordinates if coordinates object not Nonetype
        coordinates = tweet._json["coordinates"]
        if coordinates is not None:
            coord_lat = tweet._json["coordinates"]["coordinates"][0]
            coord_lng = tweet._json["coordinates"]["coordinates"][1]
        else:
            coord_lat = None
            coord_lng = None
            
        
        # Get place attributes into separate variables by subscripting place if place object not Nonetype 
        place = tweet._json["place"]
        if place is not None:
            place_type = tweet._json["place"]["place_type"]
            place_name = tweet._json["place"]["name"]
            place_full_name = tweet._json["place"]["full_name"]
            country_code = tweet._json["place"]["country_code"]
            country = tweet._json["place"]["country"]
        else:
            place_type = None
            place_name = None
            place_full_name = None
            country_code = None
            country = None
            
        # Parsing profile geo data

        # First intialize profile geo field values as Nonetype for the default value (value if field not found in profile geo data)
        prof_country = None
        prof_country_code = None
        prof_locality = None
        prof_region = None
        prof_sub_region = None
        prof_full_name = None
        prof_coord_lat = None
        prof_coord_lng = None 

        # Parse desired profile geo data if the field is present 
        for field in tweet._json["user"]: 
            if field == "derived":
                prof_geo_data = tweet._json["user"]["derived"]
                if prof_geo_data["locations"] is not None:
                    for field in prof_geo_data["locations"][0]:
                        if field == "country":
                            prof_country = prof_geo_data["locations"][0][field]

                        elif field == "country_code":
                            prof_country_code = prof_geo_data["locations"][0][field]

                        elif field == "locality":
                            prof_locality = prof_geo_data["locations"][0][field]

                        elif field == "region":
                            prof_region = prof_geo_data["locations"][0][field]

                        elif field == "sub_region":
                            prof_sub_region = prof_geo_data["locations"][0][field]

                        elif field == "full_name":
                            prof_full_name = prof_geo_data["locations"][0][field]

                        elif field == "geo":
                            prof_coord_lat = prof_geo_data["locations"][0][field]["coordinates"][0]
                            prof_coord_lng = prof_geo_data["locations"][0][field]["coordinates"][1]

                        else:
                            pass
                else:
                    pass
            else:
                pass
            

            
        # Create dict with key-value pairs of parsed field and corresponding data
        mydict = { "tweet_id": tweet._json["id_str"], #Tweet ID
                       "date":tweet._json["created_at"], #Timestamp of tweet creation
                       "full_text": full_text, #Full tweet text
                       "tweet_type": determine_tweet_type(tweet._json), #Type of tweet
                       "hashtags": hashtags, #List of hashtags used in the tweet
                       "mentions": mentions, #List of mentions used in the tweet
                       "user_id": tweet._json["user"]["id_str"], #Twitter user profile ID
                       "user_location": tweet._json["user"]["location"], #The user's listed location
                       "geo": tweet._json["geo"], #Geodata 'geo' attribute
                       "lat_coordinates": coord_lat, #Geodata 'latitude coordinates' from 'coordinates' attribute
                       "lng_coordinates": coord_lng, #Geodata 'longitude coordinates' from 'coordinates' attribute
                       "place_type": place_type, #Geodata 'place_type' from 'place' attribute
                       "place_name": place_name, #Geodata 'name' from 'place' attribute
                       "place_full_name": place_full_name, #Geodata 'full_name' from 'place' attribute
                       "country_code": country_code, #Geodata 'country_code' from 'place' attribute
                       "country": country, #Geodata 'country' from 'place' attribute
                       "prof_country": prof_country, #Profile geodata 'country' from 'user''derived''locations'
                       "prof_country_code": prof_country_code, #Profile geodata 'country_code' from 'user''derived''locations'
                       "prof_locality": prof_locality, #Profile geodata 'locality' from 'user''derived''locations'
                       "prof_region": prof_region, #Profile geodata 'region' from 'user''derived''locations'
                       "prof_sub_region": prof_sub_region, #Profile geodata 'sub_region' from 'user''derived''locations'
                       "prof_full_name": prof_full_name, #Profile geodata 'full_name' from 'user''derived''locations'
                       "prof_coord_lat": prof_coord_lat, #Profile geodata 'latitude' from 'user''derived''locations''geo''coordinates'
                       "prof_coord_lng": prof_coord_lng, #Profile geodata 'longitude' from 'user''derived''locations''geo''coordinates'
                       "reply_count": tweet._json["reply_count"], #Number of times Tweet has been replied to
                       "quote_count": tweet._json["quote_count"], # Number of times Tweet has been quoted
                       "likes_count": tweet._json["favorite_count"], #Number of times Tweet has been liked 
                       "retweet_counts": tweet._json["retweet_count"], #Number of times this Tweet has been retweeted
                       "hyperlink": "https://twitter.com/twitter/status/" + tweet._json["id_str"] #Link to tweet
              }
        
        # Append parsed tweet data to list
        parsedTweets.append(mydict) # Add Tweet to parsedTweets list

In [43]:
# Code to run query against the Twitter API v1.1 Premium - 30 day search endpoint for fetching tweets in the last 30-day archive 

query = '(#guncontrol OR #gunrights OR #gunsense OR #2a OR #gunviolence OR #nra OR #guns OR #shooting OR #firearms OR #gunsafety OR #uvalde \
OR "gun control" OR "2nd amendment" OR ("gun" "violence") OR "right to bear" OR "nra" OR ("gun" "safety") OR ("gun" "rights") \
OR "good guy with a gun" OR "assault weapons" OR ("ban" "guns") OR "ban assault" OR ("school" "shootings") OR "ar-15" OR "mass shootings" \
OR "open carry" OR "gun owners" OR "concealed carry" OR "gun laws" OR "gun law" OR "pro-gun" OR "anti-gun" OR "gun safety" OR "high capacity magazine" \
OR "second amendment" OR "gun lobby" OR "gun ownership" OR "carry permit" OR "bump stock" OR "gunman" OR "red flag laws" OR "red-flag laws" OR "brady law" \
OR "gun sales" OR ("buying" ("gun" OR "guns")) OR "gun shows" OR "gun sense" OR "sandy hook" OR ("guns" ("republicans" OR "democrats")) OR (("background check" OR "background checks") \
("gun" OR "guns" OR "sale" OR "sales")) OR (("legal" OR "illegal") ("gun" OR "guns"))) lang:en profile_country:US -is:retweet -is:quote'


# Initalize lists to populate with raw and parsed using parsing function
rawTweets = []
parsedTweets = []
from_date = 1103
to_date = 1104
date_range = range(10)

# Fetch tweets from the endpoint using pagination to collect up to 18,500 tweets for each specified day (UTC)
for date in date_range:
    status = tweepy.Cursor(api.search_30_day, label='gunsentiment', query=query, fromDate=f'2022{from_date}0000', toDate=f'2022{to_date}0000', maxResults='500').items(14000)

    # parse the fetched tweet and store key-value pairs in list of dictionaries in parsedTweets. Append raw JSON responses to rawTweets list
    parse_tweets(status)
    
    from_date += 1
    to_date += 1

# convert raw tweets JSON responses array into .json file
with open("../res/big_data_tweets_raw.json", "w") as training_tweets:
    json.dump(rawTweets, training_tweets)
    
    
tweet_df = pd.DataFrame(parsedTweets)

TooManyRequests: 429 Too Many Requests
Exceeded rate limit

In [22]:
rawTweets[14836]

{'created_at': 'Fri Nov 04 00:00:02 +0000 2022',
 'id': 1588320126427836417,
 'id_str': '1588320126427836417',
 'text': 'If you keep a handgun in your glove box as most any Tennessean may legally do, it should always occur to you to loc… https://t.co/aTxteyAK9j',
 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
 'truncated': True,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 23782016,
  'id_str': '23782016',
  'name': 'Times-News Online',
  'screen_name': 'timesnewsonline',
  'location': 'Kingsport, TN',
  'url': 'http://www.timesnews.net',
  'description': 'Breaking news and information for Kingsport, TN\n\nLinktree: https://linktr.ee/kingsport_times_news',
  'translator_type': 'none',
  'derived': {'locations': [{'country': 'United States',
     'country_code': 'US',
     'locality': 'Kingsport',
     'region': '

In [44]:
len(parsedTweets)

28775

In [26]:
tweet

{'_json': {'created_at': 'Fri Nov 04 00:00:02 +0000 2022',
  'id': 1588320126427836417,
  'id_str': '1588320126427836417',
  'text': 'If you keep a handgun in your glove box as most any Tennessean may legally do, it should always occur to you to loc… https://t.co/aTxteyAK9j',
  'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
  'truncated': True,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 23782016,
   'id_str': '23782016',
   'name': 'Times-News Online',
   'screen_name': 'timesnewsonline',
   'location': 'Kingsport, TN',
   'url': 'http://www.timesnews.net',
   'description': 'Breaking news and information for Kingsport, TN\n\nLinktree: https://linktr.ee/kingsport_times_news',
   'translator_type': 'none',
   'derived': {'locations': [{'country': 'United States',
      'country_code': 'US',
      'locality'

In [17]:
parse_tweets(status)

In [12]:
len(parsedTweets)

10215

In [13]:
tweet_df = pd.DataFrame(parsedTweets)

In [14]:
tweet_df['date'].value_counts

<bound method IndexOpsMixin.value_counts of 0        Thu Nov 03 23:59:52 +0000 2022
1        Thu Nov 03 23:59:36 +0000 2022
2        Thu Nov 03 23:59:33 +0000 2022
3        Thu Nov 03 23:59:06 +0000 2022
4        Thu Nov 03 23:59:00 +0000 2022
                      ...              
10210    Fri Nov 04 17:19:09 +0000 2022
10211    Fri Nov 04 17:19:07 +0000 2022
10212    Fri Nov 04 17:19:06 +0000 2022
10213    Fri Nov 04 17:19:03 +0000 2022
10214    Fri Nov 04 17:18:50 +0000 2022
Name: date, Length: 10215, dtype: object>

In [11]:
# Check tweets dataframe data types and missing values for each column
tweet_df.info()

NameError: name 'tweet_df' is not defined

In [258]:
# drop 'geo', 'lat_coordinates', and 'lng_coordinates' columns for insufficient data
tweet_df.drop(['geo', 'lat_coordinates', 'lng_coordinates'], axis=1, inplace=True)
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8801 entries, 0 to 8800
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tweet_id           8801 non-null   object 
 1   date               8801 non-null   object 
 2   full_text          8801 non-null   object 
 3   tweet_type         8801 non-null   object 
 4   hashtags           1366 non-null   object 
 5   mentions           6147 non-null   object 
 6   user_location      8781 non-null   object 
 7   place_type         244 non-null    object 
 8   place_name         244 non-null    object 
 9   place_full_name    244 non-null    object 
 10  country_code       244 non-null    object 
 11  country            244 non-null    object 
 12  prof_country       8747 non-null   object 
 13  prof_country_code  8747 non-null   object 
 14  prof_locality      4645 non-null   object 
 15  prof_region        7680 non-null   object 
 16  prof_sub_region    4477 

In [259]:
# Check top 10 rows of tweets dataframe
tweet_df.head(10)

Unnamed: 0,tweet_id,date,full_text,tweet_type,hashtags,mentions,user_location,place_type,place_name,place_full_name,...,prof_region,prof_sub_region,prof_full_name,prof_coord_lat,prof_coord_lng,reply_count,quote_count,likes_count,retweet_counts,hyperlink
0,1587957708166426625,Wed Nov 02 23:59:55 +0000 2022,@PapiiDev_ @aaronbricks69 @Barstooldmv Man ppl...,Reply Tweet,,"[@PapiiDev_, @aaronbricks69, @Barstooldmv]","Indianapolis, IN",,,,...,Indiana,Marion County,"Indianapolis, Indiana, United States",-86.15804,39.76838,1,0,0,0,https://twitter.com/twitter/status/15879577081...
1,1587957661085171712,Wed Nov 02 23:59:43 +0000 2022,"@watertigernyc Hey, now that NYC is open carry…",Reply Tweet,,[@watertigernyc],"New Jersey, USA",,,,...,New Jersey,,"New Jersey, United States",-74.49987,40.16706,0,0,2,0,https://twitter.com/twitter/status/15879576610...
2,1587957440494141441,Wed Nov 02 23:58:51 +0000 2022,.@KaneGovernment decides new gun law not ready...,Original Tweet,,"[@KaneGovernment, @dailyherald]","Kane County, IL.",,,,...,Illinois,Greene County,"Kane, Illinois, United States",-90.35373,39.19005,0,0,0,0,https://twitter.com/twitter/status/15879574404...
3,1587957427823349760,Wed Nov 02 23:58:48 +0000 2022,@thediva76 @iamjadebrieanne @brolys_maiden @un...,Reply Tweet,,"[@thediva76, @iamjadebrieanne, @brolys_maiden,...","Washington, DC",,,,...,"Washington, D.C.",,"Washington, D.C., United States",-77.00025,38.91706,1,0,0,0,https://twitter.com/twitter/status/15879574278...
4,1587957397422841861,Wed Nov 02 23:58:41 +0000 2022,HipHop needs to go on strike! No more gun play...,Original Tweet,,,douglasville,,,,...,Georgia,Douglas County,"Douglasville, Georgia, United States",-84.74771,33.7515,0,0,0,0,https://twitter.com/twitter/status/15879573974...
5,1587957395539611648,Wed Nov 02 23:58:40 +0000 2022,@bradlena1 @RadioFreeTom Republicans push for ...,Reply Tweet,,"[@bradlena1, @RadioFreeTom]","Florida, USA",,,,...,Florida,,"Florida, United States",-82.5001,28.75054,0,0,26,1,https://twitter.com/twitter/status/15879573955...
6,1587957387503501312,Wed Nov 02 23:58:38 +0000 2022,"Iowans, Strict scrutiny means the rights of th...",Original Tweet,,,"Johnston, IA",,,,...,Iowa,Polk County,"Johnston, Iowa, United States",-93.69772,41.67304,0,0,0,0,https://twitter.com/twitter/status/15879573875...
7,1587957369543204865,Wed Nov 02 23:58:34 +0000 2022,"The 2nd Amendment calls for a ""A well regulate...",Original Tweet,,,America,,,,...,,,United States,-98.5,39.76,0,0,1,0,https://twitter.com/twitter/status/15879573695...
8,1587957359636189185,Wed Nov 02 23:58:32 +0000 2022,@marla_vous @BetoORourke Abbott is Forever. So...,Reply Tweet,,"[@marla_vous, @BetoORourke]","California, USA",,,,...,California,,"California, United States",-119.75126,37.25022,0,0,0,0,https://twitter.com/twitter/status/15879573596...
9,1587957220502822912,Wed Nov 02 23:57:58 +0000 2022,A Federal Judge Calls Clarence Thomas’ Bluff o...,Original Tweet,,,"Euclid,OH",,,,...,Ohio,Cuyahoga County,"Euclid, Ohio, United States",-81.52679,41.5931,0,0,0,0,https://twitter.com/twitter/status/15879572205...


In [260]:
# Confirm results for user.location and profile geo data parsing
tweet_df.loc[:, ['user_location', 'prof_country', 'prof_country_code', 'prof_locality', 'prof_region', 'prof_sub_region', 'prof_full_name']].head(10)

Unnamed: 0,user_location,prof_country,prof_country_code,prof_locality,prof_region,prof_sub_region,prof_full_name
0,"Indianapolis, IN",United States,US,Indianapolis,Indiana,Marion County,"Indianapolis, Indiana, United States"
1,"New Jersey, USA",United States,US,,New Jersey,,"New Jersey, United States"
2,"Kane County, IL.",United States,US,Kane,Illinois,Greene County,"Kane, Illinois, United States"
3,"Washington, DC",United States,US,,"Washington, D.C.",,"Washington, D.C., United States"
4,douglasville,United States,US,Douglasville,Georgia,Douglas County,"Douglasville, Georgia, United States"
5,"Florida, USA",United States,US,,Florida,,"Florida, United States"
6,"Johnston, IA",United States,US,Johnston,Iowa,Polk County,"Johnston, Iowa, United States"
7,America,United States,US,,,,United States
8,"California, USA",United States,US,,California,,"California, United States"
9,"Euclid,OH",United States,US,Euclid,Ohio,Cuyahoga County,"Euclid, Ohio, United States"


In [261]:
# Check distribution of tweet types
tweet_df['tweet_type'].value_counts()

Reply Tweet       5788
Original Tweet    3013
Name: tweet_type, dtype: int64

In [262]:
# Check for unique values and respective counts in 'country_code' column
tweet_df['country_code'].value_counts()

US    241
TZ      1
GB      1
CO      1
Name: country_code, dtype: int64

In [293]:
# Check rows with non-null values in 'country_code' that are not 'US' for conflicts with profile geo data
tweet_df.loc[lambda df: (df['country_code'] == 'TZ') | (df['country_code'] == 'GB') | (df['country_code'] == 'CO')].T

Unnamed: 0,1495,2650,4146
tweet_id,1587906553058828289,1587868633472049153,1587825249898450947
date,Wed Nov 02 20:36:38 +0000 2022,Wed Nov 02 18:05:58 +0000 2022,Wed Nov 02 15:13:34 +0000 2022
full_text,@ValaAfshar Yeah ikr but at least they don’t b...,Every body deserves to feel perfect #shooting ...,"@robreiner Dana Loesch, spokeswoman NRA, recei..."
tweet_type,Reply Tweet,Original Tweet,Reply Tweet
hashtags,,"[#shooting, #elegance, #classy, #elegant, #pho...",
mentions,[@ValaAfshar],,[@robreiner]
user_location,U.S Intelligence Outpost,Devon,"Fort Worth, TX"
place_type,admin,city,city
place_name,Dar es Salaam,Kilkhampton,"Bogotá, D.C."
place_full_name,"Dar es Salaam, Tanzania","Kilkhampton, England","Bogotá, D.C., Colombia"


In [295]:
# Export tweet_df as training data CSV
tweet_df.to_csv('../res/big_data_tweets.csv', index=False)