In [16]:
#!/usr/bin/env python
""" Parses Twitter archives from Archive Team: The Twitter Stream Grab for a list of user-defined keywords

The Archive Team: The Twitter Stream Grab (https://archive.org/details/twitterstream) provides historic 
downloads of Twitter archives by month. This script helps researchers to mine this content for a list of 
words, phrases, or hashtags. This script requires the monthly archives to be downloaded and extracted from 
the .tar archive before use.

Output is a .csv file containing one record per relationship. Relationships are classified as
(1) reply, (2) mention, or (3) tweet. A reply is a direct response to another user's post. A mention
names another user without being a direct reply to that user (mention output is currently disabled
in the code below, so the mention count stays at 0). A tweet relationship is a tweet that is neither
a reply nor a mention.

See the modify section below to specify (1) keywords/hashtags, (2) top-level directory, and
(3) output file name.
"""

# Library imports
import json, os, bz2
import unicodecsv as csv    # unicodecsv for non-ascii handling
from IPython.display import clear_output     # to clear screen between directory prints

# Metadata
# Attribution for the original script.
__author__ = 'Joshua Been'
__copyright__ = 'Copyright 2018, Baylor University'
__credits__ = ['Joshua Been','Ann Mirabito','Clint Ratliff','Carol Schuets']
__license__ = 'MIT License'
__maintainer__ = 'Joshua Been'
__email__ = 'Joshua_Been@baylor.edu'
__status__ = 'Development'

# Attribution for the later modification.
# NOTE: __copyright__ and __email__ are assigned a second time below, so once this cell has run they
# hold the values set here and the values assigned above are overwritten.
__modifiedby__ = 'Saif Murad'
__copyright__ = 'Copyright 2021, University of North Carolina at Charlotte'
__email__ = 'smurad1@uncc.edu'
# Summary of the modifications; the affected code is tagged with [SM] in its comments.
__changes__ = """ Marked with [SM]. Updated to Python 3 coding standards, added: parsing of geo-tagged 
              tweets, filter fields, more tweet info, extended tweets and associated hashtags, reply 
              userid, mentioned userid, final screen print
              """

# Modify keywords, top-level directory, and output file name
##################################################

# Search terms as lower-case bytes - each raw record is lower-cased before matching,
# so every capitalisation of a term is found [SM]
keywords = [b'trump', b'biden']

# Root of the extracted archive tree - forward slashes only (/), no trailing /
directory = '/Volumes/ExternalHD/archiveteam-twitter-stream-2020-11/2020'

# File name of the output .csv table - a bare file name, so it is created in the
# directory the Jupyter script runs from
outfile = 'trumpbiden11_CLEANED_lang.csv'

##################################################


# Client applications accepted by the source filter; each entry is matched as a substring of
# the tweet's 'source' field. [SM]
_ALLOWED_SOURCES = (
    '>Twitter for Mac<',
    '>Twitter Web Client<',
    '>Twitter for iPad<',
    '>Twitter for Android<',
    '>Twitter for iPhone<',
    '>Twitter Web App<',
)

# Column order of the output table; every row returned by _build_row follows this order.
_CSV_HEADER = [
    'poster', 'recipient', 'relationship', 'date', 'place_full_name', 'place_name',
    'place_type', 'place_bounding_box_coord', 'user_country_code', 'user_location',
    'user_description', 'user_id', 'tweet_id', 'tweet', 'retweet_status', 'truncated_status',
    'quote_status', 'in_reply_to_user_id', 'tweet_source', 'hashtags',
]


def _print_progress(current_dir):
    """Clear the notebook output, then print the running counts and the directory being searched."""
    clear_output()
    print (cursor,'mentions:',count_mentions)
    print (cursor,'replies:',count_replies)
    print (cursor,'tweets:',count_tweets)
    print (cursor,'* total:',count_mentions+count_replies+count_tweets)
    print ('-'*10)
    print (cursor,'currently searching', current_dir)


def _build_row(tweet, search_terms):
    """Return the output row for one decoded record, or None when the record is filtered out.

    A record is kept when it is a tweet object whose (extended) text contains at least one of
    ``search_terms`` (lower-case str), whose language is 'en', and whose source is one of
    ``_ALLOWED_SOURCES``. The row is a list ordered like ``_CSV_HEADER``; its third element is
    the relationship, either 'reply' or 'tweet'.
    """
    # Stream records that are not tweets (no text / no user) can still match the raw keyword
    # prefilter; they carry nothing to report.
    if not isinstance(tweet, dict) or 'text' not in tweet or 'user' not in tweet:
        return None

    # Avoid truncation by pulling the extended tweet (and its hashtags) when present [SM]
    truncated = tweet.get('truncated', False)
    extended = tweet.get('extended_tweet') if truncated is True else None
    if extended:
        tweet_text = extended['full_text']
        entities = extended.get('entities') or {}
    else:
        tweet_text = tweet['text']
        entities = tweet.get('entities') or {}

    # Filter by keyword(s) in the tweet text itself, by language, and by client application [SM]
    text_lower = tweet_text.lower()
    if not any(term in text_lower for term in search_terms):
        return None
    if tweet.get('lang') != 'en':
        return None
    source = tweet.get('source') or ''
    if not any(client in source for client in _ALLOWED_SOURCES):
        return None

    # Place details are reported only for US places. Everything else (no place, or a non-US
    # place) gets explicit None values so a row never reuses the previous tweet's place data.
    place = tweet.get('place')
    if place is not None and place.get('country_code') == 'US':
        bounding_box = place.get('bounding_box') or {}
        place_fields = [
            place.get('full_name'),
            place.get('name'),
            place.get('place_type'),
            bounding_box.get('coordinates'),
            place.get('country_code'),
        ]
    else:
        place_fields = [None] * 5

    user = tweet['user']
    poster = user.get('screen_name')
    hashes = [hashtag['text'] for hashtag in entities.get('hashtags') or []]

    # A retweet carries the original tweet under the top-level 'retweeted_status' key [SM]
    retweet = 'True' if 'retweeted_status' in tweet else 'False'

    # Reply relationship when the tweet answers another user; otherwise a plain tweet whose
    # recipient is the poster itself [SM]
    reply_to = tweet.get('in_reply_to_screen_name')
    if reply_to is not None:
        recipient = reply_to
        relationship = 'reply'
        reply_user_id = tweet.get('in_reply_to_user_id_str')
    else:
        recipient = poster
        relationship = 'tweet'
        reply_user_id = None

    return [
        poster, recipient, relationship, tweet.get('created_at'),
        *place_fields,
        user.get('location'), user.get('description'), user.get('id_str'), tweet.get('id_str'),
        tweet_text, retweet, truncated, tweet.get('is_quote_status'), reply_user_id,
        tweet.get('source'), hashes,
    ]


def process_json(keywords,directory,outfile):
    """Scan every .bz2 archive under ``directory`` and write matching tweets to ``outfile``.

    Args:
        keywords: search terms, as bytes or str. They are lower-cased and matched against the
            lower-cased raw record first (cheap prefilter) and then against the tweet text.
        directory: top-level directory that is walked recursively for ``.bz2`` files.
        outfile: path of the .csv file to create (an existing file is overwritten).

    Side effects:
        Sets the module-level names ``cursor``, ``count_mentions``, ``count_replies`` and
        ``count_tweets``, which the final screen print reads. Mention relationships are not
        emitted, so ``count_mentions`` stays 0. The notebook output is cleared and the running
        counts are reprinted once per visited directory.
    """
    # Initializing variables [SM]
    global cursor, count_mentions, count_replies, count_tweets
    cursor = '  >>  '
    count_mentions = 0
    count_replies = 0
    count_tweets = 0

    # Normalise the search terms once: bytes for the raw-line prefilter, str for the text filter.
    byte_terms = [
        (term if isinstance(term, bytes) else term.encode('utf-8')).lower()
        for term in keywords
    ]
    text_terms = [term.decode('utf-8').lower() for term in byte_terms]

    with open(outfile, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        # Write header row
        writer.writerow(_CSV_HEADER)
        # Walk through all subdirectories
        for dirs, subdirs, files in os.walk(directory):
            # Sorting in place fixes the traversal order, so repeated runs produce the rows
            # in the same order.
            subdirs.sort()

            # Screen prints [SM]
            _print_progress(dirs)

            for name in sorted(files):
                if not name.endswith('.bz2'):
                    continue
                # Decompress the archive as a stream; the handle is closed when the block exits
                with bz2.BZ2File(os.path.join(dirs, name), 'r') as archive:
                    for line in archive:
                        # Test for search term(s) anywhere in the raw record before paying
                        # for JSON decoding
                        lowered = line.lower()
                        if not any(term in lowered for term in byte_terms):
                            continue
                        try:
                            # Load each record as json object
                            tweet = json.loads(line)
                        except ValueError:
                            # A damaged or partial line should not abort a long scan; skip it
                            continue

                        row = _build_row(tweet, text_terms)
                        if row is None:
                            continue
                        writer.writerow(row)
                        if row[2] == 'reply':
                            count_replies += 1
                        else:
                            count_tweets += 1
                                    
                                    
if __name__ == '__main__':
    process_json(keywords,directory,outfile)
    # Final screen print [SM]: replace the last progress display with the finished totals
    clear_output()
    grand_total = count_mentions + count_replies + count_tweets
    summary = (
        ('mentions:', count_mentions),
        ('replies:', count_replies),
        ('tweets:', count_tweets),
        ('* total:', grand_total),
    )
    for label, value in summary:
        print(cursor, label, value)
    print('-' * 10)
    print('complete!')

  >>   mentions: 0
  >>   replies: 52625
  >>   tweets: 177241
  >>   * total: 229866
----------
complete!
