<img src="https://datasciencedegree.wisconsin.edu/wp-content/themes/data-gulp/images/logo.svg" width="300">


# Capstone Project -- Tweepy Data Collection
## Matt Peterson - DS 785
### 05/06/2022

---

## Setup for file system

#### Gain access to required libraries

In [None]:
import os
from pathlib import Path

#### Set input and output file names

In [None]:
# File names of the username spreadsheet (input) and the collected tweets (output)
output_filename = 'vr_tweets.csv'
input_filename = 'VR_Twitter_Usernames_By_Type.csv'

#### Set input and output directory paths

In [None]:
# Resolve the input and output file locations relative to the current working directory
cwd = Path.cwd()
file_dir = cwd
output_file = os.path.join(file_dir, 'output', output_filename)
input_file = os.path.join(file_dir, 'input', input_filename)

---

## Setup for ```tweepy```

#### Gain access to the Tweepy library

In [None]:
import tweepy

#### Load credentials from the external file

In [None]:
# IPython `run` magic: executes the external credentials script inside this notebook's namespace.
# NOTE(review): the script presumably defines `b_token`, which is read when the client is created -- confirm.
run ~/Documents/twitter_credentials_v2.py

#### Initialize a client by passing bearer token

In [None]:
# Authenticate against the Twitter API v2 with the bearer token.
# wait_on_rate_limit=True makes the client sleep until the rate-limit window
# resets instead of raising tweepy.TooManyRequests part-way through a long
# collection run (which would discard everything gathered so far).
client = tweepy.Client(bearer_token=b_token, wait_on_rate_limit=True)

#### Configure query information to assist in Twitter search

In [None]:
# Expansions and optional tweet fields requested with every timeline query
expansions = [
    "attachments.media_keys",
    "author_id",
]
fields = [
    "attachments",
    "created_at",
    "public_metrics",
    "entities",
]

---

## Use ```pandas``` to Read Twitter Usernames from File

#### Gain access to the pandas library

In [None]:
import pandas as pd

#### Import lists of known VR-related Twitter accounts from CSV file

In [None]:
# Load the username spreadsheet; each column lists the accounts of one account type
df = pd.read_csv(input_file, skip_blank_lines=True)

# Keep only the non-missing entries of each account-type column
developers = df['Developers'].dropna().tolist()
news_pages = df['News'].dropna().tolist()
platforms = df['Platforms'].dropna().tolist()
influencers = df['Influencers'].dropna().tolist()

---

## Define Objects with a Tweet Class

#### Gain access to the regular expression library

In [None]:
import re

In [None]:
class Tweet(object):
    """Wrap one tweet and expose the values exported for analysis.

    Two Tweet objects with the same tweet ID compare and hash equal, so a
    ``set`` of Tweets keeps each tweet once even when it was fetched by more
    than one query.
    """

    def __init__(self, tweet_id, timestamp, user, usertype, text, media, tweet):
        """ store the tweet data

        tweet_id  -- unique Twitter tweet ID
        timestamp -- datetime at which the tweet was created
        user      -- mapping of author ID to a user object (with name/username)
        usertype  -- label for the type of account that authored the tweet
        text      -- raw tweet text (cleaned before it is stored)
        media     -- mapping of media key to a media object (with type)
        tweet     -- raw tweet object exposing ``data`` and ``entities``
        """
        self.tweet_id = tweet_id
        self.timestamp = self.format_timestamp(timestamp) #format before storing
        self.user_type = usertype
        self.text = self.clean_text(text) #clean text before storing
        self.tweet_length = len(self.text)
        self.user = user
        self.media = media
        self.tweet = tweet

    def __eq__(self, other):
        """ tweets are equal when they share the same tweet ID"""
        if not isinstance(other, Tweet):
            return NotImplemented
        return self.tweet_id == other.tweet_id

    def __hash__(self):
        """ hash on the tweet ID so sets and dicts de-duplicate tweets"""
        return hash(self.tweet_id)

    def __str__(self):
        """ return a string representation of the Tweet object"""
        lines = [
            "TWEET_ID: " + str(self.tweet_id),
            "TIMESTAMP: " + self.timestamp,
            "NAME: " + self.get_name(),
            "USERNAME: " + self.get_username(),
            "USER_TYPE: " + str(self.user_type),
            "TWEET_LENGTH: " + str(self.tweet_length),
            "IS_RETWEET: " + self.get_is_retweet(),
            "TEXT: [" + self.text + "]",
            "RETWEET_COUNT: " + str(self.get_retweet_count()),
            "REPLY_COUNT: " + str(self.get_reply_count()),
            "LIKE_COUNT: " + str(self.get_like_count()),
            "QUOTE_COUNT: " + str(self.get_quote_count()),
            "URL_COUNT: " + str(self.get_url_count()),
            "PHOTO_COUNT: " + str(self.get_photo_count()),
            "VIDEO_COUNT: " + str(self.get_video_count()),
            "GIF_COUNT: " + str(self.get_gif_count()),
            "MENTION_COUNT: " + str(self.get_mention_count()),
            "HASHTAG_COUNT: " + str(self.get_hashtag_count()),
            "MONTH: " + str(self.get_month()),
            "DAY: " + str(self.get_day()),
            "HOUR: " + str(self.get_hour()),
            "MEDIA: " + self.list_2_str(self.get_media()),
            "MENTIONS: " + self.list_2_str(self.get_mentions()),
            "HASHTAGS: " + self.list_2_str(self.get_hashtags()),
        ]
        # one field per line (several labels previously used a broken "\X" escape)
        return "\n".join(lines)

    def _get_author(self):
        """ return the author's user object, or None when it cannot be resolved"""
        if 'author_id' not in self.tweet.data:
            return None
        author_id = self.tweet.data['author_id']
        if author_id not in self.user:
            return None
        return self.user[author_id]

    def _get_entity_list(self, key):
        """ return the list stored under key in the tweet entities, or []"""
        entities = self.tweet.entities
        if entities is not None and key in entities:
            return entities[key]
        return []

    def get_tweet_id(self):
        """ return the unique Twitter tweet ID"""
        return self.tweet_id

    def format_timestamp(self, timestamp):
        """ convert datetime object to a more user-friendly string"""
        return timestamp.strftime("%m/%d/%Y, %H:%M:%S")

    def get_timestamp(self):
        """ return the timestamp at which the tweet was created"""
        return self.timestamp

    def get_name(self):
        """ get the user's name from the tweet data ("UNKNOWN" if unavailable)"""
        author = self._get_author()
        return "UNKNOWN" if author is None else author.name

    def get_username(self):
        """ get the user's username from the tweet data ("UNKNOWN" if unavailable)"""
        author = self._get_author()
        return "UNKNOWN" if author is None else author.username

    def get_user_type(self):
        """ return the type of user who authored the tweet"""
        return self.user_type

    def get_tweet_length(self):
        """ return the character length of the cleaned tweet text"""
        return self.tweet_length

    def get_is_retweet(self):
        """ return whether the tweet is a retweet"""
        # a retweet reads "RT @handle: ..."; once clean_text removed the handle
        # the stored text starts with "RT : "
        return "TRUE" if self.text[:5] == "RT : " else "FALSE"

    def get_text(self):
        """ return the cleaned tweet text"""
        return self.text

    def get_retweet_count(self):
        """ get the retweet_count value from the tweet data"""
        return self.tweet.data['public_metrics']['retweet_count']

    def get_reply_count(self):
        """ get the reply_count value from the tweet data"""
        return self.tweet.data['public_metrics']['reply_count']

    def get_like_count(self):
        """ get the like_count value from the tweet data"""
        return self.tweet.data['public_metrics']['like_count']

    def get_quote_count(self):
        """ get the quote_count value from the tweet data"""
        return self.tweet.data['public_metrics']['quote_count']

    def get_url_count(self):
        """ get the number of urls listed in the tweet entities"""
        return len(self._get_entity_list('urls'))

    def get_photo_count(self):
        """ get the number of photos in the tweet media data"""
        return self.get_media().count('photo')

    def get_video_count(self):
        """ get the number of videos in the tweet media data"""
        return self.get_media().count('video')

    def get_gif_count(self):
        """ get the number of animated_gifs in the tweet media data"""
        return self.get_media().count('animated_gif')

    def get_mention_count(self):
        """ get the number of mentions in the tweet text"""
        return len(self.get_mentions())

    def get_hashtag_count(self):
        """ get the number of hashtags in the tweet text"""
        return len(self.get_hashtags())

    def get_month(self):
        """ get the month the tweet was posted from timestamp"""
        return self.timestamp[:2]

    def get_day(self):
        """ get the day the tweet was posted from timestamp"""
        return self.timestamp[3:5]

    def get_hour(self):
        """ get the hour the tweet was posted from timestamp"""
        return self.timestamp[12:14]

    def get_media(self):
        """ get the media type list from the tweet data"""
        data = self.tweet.data
        if 'attachments' not in data or 'media_keys' not in data['attachments']:
            return []
        # skip keys missing from the media map instead of raising KeyError
        return [self.media[key].type
                for key in data['attachments']['media_keys']
                if key in self.media]

    def get_mentions(self):
        """ get the mentions list from the tweet data"""
        return [mention['username'] for mention in self._get_entity_list('mentions')]

    def get_hashtags(self):
        """ get the hashtags list from the tweet data"""
        return [hashtag['tag'] for hashtag in self._get_entity_list('hashtags')]

    def clean_text(self, text):
        """ remove Twitter handles and URLs from text with regex"""
        # \w also covers "_", which is allowed in handles; the raw string avoids
        # invalid-escape warnings; split/join collapses whitespace runs
        return ' '.join(re.sub(r"(@\w+)|(\w+://\S+)", " ", text).split())

    def list_2_str(self, strlist):
        """ helper function to convert lists to strings"""
        return '[' + ', '.join(strlist) + ']'

--- 

## Use the REST API to Gather Tweets

#### Function to return user ID mappings for each user in a list of usernames

In [None]:
# returns username:userid relationship in a dictionary
def get_user_dict(username_list):
    """Map each resolvable username in ``username_list`` to its Twitter user ID.

    Usernames the API does not return are left out of the result rather than
    being paired with another account's ID.  Keys keep the spelling used in
    ``username_list``.
    """
    names = list(username_list)
    id_by_handle = {}

    # the user lookup endpoint accepts at most 100 usernames per request
    for start in range(0, len(names), 100):
        users = client.get_users(usernames=names[start:start + 100])
        # users.data is None when none of the requested accounts was found
        for user in users.data or []:
            id_by_handle[user.username.lower()] = user.id

    # pair by name, not by position: the response can be shorter than the
    # request, and its username casing may differ from the requested casing
    return {name: id_by_handle[name.lower()]
            for name in names
            if name.lower() in id_by_handle}

#### Function that adds a user's tweets to the shared tweet set, filtered by exclusions such as retweets and replies

In [None]:
def get_tweets_by_exclusion(all_user_tweets, curr_user_tweets, user_id, user_type, max_tweets_per_user, exclusions):
    """Fetch one user's timeline and add the tweets to ``all_user_tweets`` in place.

    all_user_tweets     -- set that receives the Tweet objects (mutated)
    curr_user_tweets    -- unused; kept so existing callers keep working
    user_id             -- Twitter ID of the account whose timeline is read
    user_type           -- account-type label stored on every Tweet
    max_tweets_per_user -- passed to the API as ``max_results``, i.e. the page
                           size; the paginator has no limit, so every available
                           page is fetched
    exclusions          -- tweet kinds to exclude, e.g. ["retweets", "replies"]

    Returns None.
    """
    media_dict = {}
    # defined up front so a first page without a 'users' include cannot hit an
    # unbound variable when the Tweet objects are built
    users_dict = {}

    #page through the user's timeline with the requested exclusions applied
    for tweets in tweepy.Paginator(client.get_users_tweets,
                                   id=user_id,
                                   max_results=max_tweets_per_user,
                                   exclude=exclusions,
                                   tweet_fields=fields,
                                   expansions=expansions):

        #a page without tweets has nothing to format
        if tweets.data is None:
            continue

        #get media dictionary from response to map to each tweet later
        if 'media' in tweets.includes:
            media_dict = {m["media_key"]: m for m in tweets.includes['media']}

        #get users dictionary from response to map to each tweet later
        if 'users' in tweets.includes:
            users_dict = {str(u["id"]): u for u in tweets.includes['users']}

        #format the tweets to extract desired fields and store in custom class
        formatted_tweets = [Tweet(tweet.id, tweet.created_at, users_dict, user_type, tweet.text, media_dict, tweet)
                            for tweet in tweets.data]

        #add list of tweets to the complete set
        all_user_tweets.update(formatted_tweets)

#### Function to return a set of Tweet objects given a list of Twitter users

In [None]:
def get_relevant_tweets_for_user_list(user_list, user_type, max_tweets_per_user):
    """Collect the tweets of every account in ``user_list`` into one set.

    user_list           -- Twitter usernames to collect from
    user_type           -- account-type label stored on every Tweet
    max_tweets_per_user -- forwarded to get_tweets_by_exclusion

    Returns the set of Tweet objects gathered for all users.
    """
    all_user_tweets = set()

    #map twitter user ids to usernames
    user_dict = get_user_dict(user_list)

    #for each user:
    for user_id in user_dict.values():
        # The second argument (curr_user_tweets) is ignored by the helper. The
        # previous code passed backup_dict[user], but backup_dict is never
        # defined in this notebook, so the call failed with a NameError.
        # NOTE: the two queries overlap (the second also returns the tweets of
        # the first); whether the set collapses them depends on how Tweet
        # objects compare and hash.
        get_tweets_by_exclusion(all_user_tweets, None, user_id, user_type, max_tweets_per_user, ["retweets", "replies"])
        get_tweets_by_exclusion(all_user_tweets, None, user_id, user_type, max_tweets_per_user, ["replies"])

    return all_user_tweets

---

## Store a List of Tweet Objects for Each Account Type

#### 1. Obtain tweets from VR game developer companies

In [None]:
# Gather the tweets of all developer accounts, then display how many were collected
developer_tweets = get_relevant_tweets_for_user_list(
    user_list=developers,
    user_type='Developer',
    max_tweets_per_user=100,
)
len(developer_tweets)

#### 2. Obtain tweets from VR platform companies

In [None]:
# Gather the tweets of all platform accounts, then display how many were collected
platform_tweets = get_relevant_tweets_for_user_list(
    user_list=platforms,
    user_type='Platform',
    max_tweets_per_user=100,
)
len(platform_tweets)

#### 3. Obtain tweets from VR news and promotional accounts

In [None]:
# Gather the tweets of all news accounts, then display how many were collected
newspage_tweets = get_relevant_tweets_for_user_list(
    user_list=news_pages,
    user_type='News',
    max_tweets_per_user=100,
)
len(newspage_tweets)

#### 4. Obtain tweets from VR influencer accounts

In [None]:
# Gather the tweets of all influencer accounts, then display how many were collected
influencer_tweets = get_relevant_tweets_for_user_list(
    user_list=influencers,
    user_type='Influencer',
    max_tweets_per_user=100,
)
len(influencer_tweets)

---

## Use ```pandas``` for Data Frame Creation

#### Function to combine several lists of tweets into a single data frame for export

In [None]:
def _tweet_to_values(t):
    """Return the exported values of one tweet, in column order."""
    return [t.get_tweet_id(),
            t.get_timestamp(),
            t.get_name(),
            t.get_username(),
            t.get_user_type(),
            t.get_tweet_length(),
            t.get_is_retweet(),
            t.get_text(),
            t.get_retweet_count(),
            t.get_reply_count(),
            t.get_like_count(),
            t.get_quote_count(),
            t.get_url_count(),
            t.get_photo_count(),
            t.get_video_count(),
            t.get_gif_count(),
            t.get_mention_count(),
            t.get_hashtag_count(),
            t.get_month(),
            t.get_day(),
            t.get_hour(),
            t.get_media(),
            t.get_mentions(),
            t.get_hashtags()]


def create_data_frame_from_tweet_sets(column_names, tweet_sets):
    """Combine several collections of tweets into a single data frame.

    column_names -- column labels, in the same order as the exported values
    tweet_sets   -- iterable of tweet collections (one per account type)

    Returns a DataFrame with one row per tweet.  When no tweets were
    collected, the result is an empty frame that still carries
    ``column_names``, so an exported CSV keeps its header row.
    """
    dict_list = []
    #add each list of tweets to the dictionary list
    for tweet_set in tweet_sets:
        for t in tweet_set:
            dict_list.append(dict(zip(column_names, _tweet_to_values(t))))

    #without rows pandas cannot infer the columns, so supply them explicitly
    if not dict_list:
        return pd.DataFrame(columns=list(column_names))

    #create data frame
    return pd.DataFrame(dict_list)

#### Create master data frame to be analyzed 

In [None]:
# Column labels of the master data frame, listed in the order the per-tweet values are produced
column_names = (
    "TWEET_ID", "TIMESTAMP", "NAME", "USERNAME", "USER_TYPE",
    "TWEET_LENGTH", "IS_RETWEET", "TEXT",
    "RETWEET_COUNT", "REPLY_COUNT", "LIKE_COUNT", "QUOTE_COUNT",
    "URL_COUNT", "PHOTO_COUNT", "VIDEO_COUNT", "GIF_COUNT",
    "MENTION_COUNT", "HASHTAG_COUNT",
    "MONTH", "DAY", "HOUR",
    "MEDIA", "MENTIONS", "HASHTAGS",
)

# Merge the tweets of all four account types into one data frame
tweet_sets = [
    developer_tweets,
    platform_tweets,
    newspage_tweets,
    influencer_tweets,
]
tweet_df = create_data_frame_from_tweet_sets(tweet_sets=tweet_sets, column_names=column_names)

---

## Export data frame to ```.csv``` file for analysis

In [None]:
# make sure the output folder exists; to_csv does not create missing directories
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# save the data frame to a .csv file (overwrites an existing file)
tweet_df.to_csv(output_file, index=False)

# USE LATER to append df to existing .csv file
#tweet_df.to_csv(output_file, mode='a', index=False, header=False)