# Sentiment analysis of tweets from top 20+ Silicon Valley (SV) influencers. 

# Creating data

## Prepare packages

In [1]:
# Data science toolbox
import pandas as pd
import numpy as np

# Access to API
import tweepy

# Other
import json

# Wildcard operations on files
import glob

# Date & time operations
import datetime

In [2]:
import warnings

warnings.simplefilter("ignore")

## Research input

### Twitter accounts

In [3]:
# Source: https://www.digitaltrends.com/social-media/tech-people-influencers-follow-twitter/
# Second source: https://www.quora.com/What-are-some-of-the-best-Silicon-Valley-Twitter-accounts-to-follow

In [4]:
persons = set(['tim_cook', 'sundarpichai', 'billgates', 
           'elonmusk', 'jeffbezos', 'emilychangtv',
          'mkbhd', 'dhh', 'reshmasaujani',
          'KaraSwisher', 'mims', 'davidcohen',
          'charlesarthur', 'jeffweiner', 'benedictevans',
          'sirajraval', 'ajitpaifcc', 'BoredElonMusk', 
          'SwiftonSecurity', 'DigitalTrends',
          'cdixon', 'JonErlichman', 'fxshaw', 'ericjackson', 'asymco', 'MikeIsaac'])

f"There are together {len(persons)} Twitter accounts from which we'll build dataset"

"There are together 26 Twitter accounts from which we'll build dataset"

### Time period

In [5]:
start_date = datetime.datetime(year=2018, month=8, day=10)
end_date = datetime.datetime(year=2018, month=9, day=8, hour=23, minute=59, second=59)

f"We'll analyze tweets in the time period between {start_date} end {end_date} (including)"

"We'll analyze tweets in the time period between 2018-08-10 00:00:00 end 2018-09-08 23:59:59 (including)"

## Dataset

### Prepare credentials for querying API

In [6]:
# Twitter API credentials
# # (I have used here my personal consumer_key, consumer_secret, access_key, access_secret)
# because of the limits in tweeter API i had to created more pair of my credentials. I will use them to get
# the acces to all tweets I need

rotating_credentials = [
("fDmPUsiDAxZkIOVBjpck4o1n3", "EcNK9UfMe4fiaXFwvMWDjfReSviksgxFy3UhQ9yvYvFveFQtV7", 
 "914842765-StdZ8LjQGQm6gQgVbxMitC5cDTNDwjMs4fH2jfPR", "p9peNw0BKy1GI8vOtBfW6gVqNhSuNYvokipHjYLC84hPc"),
("tnnfyogABdQgYQAMjeAwhMSRu", "awCxjFKBpXA8GApZJcyqYCIwQnXVsSWSuJLQKbCZ9YvhC33uC4", 
"914842765-kur9whOqACbW7pJC1rrybJeUgAbme3rYfmOEcZgO", "gIAbk5D4DVqsbwFwzLxneOXGyQHVjnWtJGTQlg1361TLO"),
("Mryk9xUD0voaG625GLMVfHzBw", "c34WK2T7SmNNiT2CgQG4StWl47Yf9dSxxrPUG6izu63SgihQDs",
"914842765-XXwDxcP4vZcCAf5k7gDA5kuO77aBOpUp39rbJM7X", "FBHcy1x7MUPMQ8VClnTKYs8hzVdJ8RG9zfsaUWCz3yOZj"),
("6irsVtCQZQGMexYh2swanAUFS", "kuPho0piwZQLo6HulcZnJyUSbQpw63zeVIomysQaie85GwW01O",
"914842765-VOFoZWHDxX86WULBLUlmOuYeKSjbnPhs2KuQoBNa", "lJj2FN2Q9mXYYIGQuIRBJRLN0MgbfUiQ9DvFH7AiMfoZF")]  


In [7]:
api_instances = []

for credential in rotating_credentials:
    auth = tweepy.OAuthHandler(credential[0], credential[1])
    auth.set_access_token(credential[2], credential[3])
    api = tweepy.API(auth)
    api_instances.append(api)

### Read follower count statistics

In [8]:
follower_stat_cols = ['nick', 'date', 'delta_followers', 'followers_number']
followers_stats = pd.DataFrame(columns=follower_stat_cols)

In [9]:
path = "./dataset/followers_stats/*.xlsx"  # asterix notation

'''
Followers stat were created using 3rd party webpage holding followers statistics from a last month:
https://socialblade.com/twitter/user/elonmusk/monthly
We made single excel file per user holding change in number of followers
(see the /dataset/followers_stats directory for raw data)
'''
for filename in glob.glob(path):
    nick = filename.split('\\')[1].split('.xlsx')[0]
    
    followers = pd.read_excel(filename, header=None)
    followers_raw = []
    current_date = None
    delta_followers = None
    followers_number = None
    
    for index, row in followers.iterrows():
        # print(row[0])
        if index % 8 == 0:
            current_date = row[0]
        elif index % 8 == 2:
            delta_followers = row[0]
        elif index % 8 == 3:
            followers_number = row[0]
        elif index % 8 == 7:
            followers_raw.append((nick, current_date, delta_followers, followers_number))
    
    # print(f'{nick} parsed, {len(followers_raw)} rows read')
    
    followers_stats = pd.concat([followers_stats, pd.DataFrame.from_records(followers_raw, columns=follower_stat_cols)])

In [10]:
# followers_stats.groupby(['nick']).date.agg(['count'])
len(followers_stats.groupby(['nick']).date.agg(['count']))

0

In [11]:
followers_stats.dtypes

nick                object
date                object
delta_followers     object
followers_number    object
dtype: object

### Twitter API parser

#### getting content of the tweets:

In [12]:
def get_all_tweets(screen_name):
    
    iterator = 0
    
    api = api_instances[iterator % 4]
        
    alltweets = []  # here are stored all results of parse
    
    new_tweets = api.user_timeline(screen_name=screen_name, 
                                   count=200, 
                                   tweet_mode='extended')  # allows to get more than 140 chars
    
    alltweets.extend(new_tweets)  # adding new tweets to all tweets

    oldest = alltweets[-1].id - 1

    while len(new_tweets) > 0:
        
        iterator += 1
        
        if alltweets[-1].created_at < start_date:
            print('Time threshold of {} reached, going to next Twitter account'.format(start_date))
            break
        
        api = api_instances[iterator % 4]
        
        print('Credentials rotated due iterator {}, key is {}'.format(iterator, rotating_credentials[iterator % 4][0]))

        new_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', max_id=oldest)

        alltweets.extend(new_tweets)

        oldest = alltweets[-1].id - 1
        oldest_object = alltweets[-1]

        print('Downloaded so far {} tweets for user {}'.format(len(alltweets), screen_name))

    # Here I already finished getting tweet data from API
    # Below is creating list of tuples holding data from columns (part of tweet object)
    
    formated_tweets = [(screen_name, tweet.id_str, tweet.created_at,  
                        tweet.full_text.encode('utf-8'), 
                        tweet.geo, tweet.place, tweet.is_quote_status, 
                        len(tweet.entities['hashtags']) > 0, len(tweet.entities['symbols']) > 0, 
                        len(tweet.entities['user_mentions']) > 0, len(tweet.entities['urls']) > 0,
                        len(tweet.entities['hashtags']), len(tweet.entities['symbols']), 
                        len(tweet.entities['user_mentions']), len(tweet.entities['urls']),
                        tweet.coordinates, tweet.contributors, 
                        tweet.retweet_count, tweet.favorite_count,
                        tweet.favorited, tweet.retweeted, tweet.lang,
                        tweet.in_reply_to_status_id, tweet.in_reply_to_status_id_str,
                        tweet.in_reply_to_user_id, tweet.in_reply_to_user_id_str,
                        tweet.in_reply_to_screen_name) for tweet in alltweets]
    
    # there must be defined columns for the future pandas data frame
    
    labels = ['screen_name', 'id', 'created_at', 
              'text', 
              'geo', 'place', 'is_quote_status', 
              'has_hashtags', 'has_symbols', 'has_user_mentions', 'has_urls', 
              'hashtags_count', 'symbols_count', 'user_mentions_count', 'urls_count',
              'coordinates', 'contributors',  
              'retweet_count', 'favorite_count', 
              'favorited', 'retweeted', 'lang', 
              'in_reply_to_status_id', 'in_reply_to_status_id_str', 
              'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name']
    
    result = pd.DataFrame.from_records(formated_tweets, columns=labels)  # accepts list of tuples (here, formated_tweets)
    
    return result  

### Lunch downloading tweets

In [13]:
result_dict = {}

for screen_name in persons:
    print('Processing twitter handle {}'.format(screen_name))
    
    p_tweets = get_all_tweets(screen_name)
    print(f'get_all_tweets done job! length:{len(p_tweets)}')
    
    result_dict[screen_name] = p_tweets

Processing twitter handle davidcohen
Time threshold of 2018-08-10 00:00:00 reached, going to next Twitter account
get_all_tweets done job! length:200
Processing twitter handle charlesarthur
Credentials rotated due iterator 1, key is tnnfyogABdQgYQAMjeAwhMSRu
Downloaded so far 400 tweets for user charlesarthur
Credentials rotated due iterator 2, key is Mryk9xUD0voaG625GLMVfHzBw
Downloaded so far 600 tweets for user charlesarthur
Credentials rotated due iterator 3, key is 6irsVtCQZQGMexYh2swanAUFS
Downloaded so far 800 tweets for user charlesarthur
Credentials rotated due iterator 4, key is fDmPUsiDAxZkIOVBjpck4o1n3
Downloaded so far 1000 tweets for user charlesarthur
Credentials rotated due iterator 5, key is tnnfyogABdQgYQAMjeAwhMSRu
Downloaded so far 1199 tweets for user charlesarthur
Credentials rotated due iterator 6, key is Mryk9xUD0voaG625GLMVfHzBw
Downloaded so far 1399 tweets for user charlesarthur
Credentials rotated due iterator 7, key is 6irsVtCQZQGMexYh2swanAUFS
Downloaded s

Downloaded so far 599 tweets for user ajitpaifcc
Credentials rotated due iterator 3, key is 6irsVtCQZQGMexYh2swanAUFS
Downloaded so far 799 tweets for user ajitpaifcc
Time threshold of 2018-08-10 00:00:00 reached, going to next Twitter account
get_all_tweets done job! length:799
Processing twitter handle jeffweiner
Time threshold of 2018-08-10 00:00:00 reached, going to next Twitter account
get_all_tweets done job! length:200
Processing twitter handle mkbhd
Credentials rotated due iterator 1, key is tnnfyogABdQgYQAMjeAwhMSRu
Downloaded so far 399 tweets for user mkbhd
Time threshold of 2018-08-10 00:00:00 reached, going to next Twitter account
get_all_tweets done job! length:399
Processing twitter handle BoredElonMusk
Time threshold of 2018-08-10 00:00:00 reached, going to next Twitter account
get_all_tweets done job! length:200
Processing twitter handle tim_cook
Time threshold of 2018-08-10 00:00:00 reached, going to next Twitter account
get_all_tweets done job! length:200
Processing 

In [14]:
# We're joining 26 result Pandas dataframe into one
# so we can write to single CSV file

tweets = [p for p in result_dict.values()]
dataset = pd.concat(tweets)

### Merge information on followers to dataset

In [15]:
def get_delta_f(screen_name, created_at): 
    global followers_stats
    
    match = followers_stats[(followers_stats.date.dt.month == created_at.month) & 
                    (followers_stats.date.dt.day == created_at.day) & 
                    (followers_stats.nick == screen_name)]
    
    if len(match) > 0:
        delta_found = match.iloc[0].delta_followers
        if delta_found == '--':
            delta_found = 0
        return (delta_found, match.iloc[0].followers_number)
    else:
        return (None, None)

In [16]:
dataset['delta_followers'] = dataset.apply(lambda x: get_delta_f(x['screen_name'], x['created_at'])[0], axis=1)

AttributeError: ('Can only use .dt accessor with datetimelike values', 'occurred at index 0')

In [None]:
dataset['followers_count'] = dataset.apply(lambda x: get_delta_f(x['screen_name'], x['created_at'])[1], axis=1)

## Sanity checks

In [None]:
dataset.columns

In [None]:
dataset.head()

In [None]:
dataset.describe()

In [None]:
dataset.to_csv('dataset/dataset-sv-201808.csv', index=False)

print('Done exporting dataset to CSV files!')
print(f'Together {len(dataset)} rows saved')

Conslusions:

Dataset exported to single CSV file