## 1.1 Load Tokens and base functions
refer: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all

https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query

In [1]:
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
#To add wait time between requests
import time
import numpy as np
import yaml
def auth():
    """Return the bearer token held in the ``TOKEN`` environment variable.

    Returns
    -------
    str or None
        The value of ``TOKEN``, or ``None`` when the variable is not set.
    """
    token = os.environ.get('TOKEN')
    return token

def create_headers(bearer_token):
    """Build the HTTP headers that carry ``bearer_token`` as a Bearer credential.

    Parameters
    ----------
    bearer_token : str
        Token to place in the ``Authorization`` header.

    Returns
    -------
    dict
        ``{"Authorization": "Bearer <token>"}``
    """
    return {"Authorization": f"Bearer {bearer_token}"}

# Read the API credentials from the local YAML key file. The ``with`` block
# closes the file on exit, so no explicit close() call is needed.
# NOTE(review): yaml.full_load can build more than plain data types; if the
# key file only holds nested mappings/strings, yaml.safe_load is the safer choice.
with open('twitter_keys.yml') as file:
    twitter_keys=yaml.full_load(file)

bear_key=twitter_keys['search_tweets_v2']['bearer_token']

# auth() reads the token back from the TOKEN environment variable.
os.environ['TOKEN'] = bear_key
bearer_token = auth()
headers = create_headers(bearer_token)



def create_url_username(usernames_list):
    """Return the user-lookup endpoint and query params for a batch of usernames.

    Parameters
    ----------
    usernames_list : list or str
        Usernames to look up (a single string is treated as one username).
        Intended length is 1 to 100 names per call. Entries that are not
        strings are reported and left out of the query; the caller's list is
        not modified. Any '@' characters are stripped.

    Returns
    -------
    search_url : str
        https://api.twitter.com/2/users/by
    query_params : dict
        'usernames' (comma-separated names), 'user.fields' (id, name,
        created_at, description, public_metrics, verified) and a
        'next_token' placeholder that connect_to_endpoint overwrites.
    """
    search_url ="https://api.twitter.com/2/users/by"
    if isinstance(usernames_list,str):
        usernames_list=[usernames_list]

    # Build a filtered copy instead of removing items from the list being
    # iterated: in-place removal skips the element after each removed one and
    # silently changes the caller's list.
    valid_names=[]
    for name in usernames_list:
        if isinstance(name,str):
            valid_names.append(name)
        else:
            print("%s is not str, will be removed from query."%(name,))

    username_s=','.join(valid_names).replace('@','')

    query_params = {'usernames': username_s,
                    'user.fields': 'id,name,created_at,description,public_metrics,verified',
                    'next_token': {}}
    return search_url, query_params




def connect_to_endpoint(url, headers, params, next_token = None):
    """Issue a GET request against ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to call.
    headers : dict
        HTTP headers sent with the request.
    params : dict
        Query parameters. Its 'next_token' entry is overwritten in place with
        ``next_token`` before the request is sent.
    next_token : str, optional
        Pagination token for the request. The default is None.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    Exception
        With ``(status_code, response_text)`` when the status code is not 200.
    """
    # The params object comes from one of the create_url_* helpers.
    params['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = params)
    status = response.status_code
    print("Endpoint Response Code: " + str(status))
    if status == 200:
        return response.json()
    raise Exception(status, response.text)



def create_url_id(user_id,start_time=None,end_time=None,exclude='replies'):
    """Return the users-lookup endpoint and query params for one or more user IDs.

    Parameters
    ----------
    user_id : str, int, list or tuple
        A single user ID or a collection of IDs; a collection is sent as one
        comma-separated value.
    start_time, end_time, exclude : optional
        Accepted for backward compatibility; this function does not use them.

    Returns
    -------
    search_url : str
        https://api.twitter.com/2/users
    query_params : dict
        'ids', 'expansions', 'tweet.fields', 'user.fields' and a 'next_token'
        placeholder.
    """
    search_url = "https://api.twitter.com/2/users"
    if isinstance(user_id,(list,tuple)):
        ids=','.join(str(uid) for uid in user_id)
    else:
        ids=str(user_id)
    # The lookup-by-ID endpoint takes a comma-separated `ids` parameter, and
    # field lists must be comma-separated without embedded spaces.
    query_params = {'ids': ids,
                    'expansions': 'pinned_tweet_id',
                    'tweet.fields': 'id,text,author_id,created_at,public_metrics,entities',
                    'user.fields': 'created_at,description,entities,id,name,pinned_tweet_id,public_metrics,url,username,verified',
                    'next_token': {}}
    return (search_url, query_params)


def create_url_full(query, start_time, end_time, max_results = 500):
    """Return the full-archive tweet search endpoint and its query params.

    Parameters
    ----------
    query : str
        Search query string.
    start_time, end_time :
        Bounds of the search window; each is parsed with ``pd.to_datetime``
        as UTC and sent as an ISO-8601 string.
    max_results : int, optional
        Value sent as 'max_results'. The default is 500.

    Returns
    -------
    (search_url, query_params) : tuple of (str, dict)
    """
    def _as_utc_iso(moment):
        # Parse the value as a UTC timestamp and render it in ISO-8601 form.
        return pd.to_datetime(moment,utc=True).isoformat()

    # Change this to the endpoint you want to collect data from.
    search_url = "https://api.twitter.com/2/tweets/search/all"

    # Adjust these params to the endpoint in use.
    query_params = {'query': query}
    query_params['start_time'] = _as_utc_iso(start_time)
    query_params['end_time'] = _as_utc_iso(end_time)
    query_params['max_results'] = max_results
    query_params['expansions'] = 'author_id,referenced_tweets.id'
    query_params['tweet.fields'] = 'created_at,id,text,public_metrics,referenced_tweets,author_id'
    query_params['user.fields'] = 'id,name,username,created_at,description,public_metrics,verified'
    query_params['next_token'] = {}
    return (search_url, query_params)
    

## 1.2 Core Functions: get users' information, get all tweets of an individual user, and build a custom query string

In [2]:

def get_user_information(usernames_list,save_csv=False):
    """Look up users by username and return their profile data as a data frame.

    Parameters
    ----------
    usernames_list : list or str
        Usernames to look up; a single string is treated as one username.
        username format:  ^[A-Za-z0-9_]{1,15}  e.g 'TwitterDev'
    save_csv : bool, optional
        Whether to save the result as a csv file. The default is False.

    Returns
    -------
    user_info_df : data frame
        One row per user returned by the API. The 'public_metrics' dict is
        replaced by the columns followers_count, following_count and
        listed_count. The frame is empty when no user data was returned.
    """
    # A bare string is one username; wrap it so batching slices names rather
    # than characters.
    if isinstance(usernames_list,str):
        usernames_list=[usernames_list]
    usernames_list=list(usernames_list)

    batch_size=100  # one request carries at most 100 usernames
    user_info=[]
    for start in range(0,len(usernames_list),batch_size):
        sub_usernames=usernames_list[start:start+batch_size]
        search_url,query_params=create_url_username(sub_usernames)
        json_response=connect_to_endpoint(search_url, headers, query_params)
        # NOTE(review): 'data' is presumably missing when none of the requested
        # users could be resolved; treat that as "no rows" instead of raising.
        user_info.extend(json_response.get('data') or [])

    user_info_df=pd.DataFrame(user_info)

    if 'public_metrics' in user_info_df.columns:
        # Expand the nested metrics dicts in one step instead of cell by cell.
        metrics_df=pd.DataFrame(user_info_df['public_metrics'].tolist(),
                                index=user_info_df.index)
        user_info_df=user_info_df.drop(columns=['public_metrics'])
        for col in ('followers_count','following_count','listed_count'):
            # Kept as float so the column dtype matches earlier saved results.
            user_info_df[col]=metrics_df[col].astype(float)

    if save_csv:
        user_info_df.to_csv("/data/workspace_files/user_information.csv")

    return user_info_df


def get_user_all_tweets(user_id,start_time,end_time,keyword=None,hash_tag=None,
                        exclude_retweet=False,exclude_reply=True,
                        exclude_promotion=True,max_count=3200):
    """Get the tweets of a single user from the full-archive search as a data frame.

    The result includes created time, text, tweet id, author id and the
    public metrics (one column per metric).

    Parameters
    ----------
    user_id : str or int
        Numeric user ID; it is converted with ``int()`` for the query.
    start_time : str YYYY-MM-DD
        The oldest UTC timestamp from which the Tweets will be provided.
    end_time : str YYYY-MM-DD
        The newest, most recent UTC timestamp to which the Tweets will be provided.
    keyword : str or list, optional
        Matches a keyword within the body of a Tweet (tokenized match).
    hash_tag : str or list, optional
        Matches any Tweet containing a recognized hashtag. The default is None.
    exclude_retweet : bool, optional
        Whether to exclude retweets. The default is False.
    exclude_reply : bool, optional
        Whether to exclude replies. The default is True.
    exclude_promotion : bool, optional
        Whether to exclude all promotions. The default is True.
    max_count : int, optional
        Stop requesting further pages once this many tweets were collected.
        Whole pages are kept, so the result can exceed it. The default is 3200.

    Returns
    -------
    user_tweets_df : data frame
        All collected tweets. When a tweet references another tweet whose text
        was returned by the API, 'text' holds the referenced tweet's text. The
        'referenced_tweets' column is always removed.
        If the very first page has no data, a one-row frame holding the
        response 'meta' is returned instead (unchanged legacy behaviour).
    """
    # Build the standard query restricted to this author.
    query_str=build_twitter_query(keyword,from_id=int(user_id),hash_tag=hash_tag,exclude_retweet=exclude_retweet,
                                  exclude_reply=exclude_reply,exclude_promotion=exclude_promotion)

    print("query string: ",query_str)
    count=0
    next_token=None
    user_tweets_list=[]
    refer_tweets_list=[]

    # Page through the results until max_count is reached or no further
    # pagination token is returned.
    while count < max_count:
        print("-------------------")
        print("Token: ", next_token)
        url = create_url_full(query_str, start_time,end_time)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        meta = json_response['meta']
        data_list = json_response.get('data')
        if not data_list:
            if not user_tweets_list:
                print('Tweets of User ID: %s are protected, no access.'%(user_id))
                return pd.DataFrame([meta])
            # An empty later page must not throw away what was already collected.
            break

        result_count = meta.get('result_count') or len(data_list)
        user_tweets_list.extend(data_list)
        # 'includes' / 'tweets' are only read when present in the response.
        refer_tweets_list.extend((json_response.get('includes') or {}).get('tweets') or [])
        count += result_count
        print("Total # of Tweets added: ", result_count)
        print("Oldest time of tweets",data_list[-1]['created_at'])
        print("-------------------")

        next_token = meta.get('next_token')
        if next_token is None:
            # This was the final page.
            break
        print("Next Token: ", next_token)
        time.sleep(3)  # pause before requesting the next page

    print("Total number of tweets retrived: ",count)

    if not user_tweets_list:
        # Only reachable when max_count <= 0: nothing was requested.
        return pd.DataFrame()

    # Turn the collected response data into a data frame.
    user_tweets_df=pd.DataFrame(user_tweets_list)

    # Expand the nested public_metrics dicts into one float column per metric.
    metrics_df=pd.DataFrame(user_tweets_df['public_metrics'].tolist(),
                            index=user_tweets_df.index).astype(float)
    user_tweets_df=user_tweets_df.drop(columns='public_metrics')
    for col in metrics_df.columns:
        user_tweets_df[col]=metrics_df[col]

    if 'referenced_tweets' in user_tweets_df.columns:
        # id -> text lookup; a dict tolerates the same tweet being referenced
        # on several pages.
        ref_text={tweet['id']: tweet.get('text') for tweet in refer_tweets_list}

        def _resolve_text(row):
            refs=row['referenced_tweets']
            # Rows without references hold NaN here; referenced rows hold a list.
            if isinstance(refs,list) and refs:
                replacement=ref_text.get(refs[0].get('id'))
                if replacement is not None:
                    return replacement
            return row['text']

        if ref_text:
            user_tweets_df['text']=user_tweets_df.apply(_resolve_text,axis=1)
        user_tweets_df=user_tweets_df.drop(columns='referenced_tweets')

    # Store created_at as timezone-naive datetimes.
    user_tweets_df['created_at']=pd.to_datetime(user_tweets_df['created_at']).dt.tz_localize(None)
    return user_tweets_df


def build_twitter_query(keyword=None,from_id=None,emoji=None,hash_tag=None,lang='en',
                        to_id=None,exclude_retweet=True,exclude_reply=True,
                        exclude_promotion=True):
    """Build a custom query for full tweets search and return the query string.

    Only the parts that were supplied are included, joined by single spaces
    (no leading space or doubled spaces). The query is also printed.

    Parameters
    ----------
    keyword : str or list, optional
        Matches a keyword within the body of a Tweet (tokenized match).
        A list is combined with OR inside parentheses.
    from_id : str or int, optional
        Matches any Tweet from a specific user: the username (excluding the
        @ character) or the user's numeric user ID.
    emoji : str, optional
        Matches an emoji within the body of a Tweet.
    hash_tag : str or list, optional
        Matches any Tweet containing a recognized hashtag. Tags may be given
        with or without the leading '#'. A list is combined with OR.
        The default is None.
    lang : str, optional
        Language. The default is 'en'. ``None`` or '' omits the operator.
    to_id : str or int, optional
        Matches any Tweet that is in reply to a particular user: the username
        (excluding the @ character) or the user's numeric user ID.
    exclude_retweet : bool, optional
        Whether to exclude retweets. The default is True.
    exclude_reply : bool, optional
        Whether to exclude replies. The default is True.
    exclude_promotion : bool, optional
        Whether to exclude all promotions. The default is True.

    Returns
    -------
    query_str : str
        Formatted query string.
    """
    def _or_group(terms):
        # Join several alternatives with OR; an empty collection yields no
        # query part at all rather than an invalid "()".
        terms=[term for term in terms if term]
        return '('+' OR '.join(terms)+')' if terms else ''

    keyword_str=''
    if isinstance(keyword,str):
        keyword_str=keyword
    elif isinstance(keyword,(list,tuple)):
        keyword_str=_or_group(keyword)

    from_id_str='(from:%s)'%(from_id,) if from_id else ''
    to_id_str='(to:%s)'%(to_id,) if to_id else ''

    emoji_str=emoji if emoji else ''

    hash_tag_str=''
    if isinstance(hash_tag,str):
        hash_tag_str='#'+hash_tag.lstrip('#') if hash_tag.lstrip('#') else ''
    elif isinstance(hash_tag,(list,tuple)):
        # Build a new list so the caller's hash_tag argument is left untouched.
        hash_tag_str=_or_group(['#'+tag.lstrip('#') for tag in hash_tag if tag.lstrip('#')])

    lang_str='lang:%s'%(lang,) if lang else ''

    parts=[keyword_str,from_id_str,to_id_str,lang_str,emoji_str,hash_tag_str]
    if exclude_retweet:
        parts.append('-is:retweet')
    if exclude_reply:
        parts.append('-is:reply')
    if exclude_promotion:
        parts.append('-is:nullcast')

    # Skip empty parts so the query has no leading or repeated spaces.
    query_str=' '.join(part for part in parts if part)
    print(query_str)
    return query_str


def get_sample_tweets_by_day(start_time,hash_tag=['btc','BTC','bitcoin','Bitcoin','eth','ETH','ethereum','Ethereum','crypto','Crypto']):
    """Collect a sample of hashtag-matching tweets for one day, one request per hour.

    The day starting at ``start_time`` is split into 24 one-hour windows; each
    window is queried once with ``max_results=100`` and the results are
    combined. Retweets, replies and promotions are excluded.

    Parameters
    ----------
    start_time : pandas.Timestamp or str
        Start of the day to sample; anything ``pd.Timestamp`` accepts.
    hash_tag : list or str, optional
        Hashtags to match (combined with OR). The default list is never
        modified by this function.

    Returns
    -------
    user_tweets_df : data frame
        Sampled tweets with one column per public metric, without the
        'referenced_tweets' column and with timezone-naive 'created_at'.
        An empty frame is returned when no tweet was found for the day.
    """
    start_time=pd.Timestamp(start_time)
    end_time=start_time+pd.offsets.Day(1)
    query_str=build_twitter_query(exclude_promotion=True, exclude_reply=True, exclude_retweet=True,hash_tag=hash_tag)

    window_start=start_time
    window_end=window_start+pd.offsets.Hour(1)
    user_tweets_list=[]
    count=0
    while window_end<=end_time:
        url = create_url_full(query_str, window_start,window_end,max_results=100)
        json_response = connect_to_endpoint(url[0], headers, url[1])
        data_list=json_response.get('data')
        if data_list:
            user_tweets_list.extend(data_list)
            count+=json_response['meta']['result_count']
        # Move on to the next one-hour window.
        window_start=window_end
        window_end=window_start+pd.offsets.Hour(1)
        time.sleep(1)

    user_tweets_df=pd.DataFrame(user_tweets_list)
    print("Tweets Sampling size from %s to %s : %d"%(start_time,end_time,count))
    if user_tweets_df.empty:
        # No hourly window returned any tweet; there is nothing to reshape.
        return user_tweets_df

    # Expand the nested public_metrics dicts into one float column per metric.
    metrics_df=pd.DataFrame(user_tweets_df['public_metrics'].tolist(),
                            index=user_tweets_df.index).astype(float)
    user_tweets_df=user_tweets_df.drop(columns='public_metrics')
    for col in metrics_df.columns:
        user_tweets_df[col]=metrics_df[col]

    # The column is absent when no sampled tweet references another tweet.
    user_tweets_df=user_tweets_df.drop(columns='referenced_tweets',errors='ignore')
    user_tweets_df['created_at']=pd.to_datetime(user_tweets_df['created_at']).dt.tz_localize(None)

    return user_tweets_df

## 2.1 Usage Examples

### 2.2 Get Users' information by usernames

In [3]:

"get all usernames(top_1 has 194 valid usernames)"
# Read the influencer list, keep the first row for each username, and look up
# the account details of every remaining username.
Top_200_df = pd.read_csv('Top_200.csv', index_col=0).drop_duplicates(subset='username')
usernames = Top_200_df['username'].tolist()
user_info_df = get_user_information(usernames_list=usernames, save_csv=False)
user_info_df

Endpoint Response Code: 200
Endpoint Response Code: 200


Unnamed: 0,created_at,description,id,username,name,verified,followers_count,following_count,listed_count
0,2011-05-08T16:03:03.000Z,Ethereum.\n\nFable of the Dragon Tyrant (not m...,295218901,VitalikButerin,vitalik.eth,True,2431170.0,276.0,19487.0
1,2008-04-09T02:03:48.000Z,Creator of Litecoin.\nCryptocurrency Enthusias...,14338147,SatoshiLite,Charlie Lee [LTC⚡],True,1017811.0,489.0,10211.0
2,2012-11-21T00:03:03.000Z,"Iconoclast.\nLover of women, adventure and mys...",961445378,officialmcafee,John McAfee,True,1159584.0,13968.0,7133.0
3,2013-05-30T06:33:49.000Z,"#Bitcoin & Open Blockchains, since 2012. \n\nA...",1469101279,aantonop,Andreas (BEWARE of giveaway scams!),True,625839.0,0.0,9916.0
4,2009-02-15T01:07:18.000Z,"Founder of @DFJvc, @drapervc, @Draper_U, @IYS_...",20884310,TimDraper,Tim Draper,True,203915.0,2474.0,3546.0
...,...,...,...,...,...,...,...,...,...
186,2008-05-22T20:28:49.000Z,"Cofounder @GrowClovyr, Board @ZcashFoundation ...",14872837,AmberBaldet,Amber ☘️,False,50409.0,1543.0,1548.0
187,2018-06-23T09:35:43.000Z,"Blockchain enthusiasts, Hyperledger evangelist...",1010456357982625792,mpiekarska8,Marta Piekarska,False,728.0,73.0,25.0
188,2009-10-24T13:46:19.000Z,"CEO, https://t.co/NppXAI5QRm",84850888,Kris_HK,Kris | Crypto.com,True,77903.0,7431.0,596.0
189,2007-05-10T23:39:54.000Z,,5943622,pmarca,Marc Andreessen,True,853156.0,20596.0,13429.0


In [33]:
# Persist the de-duplicated influencer list and the looked-up user details.
# NOTE(review): this writes back to 'Top_200.csv', the same file the list was
# read from — confirm that overwriting the input is intended.
Top_200_df.to_csv(path_or_buf='Top_200.csv')
user_info_df.to_csv(path_or_buf='user_information.csv')

### 2.3 Get all tweets of an individual user by user id

#### 2.3.1 Single user 

In [4]:
"load top 100 data"
# Drop every row that has a missing value. axis/how are passed by keyword:
# pandas deprecated the positional form and rejects it from version 2.0 on.
Top_200_df=Top_200_df.dropna(axis=0,how='any')
# NOTE(review): iloc[:101] keeps 101 rows although the frame is named
# Top_100 — confirm the intended cut-off.
Top_100_df=Top_200_df.iloc[:101]
Top_100_id_list=Top_100_df.id.astype(int).to_list()
Top_100_username_list=Top_100_df.username.to_list()
Top_100_df

Unnamed: 0,name,username,id
0,Vitalik Buterin,VitalikButerin,2.952189e+08
1,Charlie Lee,SatoshiLite,1.433815e+07
2,John McAfee,officialmcafee,9.614454e+08
3,Andreas M. Antonopoulos,aantonop,1.469101e+09
4,Tim Draper,TimDraper,2.088431e+07
...,...,...,...
98,Brock Pierce,brockpierce,2.946858e+07
99,???TF%$D!,CryptoHustle,3.179873e+09
100,Jerry Brito,jerrybrito,7.945320e+05
101,Crypto Cred,CryptoCred,8.995583e+17


In [12]:
# Pick one account by its index label and resolve its numeric user id.
i = 12
username_i = Top_200_df['username'][i]
user_id_i = get_user_information(username_i).loc[0, 'id']
print("Username: %s , user id: %s"%(username_i,user_id_i))

Endpoint Response Code: 200
Username: TuurDemeester , user id: 40742821


In [15]:
"Get the tweets of Vitalik Buterin during 2016-01-01 to 2021-08-01, including retweets but excluding replies and promotions"

# Time window of the download.
# NOTE(review): the cell description mentions Vitalik Buterin, but the call
# below uses user_id_i as selected in an earlier cell — confirm which user is
# intended.
start_time = '2016-01-01'
end_time = '2021-08-01'

# Retweets are kept; replies and promotions are excluded.
user_tweets_df = get_user_all_tweets(
    user_id_i,
    start_time,
    end_time,
    exclude_retweet=False,
    exclude_reply=True,
    exclude_promotion=True,
    max_count=32000,
)
    

 (from:40742821)  lang:en   -is:reply -is:nullcast
query string:   (from:40742821)  lang:en   -is:reply -is:nullcast
-------------------
Token:  None
Endpoint Response Code: 200
Tweets of User ID: 40742821 are protected, no access.


In [7]:
"save results to Tweets folder"
# Write the tweets of the selected user to a csv file named after the username,
# then display the frame.
# NOTE(review): the target directory is 'Tweets2', while the later download
# loop and summary use 'Tweets' — confirm which one is intended.
user_tweets_df.to_csv(path_or_buf='/data/workspace_files/Tweets2/%s_tweets.csv'%username_i)
user_tweets_df

Unnamed: 0,created_at,id,author_id,text,retweet_count,reply_count,like_count,quote_count
0,2021-07-29 11:58:57,1420715466411020289,295218901,🚀NEW EPISODE🚀\n@seb2point0 and @sunnya97 talk ...,180.0,0.0,0.0,0.0
1,2021-07-29 05:26:18,1420616651150528516,295218901,Against overuse of the Gini coefficient:\n\nht...,439.0,636.0,2771.0,52.0
2,2021-07-20 15:03:00,1417500293563142151,295218901,Retroactive public goods funding!\n\nCollabora...,548.0,742.0,3247.0,68.0
3,2021-07-20 14:10:55,1417487184467275781,295218901,During the last few weeks there has been a lot...,1113.0,785.0,5573.0,112.0
4,2021-07-01 15:43:53,1410625211494436870,295218901,When https://t.co/Cf4OB6fQsT meets #MEVroast.....,420.0,1129.0,2616.0,41.0
...,...,...,...,...,...,...,...,...
2955,2016-01-03 17:49:52,683706858574450689,295218901,https://t.co/JQTx0rgT71 I recommend Uber just ...,5.0,1.0,9.0,0.0
2956,2016-01-03 02:05:46,683469268579565572,295218901,"""Next year is the year for Bitcoin” is the new...",34.0,0.0,0.0,0.0
2957,2016-01-02 17:20:14,683337015430021121,295218901,RT @Falkvinge on the first ten years of the Pi...,1.0,0.0,2.0,0.0
2958,2016-01-02 17:19:58,683336946756616195,295218901,I will bet you 1 million USD that the US dolla...,199.0,26.0,393.0,0.0


#### 2.3.2 Using a loop to get results for all targets

In [12]:
# Download the tweets of the first 103 looked-up users, one csv file per user.
for i in range(103):
    username_i = user_info_df.at[i, 'username']
    user_id_i = user_info_df.at[i, 'id']
    print("Username: %s , user id: %s"%(username_i,user_id_i))
    start_time = '2016-01-01'
    end_time = '2021-08-01'

    # Retweets are kept; replies and promotions are excluded.
    user_tweets_df = get_user_all_tweets(
        user_id_i,
        start_time,
        end_time,
        exclude_retweet=False,
        exclude_reply=True,
        exclude_promotion=True,
        max_count=32000,
    )

    user_tweets_df.to_csv(path_or_buf='/data/workspace_files/Tweets/%s_tweets.csv'%username_i)

Username: CryptoCred , user id: 899558268795842561
 (from:899558268795842561)  lang:en   -is:reply -is:nullcast
query string:   (from:899558268795842561)  lang:en   -is:reply -is:nullcast
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fosevp5zr6735snx23bjqvty1fniil
Total # of Tweets added:  197
Oldest time of tweets 2020-12-16T16:04:25.000Z
-------------------
-------------------
Token:  b26v89c19zqg8o3fosevp5zr6735snx23bjqvty1fniil
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fo6vf3m4ngb09fjrbarxqxfln7l2t9
Total # of Tweets added:  247
Oldest time of tweets 2019-12-05T00:48:17.000Z
-------------------
-------------------
Token:  b26v89c19zqg8o3fo6vf3m4ngb09fjrbarxqxfln7l2t9
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fn0jmlyb9pankwjktgo74z6vgn2vlp
Total # of Tweets added:  220
Oldest time of tweets 2018-08-27T18:01:31.000Z
-------------------
-------------------
Token:  b26v89c19zqg8o3fn0jmlyb9pankwjktgo74z6vgn2vlp
Endpo

#### 2.3.3 Summarize the downloaded Top-100 data

In [6]:
# List the per-user csv files that were downloaded so far.
tweet_dir = '/data/workspace_files/Tweets'
tweet_files = os.listdir(path=tweet_dir)
print('Directory contains {} files'.format(len(tweet_files)), tweet_files, sep='\n')

Directory contains 100 files
['tyler_tweets.csv', 'TheCryptoDog_tweets.csv', 'TimDraper_tweets.csv', 'iamjosephyoung_tweets.csv', 'haydentiff_tweets.csv', 'jgarzik_tweets.csv', 'paulvigna_tweets.csv', 'zooko_tweets.csv', 'WarrenWhitlock_tweets.csv', 'NickSzabo4_tweets.csv', 'VinnyLingham_tweets.csv', 'cburniske_tweets.csv', 'CarpeNoctom_tweets.csv', 'cdixon_tweets.csv', 'crypto_rand_tweets.csv', 'VladZamfir_tweets.csv', 'Jasmine_tweets.csv', 'FEhrsam_tweets.csv', 'WhalePanda_tweets.csv', 'davidgerard_tweets.csv', 'SatoshiLite_tweets.csv', 'CharlieShrem_tweets.csv', 'jonmatonis_tweets.csv', 'peterktodd_tweets.csv', 'JoshRoomsburg_tweets.csv', 'kevinrose_tweets.csv', 'thomaspower_tweets.csv', 'APompliano_tweets.csv', 'nic__carter_tweets.csv', 'starkness_tweets.csv', 'lopp_tweets.csv', 'obussmann_tweets.csv', 'ProfFaustus_tweets.csv', 'kwerb_tweets.csv', 'woonomic_tweets.csv', 'efipm_tweets.csv', 'IOHK_Charles_tweets.csv', 'adam3us_tweets.csv', 'PerianneDC_tweets.csv', 'CryptoYoda1338_twe

In [10]:

# Read the id columns as strings so the ids are kept exactly as written in
# the csv files.
dtypes = {
    'id': str,
    'author_id': str
}

# Summarize the downloaded tweets for (at most) the first 102 looked-up users.
data_info_df=user_info_df.iloc[:102].copy()
# Iterate over the rows that actually exist instead of a hard-coded count, so
# a shorter user table does not raise a KeyError.
for i in data_info_df.index:
    username_i=data_info_df.loc[i,'username']
    user_id_i=data_info_df.loc[i,'id']
    file_name='%s_tweets.csv'%username_i
    if file_name in tweet_files:
        file=file_name
        df = pd.read_csv(os.path.join(tweet_dir, file), 
                         dtype=dtypes, index_col=0, lineterminator='\n')
        tot_num_tweets=len(df)

        retweet_mean=df['retweet_count'].mean()
        reply_count_mean=df['reply_count'].mean()
        like_count_mean=df['like_count'].mean()

        data_info_df.loc[i,'tweet_count']=tot_num_tweets
        data_info_df.loc[i,'retweet_count']=retweet_mean
        data_info_df.loc[i,'reply_count']=reply_count_mean
        data_info_df.loc[i,'like_count']=like_count_mean
    else:
        print("%s is not contained in the folder"%(username_i))


# Users without a downloaded file have NaN summary columns and are removed
# here. axis/how are passed by keyword because pandas 2.0 rejects the
# positional form.
# NOTE(review): this also removes users with any other missing field (e.g. an
# empty description) — confirm that is intended.
data_info_df=data_info_df.dropna(axis=0,how='any')
print(len(data_info_df))
data_info_df

TuurDemeester is not contained in the folder
cryptomanran is not contained in the folder
100


Unnamed: 0,id,verified,description,name,username,created_at,followers_count,following_count,listed_count,tweet_count,retweet_count,reply_count,like_count
0,295218901,True,Ethereum.\n\nFable of the Dragon Tyrant (not m...,vitalik.eth,VitalikButerin,2011-05-08T16:03:03.000Z,2397191.0,274.0,19190.0,2960.0,606.593581,43.326689,382.992905
1,14338147,True,Creator of Litecoin.\nCryptocurrency Enthusias...,Charlie Lee [LTC⚡],SatoshiLite,2008-04-09T02:03:48.000Z,1013934.0,488.0,10188.0,3516.0,813.480375,34.687429,381.102389
2,961445378,True,"Iconoclast.\nLover of women, adventure and mys...",John McAfee,officialmcafee,2012-11-21T00:03:03.000Z,1159462.0,13986.0,7142.0,8373.0,160.498029,86.329512,656.246984
3,1469101279,True,"#Bitcoin & Open Blockchains, since 2012. \n\nA...",Andreas (BEWARE of giveaway scams!),aantonop,2013-05-30T06:33:49.000Z,623550.0,0.0,9904.0,8803.0,1220.865955,7.300466,115.306486
4,20884310,True,"Founder of @DFJvc, @drapervc, @Draper_U, @IYS_...",Tim Draper,TimDraper,2009-02-15T01:07:18.000Z,202740.0,2474.0,3536.0,1119.0,96.571939,20.760500,190.996425
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,29468585,False,"Visionary, Cryptopioneer, Entrepreneur, Impact...",Brock Pierce,brockpierce,2009-04-07T15:19:56.000Z,73177.0,6421.0,1968.0,7272.0,292.499862,1.768152,14.823020
97,3179873194,False,Radical Centrist. Truth seeker + Activist. Cap...,฿TF%$Brrrrrr!,CryptoHustle,2015-04-18T15:43:34.000Z,126176.0,873.0,2824.0,4828.0,286.948012,5.369097,72.402444
98,794532,False,"Executive director of @coincenter, the DC-base...",Jerry Brito,jerrybrito,2007-02-26T03:18:37.000Z,47052.0,536.0,1371.0,615.0,154.858537,3.751220,41.676423
99,899558268795842561,False,Trader. @RoundupCrypto,Cred,CryptoCred,2017-08-21T09:06:19.000Z,355569.0,864.0,6757.0,1412.0,73.639518,28.989377,485.973088


In [11]:
# Renumber the rows 0..n-1 after the dropped users, save the summary and show
# the first rows.
data_info_df.index = pd.RangeIndex(start=0, stop=len(data_info_df))
data_info_df.to_csv(path_or_buf='/data/workspace_files/Top100_info.csv')
data_info_df.head()

In [13]:
import plotly.express as px
# Rank follows the row order; derive it from the actual number of rows so the
# cell does not fail when the summary holds a number of users other than 100.
data_info_df['rank']=np.arange(1,len(data_info_df)+1)
data_info_df['log_followers']=np.log(data_info_df.followers_count)
data_info_df['log_like_count']=np.log(data_info_df['like_count'])
# Followers (log scale) against rank; marker colour shows the log of the mean
# like count and marker size the number of tweets. An OLS trendline is added.
fig = px.scatter(data_info_df, x="rank", y="log_followers",title='Public metrics of Top100 Crypto Influencers' ,
                 color="log_like_count", size='tweet_count',hover_data=['username','retweet_count'],trendline="ols")

fig.update_traces(textposition="bottom right")
fig.update_layout(
                    xaxis_title='Rank',
                    yaxis_title='Log(number of followers)')
fig.show()

Unsupported

## 2.4 Get Daily semi-random sample tweets

In [8]:

# First day of every month in the sampling period; consecutive entries bound
# one month each.
month_range=pd.date_range('2016-01-01','2021-08-01',freq='MS')

N=len(month_range)
for i in range(0,N-1):
    start_time_tot=month_range[i]
    print('Current month: ',start_time_tot.date())
    # Daily timestamps from the first of this month to the first of the next;
    # the last one is excluded below because it belongs to the next month.
    daily_range=pd.date_range(start_time_tot,month_range[i+1])
    n=len(daily_range)
    # Collect the daily frames and concatenate once per month instead of
    # re-copying a growing frame on every day.
    daily_frames=[]
    for j in range(n-1):
        start_time=daily_range[j]
        daily_tweets_df=get_sample_tweets_by_day(start_time,hash_tag=['btc','BTC','bitcoin','Bitcoin','eth','ETH','ethereum','Ethereum','crypto','Crypto'])
        daily_frames.append(daily_tweets_df)
        time.sleep(60)
    monthly_tweets_df=pd.concat(daily_frames,axis=0) if daily_frames else pd.DataFrame()
    # Save before the long pause so an interruption during the wait does not
    # lose the month that was just collected.
    monthly_tweets_df.to_csv('/data/workspace_files/Random Tweets/%s_tweets.csv'%(start_time_tot.date()))
    time.sleep(120)


Current month:  2021-02-01
   lang:en  (#btc OR #BTC OR #bitcoin OR #Bitcoin OR #eth OR #ETH OR #ethereum OR #Ethereum OR #crypto OR #Crypto) -is:retweet -is:reply -is:nullcast
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
Tweets Sampling size from 2021-02-01 00:00:00 to 2021-02-02 00:00:00 : 2288
   lang:en  (#btc OR #BTC OR #bitcoin OR #Bitcoin OR #eth OR #ETH OR #ether