In [1]:
# IMPORT PACKAGES
#
## tweepy - api to get data from twitter
import tweepy as tw
## hiding API keys - ref: https://youtu.be/CJjSOzb0IYs
import apikeys
## data manipulation
import pandas as pd
## date transformation
from datetime import datetime, timedelta

In [2]:
# CREATE CONNECTION WITH TWITTER API

## Authentication Tokens
my_bearer = apikeys.bearer
my_key = apikeys.key
my_secret = apikeys.secret
my_token = apikeys.token
my_token_secret = apikeys.token_secret

## creating client object
client = tw.Client(
    bearer_token=my_bearer, 
    consumer_key=my_key, 
    consumer_secret=my_secret, 
    access_token=my_token, 
    access_token_secret=my_token_secret
    )

## creating API object
# authorization of consumer key and consumer secret
auth = tw.OAuthHandler(
    consumer_key=my_key, 
    consumer_secret=my_secret
    )
# set access to user's access key and access secret 
auth.set_access_token(my_token, my_token_secret)
# calling the api 
api = tw.API(auth)

In [3]:
# PROFILE QUANTITY MENTIONS

# Replace with your own search query
query_list = ['@LulaOficial', '@jairbolsonaro', '@cirogomes', '@simonetebetbr']
query_filter = ' -is:retweet'

## all recent tweets mentions
counts_recent = client.get_recent_tweets_count(query=query_list[0], granularity='hour')
## recent tweets mentions without retweets
counts_recent_filtered = client.get_recent_tweets_count(query=query_list[0] + query_filter, granularity='hour')

## creating the dataframe and removing the first and the last range
df_count = pd.DataFrame(counts_recent[0], columns=['start', 'end', 'tweet_count']).rename(columns={"tweet_count": query_list[0] + '_mentions'}).iloc[1:len(counts_recent[0])-1].reset_index(drop=True)
## convert date columns from string to datetime
df_count['start'] = pd.to_datetime(df_count['start'], utc=True).map(lambda x: x.tz_convert('America/Bahia'))
df_count['end'] = pd.to_datetime(df_count['end'], utc=True).map(lambda x: x.tz_convert('America/Bahia'))
## split datetime into date and time
df_count['start_date'] = pd.to_datetime(df_count['start']).dt.date
df_count['start_time'] = pd.to_datetime(df_count['start']).dt.time
df_count['end_date'] = pd.to_datetime(df_count['end']).dt.date
df_count['end_time'] = pd.to_datetime(df_count['end']).dt.time
## reorder the columns sequence
df_count = df_count.loc[:, ['start', 'start_date', 'start_time', 'end', 'end_date', 'end_time', query_list[0] + '_mentions']]
## get tweet count without retweets
df_temp = pd.DataFrame(counts_recent_filtered[0], columns=['tweet_count']).rename(columns={"tweet_count": query_list[0] + '_mentions_without_retweet'}).iloc[1:len(counts_recent[0])-1].reset_index(drop=True)
df_count = pd.concat([df_count, df_temp], axis=1)
## check numeric columns type - fillna and convert to integer
df_count[query_list[0] + '_mentions'] = df_count[query_list[0] + '_mentions'].fillna(0).astype(int)
df_count[query_list[0] + '_mentions_without_retweet'] = df_count[query_list[0] + '_mentions_without_retweet'].fillna(0).astype(int)

## loop - get data about the top 4 candidates
for i in range(1, len(query_list)):
    
    ## all recent tweets mentions
    counts_recent = client.get_recent_tweets_count(query=query_list[i], granularity='hour')
    ## recent tweets mentions without retweets
    counts_recent_filtered = client.get_recent_tweets_count(query=query_list[i] + query_filter, granularity='hour')

    ## get tweet count with retweets
    df_temp = pd.DataFrame(counts_recent[0], columns=['tweet_count']).rename(columns={"tweet_count": query_list[i] + '_mentions'}).iloc[1:len(counts_recent[0])-1].reset_index(drop=True)
    df_count = pd.concat([df_count, df_temp], axis=1)
    
    ## get tweet count without retweets
    df_temp = pd.DataFrame(counts_recent_filtered[0], columns=['tweet_count']).rename(columns={"tweet_count": query_list[i] + '_mentions_without_retweet'}).iloc[1:len(counts_recent[0])-1].reset_index(drop=True)
    df_count = pd.concat([df_count, df_temp], axis=1)
    
    ## check numeric columns type - fillna and convert to integer
    df_count[query_list[i] + '_mentions'] = df_count[query_list[i] + '_mentions'].fillna(0).astype(int)
    df_count[query_list[i] + '_mentions_without_retweet'] = df_count[query_list[i] + '_mentions_without_retweet'].fillna(0).astype(int)

## final dataset
df_count

Unnamed: 0,start,start_date,start_time,end,end_date,end_time,@LulaOficial_mentions,@LulaOficial_mentions_without_retweet,@jairbolsonaro_mentions,@jairbolsonaro_mentions_without_retweet,@cirogomes_mentions,@cirogomes_mentions_without_retweet,@simonetebetbr_mentions,@simonetebetbr_mentions_without_retweet
0,2022-09-11 13:00:00-03:00,2022-09-11,13:00:00,2022-09-11 14:00:00-03:00,2022-09-11,14:00:00,7523,3749,10776,2560,1700,811,187,140
1,2022-09-11 14:00:00-03:00,2022-09-11,14:00:00,2022-09-11 15:00:00-03:00,2022-09-11,15:00:00,7191,3639,8604,2225,1498,812,231,172
2,2022-09-11 15:00:00-03:00,2022-09-11,15:00:00,2022-09-11 16:00:00-03:00,2022-09-11,16:00:00,7850,4084,7466,1834,1599,924,186,127
3,2022-09-11 16:00:00-03:00,2022-09-11,16:00:00,2022-09-11 17:00:00-03:00,2022-09-11,17:00:00,9526,4307,7808,1859,1933,1057,212,151
4,2022-09-11 17:00:00-03:00,2022-09-11,17:00:00,2022-09-11 18:00:00-03:00,2022-09-11,18:00:00,11663,5585,8519,1762,1659,960,221,162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,2022-09-18 07:00:00-03:00,2022-09-18,07:00:00,2022-09-18 08:00:00-03:00,2022-09-18,08:00:00,3474,1926,5434,1228,536,225,68,43
163,2022-09-18 08:00:00-03:00,2022-09-18,08:00:00,2022-09-18 09:00:00-03:00,2022-09-18,09:00:00,5009,2806,9611,2079,760,404,166,110
164,2022-09-18 09:00:00-03:00,2022-09-18,09:00:00,2022-09-18 10:00:00-03:00,2022-09-18,10:00:00,6142,3538,12522,3152,1296,560,109,64
165,2022-09-18 10:00:00-03:00,2022-09-18,10:00:00,2022-09-18 11:00:00-03:00,2022-09-18,11:00:00,8117,4511,12095,2997,1498,644,95,66


In [5]:
# PROFILE INFORMATION

# Replace with your own users id 
## @LulaOficial id = 2670726740
## @jairbolsonaro id = 128372940 
## @cirogomes id = 33374761
## @simonetebetbr id = 2508415207
user_id_list = [2670726740, 128372940, 33374761, 2508415207]

# fetching the user
user = api.get_user(user_id=user_id_list[0])
# creating dataframe
df_users = pd.DataFrame([datetime.today().strftime('%Y-%m-%d')], columns=['date'])
# fetching the statuses_attributes
df_users['screen_name'] = user.screen_name
df_users['followers'] = user.followers_count
df_users['following'] = user.friends_count
df_users['posts'] = user.statuses_count
df_users['lists'] = user.listed_count
df_users['likes'] = user.favourites_count

## loop - get data about the top 4 candidates
for i in range(1, len(user_id_list)):
    # fetching the user
    user = api.get_user(user_id=user_id_list[i])
    # fetching the statuses_attributes
    df_temp =  pd.DataFrame([datetime.today().strftime('%Y-%m-%d')], columns=['date'])
    # fetching the statuses_attributes
    df_temp['screen_name'] = user.screen_name
    df_temp['followers'] = user.followers_count
    df_temp['following'] = user.friends_count
    df_temp['posts'] = user.statuses_count
    df_temp['lists'] = user.listed_count
    df_temp['likes'] = user.favourites_count

    # adding the info about the new user
    df_users = pd.concat([df_users, df_temp])

df_users

Unnamed: 0,date,screen_name,followers,following,posts,lists,likes
0,2022-09-18,LulaOficial,4355557,2803,27026,4057,14979
0,2022-09-18,jairbolsonaro,8918819,497,14974,8405,5882
0,2022-09-18,cirogomes,1487689,415,12752,3277,5839
0,2022-09-18,simonetebetbr,365885,239,3052,851,12421


In [6]:
# Saving dataset
## mentions dataset
df_count.to_csv('datasets/profile_mentions.csv', index=False)
## user dataset
df_users.to_csv('datasets/profile_info.csv', index=False)
## last valid date and time
df_count.loc[len(df_count)-1:, ['end_date', 'end_time']].reset_index(drop=True).to_csv('datasets/last_valid_date.csv', index=False)