In [2]:
import tweepy
from secrets import bearer_token, consumer_key,consumer_secret,access_token,access_token_secret
import pandas as pd
import glob
import os
pd.options.display.max_columns = 200
from math import inf
# Shared Twitter API client used by get_tweets(). Credentials are passed by
# keyword so that the five string arguments cannot be silently transposed.
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
    wait_on_rate_limit=True,
)

In [3]:
def _page_to_frame(page):
    """Flatten one search response into a DataFrame with one row per tweet.

    Each row carries the tweet, its author, and the first tweet it references
    together with that tweet's author. Reads ``page.data``,
    ``page.includes["users"]`` and ``page.includes["tweets"]``.

    Raises whatever pandas raises (typically ``KeyError``) when the page has
    no data or lacks one of the expected columns / includes; the caller treats
    that as "skip this page".
    """
    # Tweets that reference nothing cannot be joined to a referenced tweet;
    # drop them up front instead of letting one of them fail the whole page.
    tweets = pd.DataFrame(page.data).dropna(subset=["referenced_tweets"])
    tweets = tweets.assign(
        referenced_tweets_type=tweets["referenced_tweets"].apply(lambda refs: refs[0].type),
        referenced_tweets_id=tweets["referenced_tweets"].apply(lambda refs: refs[0].id),
    )
    tweets = tweets[tweets.columns.intersection(
        ["author_id", "created_at", "id", "text",
         "referenced_tweets_type", "referenced_tweets_id"])]

    users = pd.DataFrame(page.includes["users"]).rename(columns={
        "id": "author_id",
        "created_at": "author_created_at",
        "name": "author_name",
        "username": "author_username",
    })
    users = users[users.columns.intersection(
        ["author_id", "author_created_at", "author_name", "author_username"])]

    # Referenced (original) tweets, enriched with their own author and then
    # renamed so they do not collide with the retweet's columns.
    referenced = pd.DataFrame(page.includes["tweets"]).merge(users, on="author_id")
    referenced = referenced.rename(columns={
        "id": "referenced_tweets_id",
        "created_at": "referenced_tweets_created_at",
        "author_id": "referenced_tweets_author_id",
        "text": "referenced_tweets_text",
        "author_created_at": "referenced_tweets_author_created_at",
        "author_name": "referenced_tweets_author_name",
        "author_username": "referenced_tweets_author_username",
    })
    referenced = referenced[referenced.columns.intersection(
        ["referenced_tweets_id", "referenced_tweets_created_at",
         "referenced_tweets_author_id", "referenced_tweets_text",
         "referenced_tweets_author_created_at", "referenced_tweets_author_name",
         "referenced_tweets_author_username"])]

    return (tweets
            .merge(users, on="author_id")
            .merge(referenced, on="referenced_tweets_id"))


def get_tweets(query,df,num_tweets=inf,start_token=""):
    """Page through ``client.search_all_tweets`` and collect the results.

    Args:
        query: Search query string passed to the API unchanged.
        df: Ignored. Kept only so existing calls keep working; the function
            always starts from an empty result set.
        num_tweets: Approximate upper bound on tweets to fetch. It is converted
            to a number of 500-tweet pages (rounded up), so the result may hold
            up to one page more than requested. ``inf`` fetches until the API
            reports no further pages.
        start_token: Pagination token to resume from; ``""`` starts at the
            first page.

    Returns:
        A DataFrame with every successfully parsed page concatenated
        (empty if no page could be parsed).

    Side effects:
        * Each parsed page is written to ``temp/<next_token>.csv`` (or
          ``temp/end.csv`` for the last page) so an interrupted run can be
          resumed with ``start_token`` and merged from disk.
        * ``retweets.csv`` is written only when the page budget derived from
          ``num_tweets`` is exhausted, not when the API runs out of pages.
    """
    from math import ceil

    max_results = 500  # page size requested from the API
    if num_tweets == inf:
        total_pages = inf
    else:
        # ceil, not round: round() turned small requests (e.g. 100) into 0 pages.
        total_pages = ceil(num_tweets / max_results)

    # Partial results go here; without the directory every page save would fail.
    os.makedirs("temp", exist_ok=True)

    search_kwargs = {
        "max_results": max_results,
        "tweet_fields": ["created_at", "author_id", "referenced_tweets"],
        "user_fields": ["id", "name", "username", "created_at"],
        "expansions": ["referenced_tweets.id", "author_id", "referenced_tweets.id.author_id"],
    }

    # Pages are collected in a list and concatenated once; growing a DataFrame
    # page by page is quadratic and DataFrame.append no longer exists in pandas 2.
    page_frames = []

    def _collected():
        if page_frames:
            return pd.concat(page_frames, ignore_index=True)
        return pd.DataFrame()

    token = start_token
    num_pages = 0

    while num_pages < total_pages:
        print(f"Page number {num_pages} starting")
        request_kwargs = dict(search_kwargs)
        if token:
            request_kwargs["next_token"] = token
        page = client.search_all_tweets(query, **request_kwargs)

        try:
            page_df = _page_to_frame(page)
            page_frames.append(page_df)
            if "next_token" in page.meta:
                token = page.meta["next_token"]
                page_df.to_csv(f"temp/{token}.csv")
                print(f"Page {num_pages} finished, next_token = {token}")
            else:
                print(f"Page {num_pages} finished. All tweets for this query collected.")
                page_df.to_csv(f"temp/end.csv")
                return _collected()
        except Exception as exc:
            # Best effort: a page that cannot be parsed or saved is skipped, but
            # the reason is reported and Ctrl-C is no longer swallowed.
            print(f"Page {num_pages} error: {exc!r}")
            if "next_token" in page.meta:
                token = page.meta["next_token"]
            else:
                print("No next_token column. All tweets for this query collected.")
                return _collected()

            print(f"Page {num_pages} failed, next_token = {token}. Skipping")
        num_pages+=1

    result = _collected()
    result.to_csv("retweets.csv")
    return result

In [4]:
# Collect up to ~4 million Portuguese-language retweets mentioning COVID.
# The cell ends with the bare frame so the notebook renders a preview.
df = pd.DataFrame()
df = get_tweets(
    "(covid OR coronavirus OR corona OR covid-19) lang:pt is:retweet",
    df,
    num_tweets=4_000_000,
)
df

Page number 0 starting
Page 0 finished, next_token = b26v89c19zqg8o3fpdv8qd8nho5el6xynlt9s9nnj3zwd
Page number 1 starting
Page 1 finished, next_token = b26v89c19zqg8o3fpdv8qd8n715jbmex4mphreldlyxz1
Page number 2 starting
Page 2 finished, next_token = b26v89c19zqg8o3fpdv8qd8chmt4ifznajcv1yfucpail
Page number 3 starting
Page 3 finished, next_token = b26v89c19zqg8o3fpdv8qd81qrc4pus17x9xhxsoy0mt9
Page number 4 starting
Page 4 finished, next_token = b26v89c19zqg8o3fpdv8qd7r2v6tul53ja4n8gduynlh9
Page number 5 starting
Page 5 finished, next_token = b26v89c19zqg8o3fpdv8qd7qtr0zrczkzg97jaxzdqqnx
Page number 6 starting
Page 6 finished, next_token = b26v89c19zqg8o3fpdv8qd7g5wl0rmttksd1c1lt1g29p
Page number 7 starting
Page 7 finished, next_token = b26v89c19zqg8o3fpdv8qd75gi8rcl10m95k0cij4vg59
Page number 8 starting
Page 8 finished, next_token = b26v89c19zqg8o3fpdv8qd6upla0yr7q40y2fq3dxenp9
Page number 9 starting
Page 9 finished, next_token = b26v89c19zqg8o3fpdv8qd6k07korzdv0ajk6li54tmyl
Page numbe

In [None]:
# Persist the in-memory result; the row index is written as the first column.
df.to_csv("retweets.csv", index=True)

In [15]:
# Merge all files in temp/ if necessary
# Sorted so the row order of the merged frame is reproducible between runs
# (glob returns files in arbitrary order).
all_files = sorted(glob.glob(os.path.join("temp/", "*.csv")))
if not all_files:
    raise FileNotFoundError("No CSV pages found in temp/; run get_tweets first")

# Generator: only one page is held in memory besides the growing result.
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)
# "Unnamed: 0" is the row index that to_csv wrote into each page file.
# errors="ignore" keeps the cell re-runnable if the pages were saved without it.
concatenated_df = concatenated_df.drop(columns="Unnamed: 0", errors="ignore")

In [16]:
# Display the merged frame as the cell output.
concatenated_df

Unnamed: 0,author_id,created_at,id,text,referenced_tweets_type,referenced_tweets_id,author_created_at,author_name,author_username,referenced_tweets_author_id,referenced_tweets_created_at,referenced_tweets_text,referenced_tweets_author_created_at,referenced_tweets_author_name,referenced_tweets_author_username
0,1417148229053988879,2021-10-07 00:12:23+00:00,1445904805852033026,RT @MarcosRogerio: O relator informou que colo...,retweeted,1445890361688313856,2021-07-19 15:44:25+00:00,EaS,EaS33130022,160895960,2021-10-06 23:15:00+00:00,O relator informou que colocará o presidente d...,2010-06-29 12:04:57+00:00,MARCOS ROGÉRIO,MarcosRogerio
1,1376715806235561984,2021-10-07 00:11:27+00:00,1445904569066786817,RT @MarcosRogerio: O relator informou que colo...,retweeted,1445890361688313856,2021-03-30 02:00:22+00:00,Filipe Vianna,_DireitaUnida,160895960,2021-10-06 23:15:00+00:00,O relator informou que colocará o presidente d...,2010-06-29 12:04:57+00:00,MARCOS ROGÉRIO,MarcosRogerio
2,1441084185653829644,2021-10-07 00:12:12+00:00,1445904758074654720,RT @MarcosRogerio: O relator informou que colo...,retweeted,1445890361688313856,2021-09-23 16:57:17+00:00,EmersonFernandez,Emerson79311981,160895960,2021-10-06 23:15:00+00:00,O relator informou que colocará o presidente d...,2010-06-29 12:04:57+00:00,MARCOS ROGÉRIO,MarcosRogerio
3,1386666107994583045,2021-10-07 00:11:20+00:00,1445904541019475969,RT @MarcosRogerio: O relator informou que colo...,retweeted,1445890361688313856,2021-04-26 12:59:10+00:00,Sérgio Morais,SergioMorais38,160895960,2021-10-06 23:15:00+00:00,O relator informou que colocará o presidente d...,2010-06-29 12:04:57+00:00,MARCOS ROGÉRIO,MarcosRogerio
4,423895213,2021-10-07 00:11:17+00:00,1445904527010369536,RT @MarcosRogerio: O relator informou que colo...,retweeted,1445890361688313856,2011-11-29 02:21:10+00:00,Edmir Silva,EdmirSilva1,160895960,2021-10-06 23:15:00+00:00,O relator informou que colocará o presidente d...,2010-06-29 12:04:57+00:00,MARCOS ROGÉRIO,MarcosRogerio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145702,2678455944,2021-10-07 00:00:08+00:00,1445901720026976256,"RT @GFiuza_Oficial: Mais uma: Sofia, 17 anos, ...",retweeted,1444772527084130304,2014-07-25 02:35:43+00:00,MANOEL MOURA ............................BRAZIL,MANOELMOURA1,2804949532,2021-10-03 21:13:07+00:00,"Mais uma: Sofia, 17 anos, estudante saudável, ...",2014-10-04 13:20:53+00:00,Guilherme Fiuza,GFiuza_Oficial
1145703,633330070,2021-10-07 00:00:05+00:00,1445901707251175427,"RT @monicabergamo: O ""kit Covid"", pacote de re...",retweeted,1445884696320266241,2012-07-12 00:23:54+00:00,Luciana de Oliveira,LubraulioDe,58951368,2021-10-06 22:52:29+00:00,"O ""kit Covid"", pacote de remédios ineficazes c...",2009-07-21 22:55:27+00:00,Mônica Bergamo,monicabergamo
1145704,1051634101390856193,2021-10-07 00:00:04+00:00,1445901705695088640,RT @simoneregazio: Meu amor vcs aplaudiram reu...,retweeted,1445883224710946820,2018-10-15 00:41:22+00:00,Lalo Proenca,LaloProenca,771574989195575302,2021-10-06 22:46:38+00:00,Meu amor vcs aplaudiram reunião com Amado Bat...,2016-09-02 05:06:27+00:00,Simone Regazio 🚩,simoneregazio
1145705,316918405,2021-10-07 00:00:02+00:00,1445901696979308550,RT @BarbeariaPoian: A Suécia interrompe o uso ...,retweeted,1445734811453247489,2011-06-14 04:48:00+00:00,Thiago🇧🇷 BOLSONARO.🔰 🪂 FORÇA E AÇÃO!,Thiago_CIR,2802207875,2021-10-06 12:56:54+00:00,A Suécia interrompe o uso de 💉💉💉 da covid par...,2014-10-03 16:22:22+00:00,Verdades e Nada Mais 🔸️,BarbeariaPoian


In [17]:
# Write the merged pages to disk; the row index is written as the first column.
concatenated_df.to_csv("retweets.csv", index=True)