In [2]:
import gzip, bz2
import os, json
import shutil
import dask
from dask import delayed
from dask.distributed import Client
import polars as pl
import time
import re
from datetime import datetime

## 1 Fonte dos arquivos

Iremos extrair os arquivos de tweets coletados do Twitter entre 2012 e 2022, disponíveis na página do *[Internet Archive](https://archive.org)*.

A equipe responsável pela coleta dos dados é *[The Twitter Stream Grab](https://archive.org/details/twitterstream)*, e o projeto chamava-se ***Twitter Stream***.

> [Python Tweepy – Getting the ID of a status](https://www.geeksforgeeks.org/python-tweepy-getting-the-id-of-a-status/?ref=ml_lbp)
>
> [Python – Status object in Tweepy](https://www.geeksforgeeks.org/python-status-object-in-tweepy/?ref=header_search)
>
> [Python Tweepy – Getting the number of times a tweet has been retweeted](https://www.geeksforgeeks.org/python-tweepy-getting-the-number-of-times-a-tweet-has-been-retweeted/)
>
> [Python Tweepy – Getting the language of a tweet](https://www.geeksforgeeks.org/python-tweepy-getting-the-language-of-a-tweet/?ref=ml_lbp)
>
> [X entities](https://developer.x.com/en/docs/twitter-api/enterprise/data-dictionary/native-enriched-objects/entities#entitiesobject)
>
> [Post Object](https://developer.x.com/en/docs/twitter-api/enterprise/data-dictionary/native-enriched-objects/tweet)
>
> [Understanding the new Tweet payload](https://developer.x.com/en/blog/product-news/2020/understanding-the-new-post-payload)
>
> [Twitter Sentiment Analysis by Python | best NLP model 2022](https://youtu.be/uPKnSq6TaAk)

## 2 Dask para paralelizar a extração dos arquivos

In [3]:
# Start a local Dask cluster with 4 workers of 4 threads each (16 threads in
# total). Leaving `client` as the last expression renders its summary below.
client = Client(threads_per_worker=4, n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 15.28 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:63070,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 15.28 GiB

0,1
Comm: tcp://127.0.0.1:63089,Total threads: 4
Dashboard: http://127.0.0.1:63091/status,Memory: 3.82 GiB
Nanny: tcp://127.0.0.1:63073,
Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-ptro8jrv,Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-ptro8jrv

0,1
Comm: tcp://127.0.0.1:63093,Total threads: 4
Dashboard: http://127.0.0.1:63097/status,Memory: 3.82 GiB
Nanny: tcp://127.0.0.1:63074,
Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-2fh8ypad,Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-2fh8ypad

0,1
Comm: tcp://127.0.0.1:63090,Total threads: 4
Dashboard: http://127.0.0.1:63094/status,Memory: 3.82 GiB
Nanny: tcp://127.0.0.1:63075,
Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-lf3bxnhi,Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-lf3bxnhi

0,1
Comm: tcp://127.0.0.1:63096,Total threads: 4
Dashboard: http://127.0.0.1:63099/status,Memory: 3.82 GiB
Nanny: tcp://127.0.0.1:63076,
Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-bcrn8rty,Local directory: C:\Users\S78886~1\AppData\Local\Temp\dask-scratch-space\worker-bcrn8rty


## 3 Lista de funções que vamos usar

#### Salva os tweets no formato parquet

In [4]:
def save_parquet(tweets_br, dir, file_name):
    """Write a list of row dicts to ``<prefix>_<file_name>.parquet``.

    The file is written next to the day directory, i.e. into the parent of
    ``dir``. The location is derived from ``dir`` itself rather than from a
    notebook-level global, so the function no longer depends on which cell
    ran last.

    Parameters
    ----------
    tweets_br : list[dict]
        Rows to store; every dict becomes one row.
    dir : str
        Day directory the rows were extracted from (the name shadows the
        builtin but is kept for existing callers).
    file_name : str
        Table suffix used in the file name, e.g. ``"tweets"``.

    Returns
    -------
    None
        Nothing is written when ``tweets_br`` is empty; a notice is printed
        instead.
    """
    day_dir = os.path.normpath(dir)
    # Historical naming: concatenate every backslash-separated component after
    # the first one. When the path has no backslash (POSIX, or forward slashes
    # only) that yields an empty prefix, so fall back to the directory name.
    prefix = "".join(dir.split("\\")[1:]) or os.path.basename(day_dir)
    parquet = prefix + f"_{file_name}.parquet"

    if not tweets_br:
        # polars cannot infer a schema from an empty list of dicts.
        print("++   Parquet {:s} não gerado (sem registros)".format(parquet))
        return

    # infer_schema_length=None makes polars scan every row before fixing dtypes.
    df_tweets = pl.from_dicts(tweets_br, infer_schema_length=None)
    df_tweets.write_parquet(os.path.join(os.path.dirname(day_dir), parquet))
    print("++   Parquet {:s} salvo".format(parquet))

#### Extração dos tweets de um arquivo compactado

In [5]:
def _parse_count(tweet, key):
    """Return ``tweet[key]`` as an int built from its digits; 0 when absent or empty."""
    digits = re.sub(r"\D", "", str(tweet.get(key, "0")))
    return int(digits) if digits else 0


def _lat_lon(tweet):
    """Return ``(latitude, longitude)`` of a tweet, or ``(None, None)`` without a point."""
    point = tweet.get("coordinates")
    if point is None:
        return None, None
    # GeoJSON points are ordered [longitude, latitude].
    longitude, latitude = point["coordinates"][:2]
    return latitude, longitude


def _tweet_row(tweet):
    """Build the row stored in the ``tweets`` table for one tweet object."""
    latitude, longitude = _lat_lon(tweet)
    return {"id_tweet": tweet["id"],
            "dt_criacao_tweet": tweet["created_at"],
            "tx_tweet": tweet["text"],
            "id_twt_replied": tweet["in_reply_to_status_id"],
            "id_usr_replied": tweet["in_reply_to_user_id"],
            "no_lingua": tweet["lang"],
            "latitude": latitude,
            "longitude": longitude}


def _user_row(user):
    """Build the row stored in the ``usuarios`` table for one user object."""
    return {"id_usuario": user["id"],
            "dt_criacao_usr": user["created_at"],
            "nm_usuario": user["name"]}


def _status_row(tweet, dt_status, user_lang):
    """Build the counters/author snapshot row for ``tweet`` observed at ``dt_status``."""
    user = tweet["user"]
    return {"id_tweet": tweet["id"],
            "dt_status": dt_status,
            "no_citacoes": _parse_count(tweet, "quote_count"),
            "no_retweets": _parse_count(tweet, "retweet_count"),
            "no_curtidas": _parse_count(tweet, "favorite_count"),
            "id_usuario": user["id"],
            "nm_usuario": user["name"],
            "nm_tela_usr": user["screen_name"],
            "no_seguidores_usr": user["followers_count"],
            "no_amigos_usr": user["friends_count"],
            "no_listas_usr": user["listed_count"],
            "no_curtidas_usr": user["favourites_count"],
            "no_total_twt_usr": user["statuses_count"],
            "no_lingua_usr": user_lang,
            "local_usr": user["location"]}


def _hashtag_rows(tweet):
    """Return one row per hashtag entity of ``tweet``."""
    return [{"id_tweet": tweet["id"], "nm_hashtag": hashtag["text"]}
            for hashtag in tweet["entities"]["hashtags"]]


def _mention_rows(tweet):
    """Return one row per user-mention entity of ``tweet``."""
    return [{"id_tweet": tweet["id"],
             "id_usuario": mention["id"],
             "nm_usuario": mention["name"],
             "nm_tela_usr": mention["screen_name"]}
            for mention in tweet["entities"]["user_mentions"]]


def extract_tweets(zip):
    """Extract Portuguese tweets from one compressed JSON-lines archive.

    Every non-blank line of the archive is parsed as JSON. Lines carrying a
    ``"delete"`` key and lines whose ``"lang"`` is not ``"pt"`` are skipped.
    For each remaining tweet, rows are produced for the tweet itself and,
    when it has a ``"retweeted_status"``, for the retweeted tweet as well.

    Parameters
    ----------
    zip : str
        Path of the archive. A ``.bz2`` extension is opened with ``bz2``;
        anything else is opened with ``gzip``. (The name shadows the builtin
        but is kept for existing callers.)

    Returns
    -------
    tuple of seven lists of dicts
        ``(tweets, usuarios, apagados, status_tweets, retweets, hashtags,
        mencoes)``. ``apagados`` is always empty because delete notices are
        skipped. If anything fails while reading or parsing the archive, the
        failure is printed together with the file name and seven empty lists
        are returned (best effort: one bad file must not stop the batch).
    """
    opener = bz2.open if zip.split(".")[-1] == "bz2" else gzip.open
    tweets, usuarios, apagados, status_tweets, retweets, hashtags, mencoes = [], [], [], [], [], [], []

    try:
        with opener(filename=zip, mode="rt", encoding="utf8") as f_input:
            # Stream the file instead of materialising every parsed line first.
            for raw in f_input:
                if not raw.strip():
                    continue  # blank separator lines are not JSON documents
                line = json.loads(raw)
                if "delete" in line or line.get("lang") != "pt":
                    continue

                tweets.append(_tweet_row(line))
                usuarios.append(_user_row(line["user"]))
                # The user language may be missing; store None in that case.
                status_tweets.append(
                    _status_row(line, line["created_at"], line["user"].get("lang")))
                hashtags.extend(_hashtag_rows(line))
                mencoes.extend(_mention_rows(line))

                if "retweeted_status" not in line:
                    continue

                original = line["retweeted_status"]
                tweets.append(_tweet_row(original))
                usuarios.append(_user_row(original["user"]))
                # The counters of the retweeted tweet are dated with the
                # creation time of the retweet that carried them; a missing
                # or null user language is stored as an empty string here.
                status_tweets.append(
                    _status_row(original, line["created_at"],
                                str(original["user"].get("lang") or "")))
                retweets.append({"id_tweet": line["id"],
                                 "id_twt_retuitado": original["id"]})
                hashtags.extend(_hashtag_rows(original))
                mencoes.extend(_mention_rows(original))
    except Exception as e:
        # Deliberate best effort, but say which archive failed and how.
        print("Falha ao processar {}: {!r}".format(zip, e))
        return [], [], [], [], [], [], []

    return tweets, usuarios, apagados, status_tweets, retweets, hashtags, mencoes

In [6]:
# Benchmark fixture kept as source text: a variant of extract_tweets that
# loads every parsed JSON line into a list and then prints the coordinates of
# retweeted statuses for tweets (or users) whose language is "pt". Its error
# handling is commented out inside the snippet. In this notebook the string is
# only referenced as the `setup` argument of timeit calls that are currently
# commented out.
setup_v1='''
import gzip, bz2
import os, json
import tqdm
def extract_tweets(zip):
    f_ext = zip.split(".")[-1]
#    try:
    with (bz2.open if f_ext=="bz2" else gzip.open)(filename=zip, mode="rt", encoding="utf8") as f_input:    
        lines = list(map(json.loads,f_input))
        f_input.close() 
    for line in lines:
        if "delete" not in line:
            if (("lang" in line and line["lang"]=="pt") or ("lang" in line["user"] and line["user"]["lang"]=="pt")) and \
            ("retweeted_status" in line and line["retweeted_status"]["coordinates"]!=None):
                print(line["retweeted_status"]["coordinates"])
                #print(line["retweeted_status"]["coordinates"])
#    except Exception as e:
#        print(e)
'''


In [7]:
#dir = "Dados/2022/05/16/"
#zip_files = [os.path.join(path, name) for path, subdirs, files in os.walk(dir) for name in files if name.endswith(tuple([".gz",".bz2"]))]
#for zip in tqdm.tqdm(zip_files):
#    extract_tweets(zip)

In [8]:
# Benchmark fixture kept as source text: a variant of extract_tweets that
# parses the archive one line at a time, appending each JSON document to
# `dados`, and prints any exception instead of raising it. Like setup_v1, it
# is only referenced by timeit calls that are currently commented out.
setup_v2='''
import gzip, bz2
import os, json

def extract_tweets(zip):
    f_ext = zip.split(".")[-1]
    dados = []
    try:
        with (bz2.open if f_ext=="bz2" else gzip.open)(filename=zip, mode="rt", encoding="utf8") as f_input:    
            for i, line in enumerate(f_input):
                dados.append(json.loads(line))
            f_input.close()
        #print(len(dados))
    except Exception as e:
        print(e)
'''

In [9]:
#extract_tweets_v1("Dados/2022/04/10/20220410000000.json.gz")

In [10]:
#import timeit
#import numpy as np

In [11]:
#timeit.timeit(stmt="extract_tweets('Dados/2022/04/10/20220410000000.json.gz')", setup=setup_v1, number=20)
#timeit.timeit(stmt="extract_tweets('Dados/2022/04/10/20220410000000.json.gz')", setup=setup_v2, number=20)

#t = timeit.Timer(stmt="extract_tweets('Dados/2022/04/10/20220410000000.json.gz')", setup=setup_v1)
#total = t.repeat(5, number=5)
#print(np.mean( total ))
#t = timeit.Timer(stmt="extract_tweets('Dados/2022/04/10/20220410000000.json.gz')", setup=setup_v2)
#total = t.repeat(5, number=5)
#print(np.mean( total ))


### Descompactação, busca de tweets BR e salvamento em .parquet

In [12]:
#tweets, usuarios, apagados, status_tweets, retweets, hashtags, mencoes = extract_tweets('Dados\\2022\\01\\01\\20220101000000.json.gz')

In [None]:
# Convert every day directory found under each month root into one parquet
# file per table. The archives of a day are extracted in parallel with Dask;
# the day directory is removed only after at least one table was saved, so a
# day without archives (or whose extraction yielded nothing) is never deleted.

ARCHIVE_SUFFIXES = (".gz", ".bz2")
# Parquet suffix of each table, in the order extract_tweets returns them.
TABLES = ("tweets", "usuarios", "apagados", "status", "retwts", "hashtags", "mencoes")
# Collected for completeness but intentionally not written to disk.
SKIPPED_TABLES = {"apagados"}


def _format_elapsed(total_seconds):
    """Render a duration given in seconds as ``HH:MM:SS.ss``."""
    hours, rem = divmod(total_seconds, 3600)
    minutes, seconds = divmod(rem, 60)
    return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)


roots = ["Dados/2018/01"]
# NOTE: the loop variable must stay named `root`; the original save_parquet
# reads it as a global to decide where the parquet files go.
for root in roots:
    day_dirs = sorted(
        os.path.join(root, entry)
        for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )

    start_for = time.time()
    for day_dir in day_dirs:
        start = time.time()
        print("+++++++++++++++++++++++++++++++++++++++++++++")
        print("++ Diretório => {:s}".format(day_dir))
        print("++   Início => {:s}".format(datetime.now().strftime('%d/%m/%Y %H:%M:%S')))

        zip_files = [
            os.path.join(path, name)
            for path, _subdirs, files in os.walk(day_dir)
            for name in files
            if name.endswith(ARCHIVE_SUFFIXES)
        ]
        print("++   Total de arquivos zip => {:d}".format(len(zip_files)))

        saved_any = False
        if zip_files:
            # One delayed task per archive, all computed in a single call.
            outs = dask.compute(*[delayed(extract_tweets)(archive) for archive in zip_files])

            tables = {name: [] for name in TABLES}
            for out in outs:
                for index, name in enumerate(TABLES):
                    tables[name].extend(out[index])
            del outs  # release the per-file results before writing

            for name in TABLES:
                if name in SKIPPED_TABLES:
                    continue
                rows = tables[name]
                if rows:
                    save_parquet(rows, day_dir, name)
                    saved_any = True
                else:
                    # An empty table has no schema to infer; skip it.
                    print("++   Sem registros para {:s}; parquet não gerado".format(name))
            del tables  # release the merged rows before the next day

        if saved_any:
            # Remove the day directory only once its data is safely converted.
            shutil.rmtree(day_dir)
        else:
            print("++   Nada foi salvo; diretório mantido")

        print("++   Tempo do passo        => {:s}".format(_format_elapsed(time.time() - start)))
        print("+++++++++++++++++++++++++++++++++++++++++++++")

    print("Tempo total => {:s}".format(_format_elapsed(time.time() - start_for)))

+++++++++++++++++++++++++++++++++++++++++++++
++ Diretório => Dados/2018/01\01
++   Início => 23/10/2024 07:11:50
++   Total de arquivos zip => 690
++   Parquet 01_tweets.parquet salvo
++   Parquet 01_usuarios.parquet salvo
++   Parquet 01_status.parquet salvo
++   Parquet 01_retwts.parquet salvo
++   Parquet 01_hashtags.parquet salvo
++   Parquet 01_mencoes.parquet salvo
++   Tempo do passo        => 00:03:44.27
+++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++
++ Diretório => Dados/2018/01\02
++   Início => 23/10/2024 07:15:34
++   Total de arquivos zip => 1440
++   Parquet 02_tweets.parquet salvo
++   Parquet 02_usuarios.parquet salvo
++   Parquet 02_status.parquet salvo
++   Parquet 02_retwts.parquet salvo
++   Parquet 02_hashtags.parquet salvo
++   Parquet 02_mencoes.parquet salvo
++   Tempo do passo        => 00:08:33.30
+++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++
++ Diretório => Dados/2018/

2024-10-23 09:30:44,316 - distributed.nanny - ERROR - Nanny failed to start process
Traceback (most recent call last):
  File "C:\Users\s788865623\AppData\Local\anaconda3\lib\site-packages\distributed\nanny.py", line 720, in start
    await self.process.start()
  File "C:\Users\s788865623\AppData\Local\anaconda3\lib\site-packages\distributed\process.py", line 55, in _call_and_set_future
    res = func(*args, **kwargs)
  File "C:\Users\s788865623\AppData\Local\anaconda3\lib\site-packages\distributed\process.py", line 215, in _start
    process.start()
  File "C:\Users\s788865623\AppData\Local\anaconda3\lib\multiprocessing\process.py", line 121, in start
    self._popen = self._Popen(self)
  File "C:\Users\s788865623\AppData\Local\anaconda3\lib\multiprocessing\context.py", line 336, in _Popen
    return Popen(process_obj)
  File "C:\Users\s788865623\AppData\Local\anaconda3\lib\multiprocessing\popen_spawn_win32.py", line 73, in __init__
    hp, ht, pid, tid = _winapi.CreateProcess(
Permis

In [None]:
#tweets

In [None]:
# Close the Dask client created at the top of the notebook.
client.close()

In [None]:
#save_parquet(tweets, dir, "tweets")
#df_tweets = pl.from_dicts(tweets, strict=False)
#parquet = "".join(dir.split("\\")[1:]) + f"_{file_name}.parquet"
#df_tweets.write_parquet(root + "\\" + parquet)
#print("++   Parquet {:s} salvo".format(parquet))

In [None]:
#import timeit
#import numpy as np
#t = timeit.Timer(stmt='for hashtag in s: '\
#    'hashtags.append({"nm_hashtag":hashtag})', setup='s=["a","b","c"]; hashtags=[]')
#total = t.repeat(10)
#print(np.mean( total ))
#t = timeit.Timer(stmt='hashtags += [{"nm_hashtag": hashtag} for hashtag in s]',setup='s=["a","b","c"]; hashtags=[]')
#total = t.repeat(10)
#print(np.mean( total ))

# FIM