In [None]:
# instala pacotes necessarios para o resto das operações
!pip install datetime ijson

In [None]:
import decimal
import pandas as pd
import ijson
import json
import datetime

# Location of the CSV that lists the sampled user ids.
user_sample_ids_path = '/content/drive/MyDrive/Colab Notebooks/Twibot22 - Dataset completo/user_sample_ids.csv'

# Read the sample and clean its ids in a single chained expression:
# strip the leading 'u' characters from every id and cast the rest to int.
sample_ids = pd.read_csv(user_sample_ids_path).assign(
    id=lambda frame: frame['id'].str.lstrip('u').astype(int)
)

## Gera amostra de tweets

In [None]:
# Path templates for the raw tweet shards and for the filtered output files;
# the `{}` placeholder receives the shard number.
input_file_template = '/content/drive/MyDrive/Colab Notebooks/Twibot22 - Dataset completo/tweet_{}.json'
output_file_template = '/content/drive/MyDrive/Colab Notebooks/Twibot22 - Dataset completo/sample_tweet_{}.json'

# Collect the cleaned sample ids into a set for constant-time membership tests.
print("Carregando author_ids do CSV...")
author_ids = {sampled_id for sampled_id in sample_ids['id']}
print(f"Total de author_ids carregados: {len(author_ids)}")

def _json_default(value):
    """Encode values that ``json`` cannot serialise natively.

    ijson yields ``decimal.Decimal`` for non-integer JSON numbers; those are
    written back as floats. Any other unsupported type is still rejected.
    """
    if isinstance(value, decimal.Decimal):
        return float(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _select_tweet_fields(item):
    """Return only the attributes of a raw tweet record that the analysis needs."""
    entities_obj = item.get('entities') or {}
    raw_id = item.get('id')
    return {
        # Tweet ids carry a leading 't'; leave a missing/non-string id untouched
        # instead of crashing on ``None.lstrip``.
        'id': raw_id.lstrip('t') if isinstance(raw_id, str) else raw_id,
        'text': item.get('text'),
        'author_id': str(item.get('author_id')),
        'created_at': item.get('created_at'),
        'referenced_tweets': item.get('referenced_tweets') or [],
        'entities': {
            'mentions': entities_obj.get('user_mentions') or []
        },
        'public_metrics': item.get('public_metrics') or {},
        'in_reply_to_user_id': item.get('in_reply_to_user_id')
    }


def filter_large_json_iteratively(input_file_path, output_file_path, author_ids):
    """Stream a large JSON array of tweets and save those written by sampled authors.

    Args:
        input_file_path: Path of a JSON file whose top-level value is an array
            of tweet objects.
        output_file_path: Path of the JSON file to (over)write with the kept
            tweets, reduced to the fields chosen by ``_select_tweet_fields``.
        author_ids: Container of author ids; a tweet is kept when its
            ``author_id`` value is a member of it.

    Returns:
        None. Progress and totals are printed; a serialisation error is
        printed rather than raised, as before.
    """
    print(f"Iniciando filtragem do arquivo JSON: {input_file_path}...")
    # ijson parses bytes and handles the UTF-8 decoding itself, so the input is
    # opened in binary mode rather than as decoded text.
    with open(input_file_path, 'rb') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:

        filtered_data = []
        item_count = 0

        # 'item' selects each element of the top-level array, one at a time.
        for item in ijson.items(input_file, 'item'):
            item_count += 1
            if item.get('author_id') in author_ids:
                filtered_data.append(_select_tweet_fields(item))

        print(f"Total de itens processados: {item_count}")
        print(f"Total de itens filtrados: {len(filtered_data)}")

        # Show one filtered record as a sanity check.
        if filtered_data:
            print(f"Exemplo de dado filtrado: {filtered_data[0]}")
        else:
            print("Nenhum dado filtrado disponível para mostrar.")

        # Persist the filtered records as a single JSON array.
        try:
            print("Tentando serializar os dados filtrados para JSON...")
            json_string = json.dumps(
                filtered_data,
                ensure_ascii=False,
                indent=4,
                # Without this, one Decimal anywhere in the data aborts the dump
                # and leaves the output file empty.
                default=_json_default,
            )
            output_file.write(json_string)
            print(f"Dados filtrados gravados em {output_file_path}")
        except (TypeError, ValueError) as e:
            print(f"Erro ao serializar dados para JSON: {e}")

# Timestamp the start so the duration of the filtering pass can be read from the log.
inicio = datetime.datetime.now()
print(f"Horário de início: {inicio}")

# Walk the nine shards, tweet_0.json through tweet_8.json.
for i in range(9):
    print(f"\nProcessando arquivo {i}...")
    input_file_path, output_file_path = (
        input_file_template.format(i),
        output_file_template.format(i),
    )
    filter_large_json_iteratively(input_file_path, output_file_path, author_ids)
    print(f"Arquivo {i} processado e salvo em {output_file_path}")

# Timestamp the end of the pass.
fim = datetime.datetime.now()
print(f"Horário de fim: {fim}")

## Transforma amostra de tweets para o formato esperado pelo sistema

In [None]:
def _normalize_user_id(value) -> str:
    """Return a user id as a plain string so ids of different types compare equal.

    ``123``, ``'123'`` and ``'u123'`` all become ``'123'`` (the sample id file
    loaded earlier in this notebook uses the ``'u'`` prefix).
    """
    return str(value).lstrip('u')


def twitter_tweet_to_tweet(twitter_tweet: dict) -> dict:
    """Convert a filtered tweet record into the flat format expected by the system.

    Args:
        twitter_tweet: Dict with the required keys ``id``, ``text``,
            ``author_id`` and ``created_at``; ``referenced_tweets``,
            ``entities``, ``public_metrics`` and ``in_reply_to_user_id`` are
            optional and may be ``None``.

    Returns:
        Dict with the keys ``id``, ``isReply``, ``isRetweet``, ``mentions``,
        ``text``, ``authorId``, ``nRetweet``, ``nLike``, ``nReply``, ``nQuote``
        and ``tweetCreatedAt``.

    Raises:
        KeyError: If a required key is missing, or a mention lacks ``id`` or
            ``screen_name``.
    """
    tweet_id = twitter_tweet['id']
    text = twitter_tweet['text']
    author_id = twitter_tweet['author_id']
    created_at = twitter_tweet['created_at']
    referenced_tweets = twitter_tweet.get('referenced_tweets') or []
    entities = twitter_tweet.get('entities') or {}
    public_metrics = twitter_tweet.get('public_metrics') or {}
    in_reply_to_user_id = twitter_tweet.get('in_reply_to_user_id')

    mentions = [
        {'id': mention['id'], 'username': mention['screen_name']}
        for mention in entities.get('mentions') or []
    ]

    # The filtering step stores author_id as a string but leaves
    # in_reply_to_user_id as parsed, so compare normalised ids; a raw `!=`
    # between int and str is always True and marks self-replies as replies.
    is_reply = (
        in_reply_to_user_id is not None
        and _normalize_user_id(in_reply_to_user_id) != _normalize_user_id(author_id)
    )

    if referenced_tweets:
        is_retweet = any(
            ref.get('type') == 'retweeted' for ref in referenced_tweets if ref
        )
    else:
        # Fallback heuristic when no references are present: retweet text
        # starts with "RT @user". A bare "RT" prefix would also match ordinary
        # words such as "RTX"; missing text counts as not a retweet.
        is_retweet = (text or '').startswith('RT @')

    return {
        'id': tweet_id,
        'isReply': is_reply,
        'isRetweet': is_retweet,
        'mentions': mentions,
        'text': text,
        'authorId': author_id,
        'nRetweet': public_metrics.get('retweet_count', 0),
        'nLike': public_metrics.get('like_count', 0),
        'nReply': public_metrics.get('reply_count', 0),
        'nQuote': public_metrics.get('quote_count', 0),
        'tweetCreatedAt': created_at
    }

def process_tweets(n_files=9, base_dir='/content/drive/MyDrive/Colab Notebooks/Twibot22 - Dataset completo'):
    """Convert every filtered tweet shard to the system format.

    For each shard number ``i`` in ``range(n_files)``, reads
    ``{base_dir}/sample_tweet_{i}.json``, converts each record with
    ``twitter_tweet_to_tweet`` and writes the result to
    ``{base_dir}/parsed_tweets_{i}.json``.

    Args:
        n_files: Number of shards to process, starting at 0.
        base_dir: Directory that holds the input shards and receives the output.

    Returns:
        None.
    """
    for i in range(n_files):
        file_path = f'{base_dir}/sample_tweet_{i}.json'
        output_file_path = f'{base_dir}/parsed_tweets_{i}.json'
        print(f"Lendo arquivo: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            tweets = json.load(file)
        processed_tweets = [twitter_tweet_to_tweet(tweet) for tweet in tweets]
        print(f"Escrevendo tweets processados em: {output_file_path}")
        # Overwrite rather than append: appending a second JSON array on a
        # re-run would leave a file that is no longer valid JSON.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            json.dump(processed_tweets, output_file, ensure_ascii=False, indent=4)
        print("Tweets processados foram adicionados com sucesso.")

# Record wall-clock timestamps immediately before and after the conversion
# so the elapsed time can be read from the printed output.
inicio = datetime.datetime.now()
print(f"Horário de início: {inicio}")
process_tweets()
fim = datetime.datetime.now()
print(f"Horário de fim: {fim}")