In [217]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from pymongo import MongoClient
from datetime import datetime
import pandas as pd

In [218]:
client = MongoClient()
db = client["twits_db"]
db.list_collection_names()
col = db["db.5e8fc63ddd8efca7e56c3215"]

- Filtrar por fechas
- Extraer columnas de interés
- RTs: extraer original si no fue procesado antes

In [219]:
start_ts = datetime.fromisoformat('2020-04-12T00:00:00+00:00')
end_ts =  datetime.fromisoformat('2020-04-19T00:00:00+00:00')

In [241]:
# Convert string to datetime in the aggregation pipeline and apply projection
pipeline = [        
    {
        '$project': {
            "id": 1,
            "user.id": 1,
            "text": 1,
            "truncated": 1,
            'extended_tweet.full_text': 1,
            "in_reply_to_status_id": 1,
            "in_reply_to_user_id": 1,
            "_id": 0,
            'created_at': {
                '$dateFromString': {
                    'dateString': '$created_at'
                }
            },
            "retweeted_status.id": 1,
            "retweeted_status.user.id": 1,
            "retweeted_status.text": 1,
            "retweeted_status.extended_tweet.full_text": 1,
        }
    },
    {
        "$match": {
            'created_at': {
                '$gte': start_ts,
                '$lte': end_ts
            },
        }
    }
]

In [242]:
# Batch settings
batch_size = 1000000  # Define the size of each batch
batch_number = 1  # Counter for batch files

# Batch processing
batch = []

def save_batch(batch, batch_number):
    # Convert the batch to a DataFrame and save as CSV (or use csv.writer for raw CSV)
    df = pd.DataFrame(batch)
    for f in ["reply_to_tweet_id", "reply_to_user_id", "retweet_tweet_id", "retweet_user_id"]:
        df[f] = df[f].astype("Int64")        
    df.to_csv(f'data/tweets_batch_{batch_number:03d}.tsv', index=False, sep='\t')
    print(f"Saved batch {batch_number:03d}")

for d in col.aggregate(pipeline):
    row = {
        "tweet_id": d["id"],
        "user_id": d["user"]["id"],        
        "created_at": d["created_at"],
        "full_text": d["text"],
        "reply_to_tweet_id": d["in_reply_to_status_id"],
        "reply_to_user_id": d["in_reply_to_user_id"],
    }
    if "extended_tweet" in d:
        row["full_text"] = d["extended_tweet"]["full_text"]
    if "retweeted_status" in d:
        dr = d["retweeted_status"]
        row |= {
            "retweet_tweet_id": dr["id"],
            "retweet_user_id": dr["user"]["id"],            
        }
        if "extended_tweet" in dr:
            row["full_text"] = dr["extended_tweet"]["full_text"]
        else:
            row["full_text"] = dr["text"]

    batch.append(row)
    # Save the batch to CSV when it reaches the batch_size
    if len(batch) == batch_size:
        save_batch(batch, batch_number)
        
        # Reset the batch and increment the batch number
        batch = []
        batch_number += 1

# Save the final batch if it's not empty
if batch:
    save_batch(batch, batch_number)

print(f"Processed {batch_number} batches.")
    

Saved batch 001 batches.
Saved batch 002 batches.
Saved batch 003 batches.
Saved batch 004 batches.
Saved batch 005 batches.
Saved batch 006 batches.
Saved batch 007 batches.
Processed 7 batches.


In [249]:
chunk_size = 10000  # Adjust the chunk size as needed
all_chunks = []

for i in range(1,8):
    chunks = pd.read_csv(f'data/tweets_batch_00{i}.tsv', sep='\t',
                         engine='python',
                         on_bad_lines='skip',
                         chunksize=chunk_size)
    all_chunks += chunks

df = pd.concat(all_chunks, ignore_index=True)

for f in ["user_id", "reply_to_tweet_id", "reply_to_user_id", "retweet_tweet_id", "retweet_user_id"]:
    df[f] = df[f].astype("Int64")

In [250]:
df

Unnamed: 0,tweet_id,user_id,created_at,full_text,reply_to_tweet_id,reply_to_user_id,retweet_tweet_id,retweet_user_id
0,1249125074495512576,415914708,2020-04-12 00:00:00,Panorama en Quintana Roo \n\nHasta las 18 hora...,,,1249114191325257728,814504845935788032
1,1249125074541649920,825227495242862592,2020-04-12 00:00:00,@CamiFerrce @Pau_lin_a Prefiero el COVID 19-20...,1248608951341465600,1112168942,,
2,1249125075179016192,1047598652510937088,2020-04-12 00:00:00,App de COVID-19 pudo ser un mero sitio web inf...,,,,
3,1249125075195793408,210950346,2020-04-12 00:00:00,Las pruebas rápidas de #Covid19 podrían ayudar...,,,,
4,1249125075166662658,888404662038261760,2020-04-12 00:00:00,#QueNoSeTePase Médicos del IMSS de Villa Alta ...,,,,
...,...,...,...,...,...,...,...,...
6166213,1251661747859214337,772099708500185088,2020-04-18 23:59:50,Me sumo!,,,1251651117580201984,733971169729187840
6166214,1251661748173758465,1332565328,2020-04-18 23:59:50,🚨ÚLTIMA HORA: Washington Post confirma origen ...,,,1251241300512378880,1163463729226428416
6166215,1251661748219686913,782058499798818816,2020-04-18 23:59:50,En Monterrey han iniciado un ensayo clínico al...,,,1251657496105713664,316273207
6166216,1251661748370763777,191983088,2020-04-18 23:59:50,"A nadie le importa que el covid les vaya a ""ar...",,,1251572197694550016,86802425


### How many replies?

In [255]:
(~df.reply_to_tweet_id.isna()).mean()

0.034609058583397474

In [254]:
(~df.reply_to_tweet_id.isna()).sum()

213407

In [256]:
replied_in_df = df.reply_to_tweet_id.isin(df.tweet_id)

In [257]:
replied_in_df.mean()

0.0012002494884222387

In [258]:
replied_in_df.sum()

7401

In [274]:
dfr = df[df.tweet_id.isin(df[replied_in_df].tweet_id)]
# Exclude self-replies (threads)
dfr = dfr[dfr.user_id != dfr.reply_to_user_id]

In [275]:
dfr = pd.merge(left=dfr.sample(10),
               right=df,
               left_on="reply_to_tweet_id", right_on="tweet_id",
               how='left')

In [276]:
dfr

Unnamed: 0,tweet_id_x,user_id_x,created_at_x,full_text_x,reply_to_tweet_id_x,reply_to_user_id_x,retweet_tweet_id_x,retweet_user_id_x,tweet_id_y,user_id_y,created_at_y,full_text_y,reply_to_tweet_id_y,reply_to_user_id_y,retweet_tweet_id_y,retweet_user_id_y
0,1249941878499414016,321385131,2020-04-14 06:05:41,@greciagirlany Maldito covid opino lo mismo qu...,1249892652096487424,1185577762704117504,,,1249892652096487424,1185577762704117760,2020-04-14 02:50:04,Con dedicatoria al covid https://t.co/lsNNNZaWVd,,,,
1,1249468821397737472,142093052,2020-04-12 22:45:55,@atilioboron @AliciaCastroAR @alferdez @alferd...,1249441130418180096,203555695,,,1249441130418180096,203555695,2020-04-12 20:55:53,Se entiende ahora por qué la restricción a la ...,,,,
2,1251661321973764098,1221567289302142976,2020-04-18 23:58:08,@JuanOrlandoH Nos unimos en agradecimiento Pre...,1251659347236659200,58244743,,,1251659347236659200,58244743,2020-04-18 23:50:17,Agradezco las muestras de amistad entre🇭🇳🇺🇸 a ...,,,,
3,1250868574551445504,68576031,2020-04-16 19:28:02,@AdrianVL1982 @CiudadanosCs Ya nos dejaron atr...,1250855626923786240,763382833,,,1250855626923786240,763382833,2020-04-16 18:36:35,"Cs insta a Planas a ejecutar ""urgentemente"" la...",,,,
4,1251125089053048834,2233387179,2020-04-17 12:27:20,@lacuarta Me paso x la Raja ese Bono covid19.....,1250947066613956608,3223771,,,1250947066613956608,3223771,2020-04-17 00:39:56,Dinero comenzará a ser entregado esta semana.\...,,,,
5,1251661452244443137,265758740,2020-04-18 23:58:39,@lasillarota POR SI FUERA POCO LOS PANISTAS TR...,1251657260477997056,152358615,,,1251657260477997056,152358615,2020-04-18 23:42:00,Los extraños números de Hugo López-Gatell sobr...,,,,
6,1251659175463133186,1244749454865031168,2020-04-18 23:49:36,@PLDenlinea @JCesarValentin Gracias por seguir...,1251513318042451968,176455570,,,1251513318042451968,176455570,2020-04-18 14:10:01,"El Ministro de Salud Pública, Rafael Sánchez C...",,,,
7,1250456285063061504,584323617,2020-04-15 16:09:45,@ciper El Covid19 es Ateo.\n\nExplicación evan...,1250412296565862400,33790745,,,1250412296565862400,33790745,2020-04-15 13:14:57,"Fallece obispo evangélico Mario Salfate, quien...",,,,
8,1249413994571599872,4418245397,2020-04-12 19:08:03,"@Konspyrenayko Prefiero, como solución, ofrece...",1249385551813320704,2956704706,,,1249385551813320704,2956704706,2020-04-12 17:15:02,En España 750.000 personas ganan 60.000€ según...,,,,
9,1250547011763539971,82434462,2020-04-15 22:10:16,@latercera Seguramente se llamará ..Covid ...🤣😂,1250543919290626048,3222731,,,1250543919290626048,3222731,2020-04-15 21:57:58,🇵🇪 Nace el primer bebé con coronavirus en Perú...,,,,


In [277]:
for _, r in dfr.iterrows():
    print("=================")
    print(r.full_text_y)
    print("-----------------")
    print(r.full_text_x)

Con dedicatoria al covid https://t.co/lsNNNZaWVd
-----------------
@greciagirlany Maldito covid opino lo mismo que tu!
Se entiende ahora por qué la restricción a la movilidad disminuye la tasa de propagación del COVID-19. Miren estos mapas aéreos de hoy a las 17.30 hs de Argentina y saquen sus conclusiones. En EEUU lo peor aún no ha llegado y en Brasil el peligro es enorme. Argentina sin vuelos https://t.co/1rVnoOWRjs
-----------------
@atilioboron @AliciaCastroAR @alferdez @alferdezprensa @AmadoBoudouArg @chino_navarrook @gustavoflopez @CFKArgentina @PrensaTaiana @GraRCruz @pvillegas_tlSUR EL COVID-19 VIAJA EN AVIONES. FUERON LOS QUE PROPAGARON Y SIGUEN PROPAGANDO EL VIRUS. MAS LOS CRUCEROS.
Agradezco las muestras de amistad entre🇭🇳🇺🇸 a través de su buena disposición de apoyarnos en la lucha contra el COVID-19 y la reactivación de la cooperación para apoyar nuestra economía y el desarrollo de Honduras en estos momentos de incertidumbre.
-----------------
@JuanOrlandoH Nos unimos en ag

In [260]:
pd.merge?

### retweets

In [246]:
df.retweet_tweet_id.isna().mean()

0.20257327911533454

In [247]:
rt_counts = df.reply_to_tweet_id.value_counts()

In [248]:
import plotly.express as px
px.bar(rt_counts.value_counts())