In [9]:
import pymongo
from consts import *

# Open the MongoDB client. DB_CONN_STRING, DB_NAME and the COLLECTION_NAME_*
# constants are not defined in this notebook; presumably they come from the
# `consts` star import above (TODO confirm).
mongo_client = pymongo.MongoClient(DB_CONN_STRING)
# NOTE: despite its name, this is the object obtained by indexing the client
# with DB_NAME (it is indexed by collection name below), not a name string.
database_name = mongo_client[DB_NAME]

# Collection handles. tweets_collection is queried by the cells below;
# users_collection is not used in the visible part of the notebook.
tweets_collection = database_name[COLLECTION_NAME_TWEETS]
users_collection = database_name[COLLECTION_NAME_USERS]

In [10]:
from collections import Counter

# Adjacency map of the mention graph: source node -> list of target nodes,
# one list entry per inserted edge (repeated edges are kept as duplicates).
graph = {}

def insert(graph, edge):
    """Record a directed edge in an adjacency-list graph.

    Args:
        graph: dict mapping a source node to the list of its targets; it is
            modified in place.
        edge: a two-item ``(source, target)`` pair.

    Returns:
        None. The target is appended to the source's list, creating the list
        on first use; duplicates are kept.
    """
    source, target = edge
    graph.setdefault(source, []).append(target)

def get_nodes(graph_with_weights):
    """Return the set of all nodes that appear in a weighted adjacency map.

    Args:
        graph_with_weights: mapping of source node -> iterable of entries
            whose first item is the neighbouring node (for example
            ``(target, weight)`` pairs).

    Returns:
        A set containing every source key and every neighbour found.
    """
    found = set()
    for source, entries in graph_with_weights.items():
        found.add(source)
        # Only the first item of each entry is the node; the rest is ignored.
        found.update(entry[0] for entry in entries)
    return found

# Add one directed edge (tweet author -> mentioned user) per mention.
for tw in tweets_collection.find({}, {"user": 1, "mentionedUsers": 1}):
    # .get() tolerates documents that lack the "mentionedUsers" field
    # entirely; a missing, None or empty value contributes no edges.
    for user2 in tw.get("mentionedUsers") or []:
        insert(graph, (tw["user"], int(user2)))

# Collapse repeated mentions into (target, count) pairs per source.
g = {k: list(Counter(v).items()) for k, v in graph.items() if len(v) > 0}
nodes = get_nodes(g)
# Keep only edges whose endpoints are both in `nodes`. With get_nodes
# returning every node of g, this currently filters nothing out.
g = {k: [vi for vi in v if vi[0] in nodes] for k, v in g.items() if k in nodes}

# Write the weighted edge list. The context manager closes the file even if
# a write fails part-way through.
with open("users-mentions-edges.csv", "w") as file:
    file.write("source,target,weight\n")
    for u, neighbours in g.items():
        for v in neighbours:
            file.write(f'{u},{v[0]},{v[1]}\n')

In [34]:
import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def cast_tw_types(tweet):
    """Cast the id fields of a tweet document to ``np.int64`` in place.

    The ``'user'`` and ``'quotedTweet'`` entries are converted when they hold
    a truthy value. Falsy values (``None``, ``0``, empty string) are left
    unchanged, and a field that is absent from the document is skipped rather
    than raising ``KeyError``.

    Args:
        tweet: dict-like tweet document; it is modified in place.

    Returns:
        The same ``tweet`` object, to allow use inside ``map`` or a
        comprehension.
    """
    for field in ('user', 'quotedTweet'):
        value = tweet.get(field)
        if value:
            tweet[field] = np.int64(value)
    return tweet

# Only these fields are requested from MongoDB.
projection = {'_id': 1, 'user': 1, 'quotedTweet': 1, 'sentiment': 1, 'covid_topic': 1}

# Tweets flagged covid_topic=True, with their id fields cast as they are read.
tweets = [cast_tw_types(doc) for doc in tweets_collection.find({'covid_topic': True}, projection)]

# Every tweet in the collection, cast the same way.
all_tweets = [cast_tw_types(doc) for doc in tweets_collection.find({}, projection)]

# int64 is requested for the whole frame.
# NOTE(review): how columns that cannot hold int64 (e.g. fractional
# sentiment, missing quotedTweet) are treated depends on the pandas version; confirm.
df = pd.DataFrame(data=tweets, dtype=np.int64)
all_tweets_df = pd.DataFrame(data=all_tweets, dtype=np.int64)

In [35]:
# Missing quotedTweet values (NaN) become 0 and the column is cast to int64.
# Vectorised replacement for the per-element applymap lambda (applymap is
# deprecated in recent pandas).
# NOTE(review): if this column was float64 before the cast, ids above 2**53
# may already have been rounded; confirm whether exact ids are needed.
df["quotedTweet"] = df["quotedTweet"].fillna(0).astype(np.int64)

In [36]:
# Rows whose quotedTweet is non-zero, as an independent frame so that the
# columns added to q_df later do not touch df.
q_df = df.loc[df["quotedTweet"] != 0].copy()

In [49]:
# Drop the last two decimal digits of each tweet id (floor division by 100)
# so that ids are matched on their leading digits only. Done column-wise
# instead of through the deprecated, per-element applymap.
# NOTE(review): presumably this compensates for ids that were rounded while
# held as floats; confirm, since distinct tweets can share a trimmed id.
all_tweets_df["_idTrimmed"] = all_tweets_df["_id"] // 100
all_tweets_df["quotedTweetTrimmed"] = all_tweets_df["quotedTweet"] // 100

q_df["_idTrimmed"] = q_df["_id"] // 100
q_df["quotedTweetTrimmed"] = q_df["quotedTweet"] // 100

In [50]:
def compare_id(this, other):
    """Tell whether two sliceable ids match once their last two items are dropped.

    Args:
        this: first id (any sliceable value, e.g. a string of digits).
        other: second id of a comparable type.

    Returns:
        The result of comparing ``this[:-2]`` with ``other[:-2]``.
    """
    this_head, other_head = this[:-2], other[:-2]
    return this_head == other_head

def getQuotedTweetUser(df, quotedTweetId):
    """Look up the user of the tweet whose trimmed id equals ``quotedTweetId``.

    Args:
        df: DataFrame with ``'_idTrimmed'`` and ``'user'`` columns.
        quotedTweetId: trimmed tweet id to search for in ``'_idTrimmed'``.

    Returns:
        The ``'user'`` value of the matching row when exactly one row
        matches; ``None`` when there is no match or more than one.
    """
    owners = df.loc[df['_idTrimmed'] == quotedTweetId, "user"].tolist()
    if len(owners) != 1:
        # Zero matches (unknown tweet) or several (ambiguous trimmed id).
        return None
    return owners[0]
    
# Resolve every quoted tweet to its author with a single keyed lookup instead
# of scanning all_tweets_df once per row of q_df.
# Only trimmed ids that occur exactly once in all_tweets_df are eligible
# (keep=False drops every row of a duplicated id), which mirrors the
# "exactly one match" rule of getQuotedTweetUser. Ids with no match or an
# ambiguous match come out as missing values.
unique_owner_by_trimmed_id = (
    all_tweets_df
    .drop_duplicates(subset="_idTrimmed", keep=False)
    .set_index("_idTrimmed")["user"]
)
q_df["quotedTweetUser"] = q_df["quotedTweetTrimmed"].map(unique_owner_by_trimmed_id)

In [51]:
# Work on a copy so q_df keeps its unresolved (NaN) rows.
q_df2 = q_df.copy()
# Unresolved quoted users (NaN) become 0 and the column is cast to int64;
# vectorised replacement for the deprecated per-element applymap lambda.
q_df2["quotedTweetUser"] = q_df2["quotedTweetUser"].fillna(0).astype(np.int64)
# Keep only the rows where the quoted tweet's user is non-zero.
q_df2 = q_df2.loc[q_df2["quotedTweetUser"] != 0].copy()
q_df2

Unnamed: 0,_id,user,quotedTweet,covid_topic,sentiment,quotedTweetUser,_idTrimmed,quotedTweetTrimmed
55,1365987014160613378,2527486670,1365986142026358784,1,-0.5,194399035,13659870141606133,13659861420263587
83,1365793863512817668,747708529,1365753774413914112,1,1.0,1377800274,13657938635128176,13657537744139141
97,1365805140595388421,392090615,1363132750136549376,1,0.0,392090615,13658051405953884,13631327501365493
240,1365353647148826633,4714504895,1364997806012829696,1,0.5,4714504895,13653536471488266,13649978060128296
256,1365346239999524870,1019150857,1365336254926249984,1,0.0,1346846950080081920,13653462399995248,13653362549262499
...,...,...,...,...,...,...,...,...
354534,1388447887735070720,1131254074349899778,1388381262814064640,1,-1.0,3041668231,13884478877350707,13883812628140646
354548,1388436309145001984,123346316,1388434252971323392,1,1.0,4044862769,13884363091450019,13884342529713233
354588,1388396474296377344,985610412203937792,1388072259332497408,1,0.0,3405465905,13883964742963773,13880722593324974
354602,1388384710246686727,2237549338,1388381458914496512,1,0.5,504446603,13883847102466867,13883814589144965


In [54]:
# Export the quote edges (quoting user -> quoted user); the frame's index is
# written as the first, unnamed column.
q_df2.to_csv("users-quotes-edges.csv", columns=["user", "quotedTweetUser"])

In [53]:
# Display q_df, which still contains the rows whose quotedTweetUser could not
# be resolved (shown as NaN in the output below).
q_df

Unnamed: 0,_id,user,quotedTweet,covid_topic,sentiment,quotedTweetUser,_idTrimmed,quotedTweetTrimmed
13,1366102095120760833,2809837583,1366072414568910848,1,0.5,,13661020951207608,13660724145689108
55,1365987014160613378,2527486670,1365986142026358784,1,-0.5,1.943990e+08,13659870141606133,13659861420263587
72,1365929584324149250,21931918,1365926661997346816,1,0.0,,13659295843241492,13659266619973468
79,1365859972366565376,1168499091485016066,1365852215609491456,1,0.0,,13658599723665653,13658522156094914
83,1365793863512817668,747708529,1365753774413914112,1,1.0,1.377800e+09,13657938635128176,13657537744139141
...,...,...,...,...,...,...,...,...
354534,1388447887735070720,1131254074349899778,1388381262814064640,1,-1.0,3.041668e+09,13884478877350707,13883812628140646
354548,1388436309145001984,123346316,1388434252971323392,1,1.0,4.044863e+09,13884363091450019,13884342529713233
354588,1388396474296377344,985610412203937792,1388072259332497408,1,0.0,3.405466e+09,13883964742963773,13880722593324974
354602,1388384710246686727,2237549338,1388381458914496512,1,0.5,5.044466e+08,13883847102466867,13883814589144965
