# csv to nodes
Based on: https://nbviewer.jupyter.org/github/gwu-libraries/notebooks/blob/master/20170720-building-social-network-graphs-CSV.ipynb

In [1]:
import sys
import json
import re
import numpy as np
from datetime import datetime
import pandas as pd  
from ast import literal_eval

In [2]:
# Load the first tweet CSV export. low_memory=False makes pandas read the file
# in one pass instead of in chunks, so each column gets a single inferred dtype.
tweets1 = pd.read_csv("../data/tweets1.csv", low_memory=False)

In [3]:
# Load the remaining four CSV exports, one DataFrame per file, using the same
# read options as for tweets1.
tweets2, tweets3, tweets4, tweets5 = [
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "../data/tweets2.csv",
        "../data/tweets3_1.csv",
        "../data/tweets3_2.csv",
        "../data/tweets3_3.csv",
    )
]

In [4]:
# Stack the five per-file frames into a single table of tweets.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# file's own row labels are kept, so the same label occurs several times and
# any label-based alignment or lookup on `tweets` becomes ambiguous.
tweets = pd.concat(
    [tweets1, tweets2, tweets3, tweets4, tweets5],
    ignore_index=True,
)
tweets.shape

(13847731, 31)

In [5]:
# List the available columns; the edge-building cells below select from these.
tweets.columns

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language', 'tweet_language',
       'tweet_text', 'tweet_time', 'tweet_client_name', 'in_reply_to_userid',
       'in_reply_to_tweetid', 'quoted_tweet_tweetid', 'is_retweet',
       'retweet_userid', 'retweet_tweetid', 'latitude', 'longitude',
       'quote_count', 'reply_count', 'like_count', 'retweet_count', 'hashtags',
       'urls', 'user_mentions', 'poll_choices'],
      dtype='object')

In [6]:
# 1. Export edges from Retweets

# Every row flagged as a retweet becomes one edge from the retweeting account
# (userid) to the retweeted account (retweet_userid). At this stage the
# 'Strength' column simply carries tweet_time; it is only counted later.
retweet_rows = tweets['is_retweet'] == True
edges_retweets = tweets.loc[
    retweet_rows, ['userid', 'retweet_userid', 'tweet_time']
].rename(
    columns={
        'userid': 'Source',
        'retweet_userid': 'Target',
        'tweet_time': 'Strength',
    }
)
edges_retweets.shape

(3169578, 3)

In [7]:
# 2. Export edges from Mentions

# Remove interactions without mentions: keep only rows whose user_mentions is
# present and is not the string '[]'.
has_mentions = tweets['user_mentions'].notnull() & (tweets['user_mentions'] != '[]')

# Parse each remaining user_mentions string into a Python object with
# literal_eval. assign() returns a new frame, so the parsed column is never
# written into a filtered slice of `tweets` (assigning a column on such a slice
# is chained assignment and raises SettingWithCopyWarning).
mentions = tweets.loc[has_mentions].assign(
    user_mentions=lambda frame: frame['user_mentions'].apply(literal_eval)
)

In [8]:
# Explode mentions (one row per mentioned account) and convert to an edge list:
# mentioning account -> mentioned account, with tweet_time carried as 'Strength'.
# NOTE(review): Target values are whatever literal_eval produced from
# user_mentions; if those are ints while userid values are strings, later
# Source/Target matching against user ids will not line up - confirm.
mentions_exploded = mentions.explode('user_mentions')
edges_mentions = mentions_exploded.loc[
    :, ['userid', 'user_mentions', 'tweet_time']
].rename(
    columns={
        'userid': 'Source',
        'user_mentions': 'Target',
        'tweet_time': 'Strength',
    }
)

In [9]:
# 3. Export edges from Replies

# A tweet is a reply when in_reply_to_userid is present.
replies = tweets.dropna(subset=['in_reply_to_userid'])

# Edge direction: replying account -> account being replied to, with
# tweet_time carried as 'Strength'.
edges_replies = replies.loc[
    :, ['userid', 'in_reply_to_userid', 'tweet_time']
].rename(
    columns={
        'userid': 'Source',
        'in_reply_to_userid': 'Target',
        'tweet_time': 'Strength',
    }
)

In [10]:
# Row/column counts of the three edge lists (replies, mentions, retweets).
edges_replies.shape, edges_mentions.shape, edges_retweets.shape

((3937160, 3), (11443501, 3), (3169578, 3))

In [11]:
# Combine reply, mention and retweet edges into one edge list.
edge_frames = [edges_replies, edges_mentions, edges_retweets]
edges = pd.concat(edge_frames)
edges.shape

(18550239, 3)

In [12]:
# Collapse rows that share the same Source, Target and Strength (tweet_time),
# keeping the first occurrence, so an interaction that shows up in more than
# one edge list is only counted once.
edges = edges.drop_duplicates(subset=['Source', 'Target', 'Strength'])
edges.shape

(10986674, 3)

In [13]:
# Network connection strength level: the minimum number of interactions
# (rows in `edges`) a Source -> Target pair must have to be kept.
# With 1, every pair that interacted at least once is kept; with 5, only pairs
# with at least 5 interactions remain, i.e. only the strongest bonds are shown.
strengthLevel = 3

# CHANGE THIS LINE TO CHANGE TYPE OF EDGE COUNT
# After this step 'Strength' is the number of non-null tweet_time values per pair.
pair_counts = (
    edges.groupby(['Source', 'Target'])['Strength']
    .count()
    .reset_index()
)
edges2 = pair_counts[pair_counts['Strength'] >= strengthLevel]

In [14]:
# Export nodes from the edges and add node attributes for both Sources and Targets.

# One row per account. Sorting by userid and then by follower_count descending
# puts each account's highest follower_count first, and drop_duplicates keeps
# that first row.
users = (
    tweets[['userid', 'follower_count', 'following_count']]
    .sort_values(['userid', 'follower_count'], ascending=[True, False])
    .drop_duplicates(['userid'], keep='first')
    .rename(columns={'userid': 'Id'})
)

# Every account that is the Source or Target of a kept edge; Label repeats Id.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True avoids the repeated row labels that
# stacking Source on top of Target would otherwise produce.
node_ids = pd.concat([edges2['Source'], edges2['Target']], ignore_index=True)
ids = pd.DataFrame({'Id': node_ids, 'Label': node_ids}).drop_duplicates(
    ['Id'], keep='first'
)

In [15]:
# Attach follower/following counts to the node list.
# A right join keeps every row of `users`; ids with no matching user are
# dropped, and users that appear in no kept edge get a missing Label.
# NOTE(review): confirm how='right' (all known users) is intended rather than
# how='left' (all accounts appearing in edges).
nodes = ids.merge(users, how='right', on='Id')

In [16]:
# Size check: node table first, then the thresholded edge table.
print(nodes.shape, edges2.shape, sep='\n')

(5191, 4)
(439728, 3)


In [17]:
# Preview the first rows of nodes (Id, Label, follower_count, following_count)
# as a sanity check.
nodes.head()

Unnamed: 0,Id,Label,follower_count,following_count
0,++ef3C0Kijge2nRlhqtBnIK9ULUQOE0YeSEQ83P+o=,++ef3C0Kijge2nRlhqtBnIK9ULUQOE0YeSEQ83P+o=,0,10
1,+3IgsYSyatx7xFWru0gBPqI5wU2Mg9XAtMMwA4omJU8=,+3IgsYSyatx7xFWru0gBPqI5wU2Mg9XAtMMwA4omJU8=,1,2
2,+57fv5Mr90uu0Oihtjow368S6WZhSdFIydtLcHDyzg0=,+57fv5Mr90uu0Oihtjow368S6WZhSdFIydtLcHDyzg0=,6,29
3,+AG1yidnkMeRXPhzyZCqlfw1B8sKWJy1SiZXYcOkyg8=,+AG1yidnkMeRXPhzyZCqlfw1B8sKWJy1SiZXYcOkyg8=,0,12
4,+B2YBWL5Sen6rfThzwlgJ++dO8nCOP9MVBeshDa5zWA=,+B2YBWL5Sen6rfThzwlgJ++dO8nCOP9MVBeshDa5zWA=,0,0


In [18]:
# Preview the per-account attribute table built from `tweets`.
users.head()

Unnamed: 0,Id,follower_count,following_count
2729,++ef3C0Kijge2nRlhqtBnIK9ULUQOE0YeSEQ83P+o=,0,10
563455,+0llyRnQonFaRwD8XWF3WNBAJ2cbgV2Sot6nZwHzf0c=,0,44
938354,+1mvzLvnEJsqyRmB62F1YTUfpJU47pZDzk9NttisEo=,0,0
80821,+3IgsYSyatx7xFWru0gBPqI5wU2Mg9XAtMMwA4omJU8=,1,2
1836400,+3zsHbEJuQ4nZCRZiUM3F1hQ7YuQWl89kxtOWmcEyks=,0,0


In [20]:
# see the interactions only within the network
# An edge is kept only when both its Source and its Target are accounts
# present in `users`.
source_known = edges2['Source'].isin(users['Id'])
target_known = edges2['Target'].isin(users['Id'])
edges2 = edges2[source_known & target_known]
edges2.shape

(14054, 3)

In [21]:
# Export nodes and edges to csv files
# Each table is written as UTF-8 without the DataFrame index column.
for table, csv_path in (
    (users, '../data/users.csv'),
    (edges2, '../data/edges.csv'),
    (nodes, '../data/nodes.csv'),
):
    table.to_csv(csv_path, encoding='utf-8', index=False)

In [None]:
# Ad-hoc lookup: all tweets authored by one specific account id.
tweets.loc[tweets['userid'] == '480709771']

In [None]:
# Row/column count of the thresholded edge table.
edges2.shape

In [None]:
# Row/column count of edges2 again.
# NOTE(review): unexecuted cell repeating the check in the cell above -
# candidate for removal.
edges2.shape