In [13]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import urllib, json

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
!pip install matplotlib




# Introduction to Networkx

The simple methods '.nodes()' and '.edges()' print the key components of the network.

In [14]:
HG = nx.house_graph()
HG.nodes()
HG.edges()

NodeView((0, 1, 2, 3, 4))

EdgeView([(0, 1), (0, 2), (1, 3), (2, 3), (2, 4), (3, 4)])

## Drawing the network

To draw the network, we need to give it a dictionary that maps each node to an X-Y coordinate. I give one below.

In [16]:
pos = {0: (0, 0), 1: (1, 0), 2: (0, 1), 3: (1, 1), 4: (0.5, 2.0)}

nx.draw_networkx_nodes(
    HG, pos, node_size=3000, nodelist=[0, 1, 2, 3], 
    node_color="tab:blue"
)

nx.draw_networkx_nodes(HG, pos, node_size=2000, 
                       nodelist=[4], node_color="tab:orange")

nx.draw_networkx_edges(HG, pos, alpha=1, width=6)
nx.draw_networkx_labels(HG, pos, font_size=20)
plt.tight_layout()
plt.axis("off")

<matplotlib.collections.PathCollection at 0x1691ecf40>

<matplotlib.collections.PathCollection at 0x1691c24c0>

<matplotlib.collections.LineCollection at 0x1691c2d00>

{0: Text(0, 0, '0'),
 1: Text(1, 0, '1'),
 2: Text(0, 1, '2'),
 3: Text(1, 1, '3'),
 4: Text(0.5, 2.0, '4')}

(-0.10500000000000001, 1.105, -0.21000000000000002, 2.21)

Error in callback <function _draw_all_if_interactive at 0x12f0a3b80> (for post_execute), with arguments args (),kwargs {}:


ValueError: object __array__ method not producing an array

ValueError: object __array__ method not producing an array

<Figure size 640x480 with 1 Axes>

In [17]:
ADJ = nx.to_pandas_adjacency(HG)
ADJ = ADJ.astype(bool)

In [18]:
deg_0 = nx.degree(HG, 0) 
deg_0

2

# Activity
We will learn how to process a dataframe into a useable edgelist to feed into a graph.

In [19]:
df = pd.read_pickle("../public_data/Raw_Tweets.pkl") # Load your tweets
df.head()
df.shape

Unnamed: 0,tweetid,userid,date,lang,text,tweet_type,friends_count,followers_count,statuses_count,mentionid,rt_userid,rt_tweetid,qtd_userid,qtd_text,qtd_tweetid
12221,1221208309728006145,418859255,Sat Jan 25 23:08:44 +0000 2020,en,"RT @DepSecHargan: Every year in the US, 34,800...",retweeted_tweet_without_comment,638,1837,17778,[920035992073064450],9.20036e+17,1.221208e+18,,,
28315,1221213073652043776,14499829,Sat Jan 25 23:27:40 +0000 2020,en,WHO’s risk assessment of the new #coronavirus ...,original,1743,5196950,46697,[],,,,,
6092,1222281847163576321,1135486501,Tue Jan 28 22:14:35 +0000 2020,en,#FYI: 50 million are under travel restrictions...,original,950,27025,11145,[9624742],,,,,
13618,1222289121118511104,117793973,Tue Jan 28 22:43:30 +0000 2020,en,What should I do if I have traveled to China a...,original,1236,5860,4578,[],,,,,
19205,1222285279047843843,1465196934,Tue Jan 28 22:28:14 +0000 2020,en,Disease outbreaks can result in misinformation...,original,1946,3581,4636,[],,,,,


(59887, 15)

### 1.1 Subset the dataframe to network data

- First, using .value_counts(), identify how many types of tweets there are, using the tweet_type column. 
- Then, filter out all tweets that are not "retweeted_tweet_without_comment" and "quoted_tweet."

In [20]:
# your code here
number_tweet_type = df.tweet_type.value_counts()
number_tweet_type

tweet_type
original                           26275
retweeted_tweet_without_comment    19923
reply                               7292
quoted_tweet                        6397
Name: count, dtype: int64

In [21]:
## Run this. Why is it empty?
df[(df.tweet_type == "retweeted_tweet_without_comment") & (df.tweet_type == "quoted_tweet")]

Unnamed: 0,tweetid,userid,date,lang,text,tweet_type,friends_count,followers_count,statuses_count,mentionid,rt_userid,rt_tweetid,qtd_userid,qtd_text,qtd_tweetid


### 1.2 Create an edgelist

The "userid" column shows the person that did the retweeting. Create a new column called "userid2" that denotes the user retweeted. You will need to use the columns "rt_userid" and "qtd_userid."

In [23]:
##network_df = df[(df['tweet_type'] == "retweeted_tweet_without_comment") | (df['tweet_type'] == "quoted_tweet")]

##network_df['userid2'] = network_df.apply(lambda row: row['rt_userid'] if row['tweet_type'] == "retweeted_tweet_without_comment" else row['qtd_userid'], axis=1)
##network_df['userid2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  network_df['userid2'] = network_df.apply(lambda row: row['rt_userid'] if row['tweet_type'] == "retweeted_tweet_without_comment" else row['qtd_userid'], axis=1)


12221     9.200360e+17
25575     1.697808e+09
28489     9.200360e+17
28496     1.661606e+07
12992     1.965894e+07
              ...     
178123    6.841204e+07
71249     4.496188e+07
111383    1.615192e+07
159487    1.465700e+08
166622    3.762414e+09
Name: userid2, Length: 26320, dtype: float64

In [35]:
## alternative solution
df["userid2"] = df["rt_userid"]
df["userid2"] = df["userid2"].fillna(df["qtd_userid"])

In [38]:
## alternative solution 2
df["userid2"] = df.rt_userid.fillna(0) + df.qtd_userid.fillna(0)
df["userid2"]

12221     9.200360e+17
28315     0.000000e+00
6092      0.000000e+00
13618     0.000000e+00
19205     0.000000e+00
              ...     
116861    0.000000e+00
118583    0.000000e+00
159427    0.000000e+00
159487    1.465700e+08
166622    3.762414e+09
Name: userid2, Length: 59887, dtype: float64

### 1.3 Sentiment scoring. 
- Now, use SentimentIntensityAnalyzer to label the sentiment of the text, in a new column called "weight." 
- The subset the dataframe to only have "userid", "user2", and "weight". 
- Rename "userid" to "user1", then group by "user1" and "user2", to find the mean sentiment (mean weight).

You now have your first edgelist!

In [31]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/samanthawang/nltk_data...


True

In [None]:
def get_sentiment_score(text):
    # Calculate the sentiment score
    score = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    return score

df['weight'] = df['text'].apply(get_sentiment_score)

In [43]:
import numpy as np

df.userid2 = df.userid2.astype(np.int64) #change userid2 to string
df[["userid", "userid2", "weight"]]

Unnamed: 0,userid,userid2,weight
12221,418859255,920035992073064448,-0.6597
28315,14499829,0,-0.2732
6092,1135486501,0,0.0000
13618,117793973,0,-0.4767
19205,1465196934,0,0.0000
...,...,...,...
116861,29201047,0,-0.9100
118583,2353731720,0,0.0000
159427,18909919,0,0.4215
159487,44783853,146569971,-0.2023


In [None]:
edgelist_df.rename(columns={'userid': 'user1', 'userid2': 'user2'}, inplace=True)
edgelist_df = edgelist_df.groupby(['user1', 'user2'], as_index=False).mean()

# Netwulf Visualization
#### If you don't have the module, pip install netwulf
#### https://netwulf.readthedocs.io/en/latest/about.html

In [66]:
EL =  pd.read_pickle("../public_data/edge_list_final2.pkl")

In [67]:
EL[["user1","user2"]].values

array([[ 426033838,  146569971],
       [ 418859255,  146569971],
       [ 418859255,  146569971],
       ...,
       [2953974395,    5558312],
       [2953974395, 1209417007],
       [  18023868,   23711785]])

In [68]:
G = nx.Graph() # Creates a graph object
G.add_edges_from( EL[["user1","user2"]].values ) # Adds edges (and hence nodes from the edgelist)

# Creates the biggest subgraph
CC_nodes = list( nx.connected_components(G) )[0] # Generates a list of connected components, extracts the largest
GG = nx.subgraph(G, CC_nodes) # Creates the subgraph of the largest CC

In [62]:
from netwulf import visualize
import netwulf as nw

In [63]:
stylized_network, config = visualize(GG,)

In [69]:
U = pd.read_pickle("../public_data/UserInfo.pkl")
id2color = dict(zip(U.userid.values, U.PlotColor.values))
for k, v in GG.nodes(data=True):
    v["group"] = id2color[k]

In [70]:
stylized_network, config = visualize(GG,port=2000)

### Activity Two
- Create a new column in U called "Degree," which denotes the degree of the node (userid). You will want to use G.degree()[n] where n is the userid.
- Group by User Type. Which group was the most popular during the pandemic?

In [76]:
# your code here.
U

Unnamed: 0,userid,PlotColor,User_type,Degree
0,1079770852302950016,grey,Independent,0
1,172858784,royalblue,Democrat,5
2,48117116,firebrick,Republican,1
3,129259273,firebrick,Republican,0
4,1249982359,firebrick,Republican,80
...,...,...,...,...
659,188369254,gold,Health Official,9
660,59545968,gold,Health Official,18
661,2353731720,gold,Health Official,8
662,293028988,gold,Health Official,10


In [78]:
# Calculate degrees
U['Degree'] = U['userid'].apply(lambda n: G.degree(n) if n in G else 0)

In [79]:
# Group by 'User Type'
popular_groups = U.groupby('User_type')['Degree'].sum().reset_index()
popular_groups = popular_groups.sort_values(by='Degree', ascending=False)