In [1]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import dask.dataframe as dd
import html
import psutil
import pandas as pd
import pickle as pkl
import re
import time

### Network Generation [1:2]: The DataFrame

In [189]:
# Load the pickled question, answer, comment and user frames.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
questions = pd.read_pickle('./pickle_dataframes/questions_with_sentiment.pkl')

# The answers are stored in two files; stack them under a fresh 0..n-1 index.
answers = pd.concat(
    [
        pd.read_pickle('./pickle_dataframes/answers_with_sentiment1.pkl'),
        pd.read_pickle('./pickle_dataframes/answers_with_sentiment2.pkl'),
    ],
    ignore_index=True,
)

comments = pd.read_pickle('./pickle_dataframes/comments_with_sentiment.pkl')
users = pd.read_pickle('./pickle_dataframes/users_with_all_attributes.pkl')

In [190]:
# Shape of the array of distinct question-owner IDs (i.e. how many different users asked a question)
questions['OwnerUserId'].unique().shape

(924,)

In [191]:
# Plain Python list of all question IDs
parent_list = questions['Id'].tolist()

# Split the comments by the kind of post they were left on:
# a PostId found among the question Ids vs. one found among the answer Ids.
targets_question = comments['PostId'].isin(questions['Id'])
targets_answer = comments['PostId'].isin(answers['Id'])

comments_on_questions = comments[targets_question]
comments_on_answers = comments[targets_answer]

In [192]:
def _commenters_per_post(comment_rows):
    """Return one row per PostId holding the list of UserIds that commented on that post."""
    return comment_rows.groupby('PostId')['UserId'].apply(list).reset_index()


# Collapse the comment rows to one list of commenters per post
comments_on_questions_agg = _commenters_per_post(comments_on_questions)
comments_on_answers_agg = _commenters_per_post(comments_on_answers)
comments_on_questions_agg.head(1)

Unnamed: 0,PostId,UserId
0,1,"[28, 18, 8018]"


In [193]:
# Look-up table: answer Id -> Id of the question it belongs to (ParentId)
answer_to_question_map = answers.set_index('Id')['ParentId'].to_dict()

# Attach the parent question to every commented answer, then keep only the
# rows for which a parent question was found.
comments_on_answers_agg = (
    comments_on_answers_agg
    .assign(MappedPostId=comments_on_answers_agg['PostId'].map(answer_to_question_map.get))
    .loc[lambda frame: frame['MappedPostId'].notnull()]
)

comments_on_answers_agg.head(1)

Unnamed: 0,PostId,UserId,MappedPostId
0,4,[7014],1


In [194]:
def _flatten(list_series):
    """Concatenate every list in ``list_series`` into one flat list, preserving order.

    Builds the result in a single pass; ``sum(lists, [])`` would copy the
    growing accumulator on every addition.
    """
    return [user_id for user_ids in list_series for user_id in user_ids]


# Stack the per-question and per-answer aggregates under a shared 'QuestionId' key.
# Rows from the first frame only have 'CommentersOnQuestion', rows from the
# second only have 'CommentersOnAnswers'; the other column is NaN after the concat.
comments_combined = pd.concat([
    comments_on_questions_agg.rename(columns={'PostId': 'QuestionId', 'UserId': 'CommentersOnQuestion'}),
    comments_on_answers_agg.rename(columns={'MappedPostId': 'QuestionId', 'UserId': 'CommentersOnAnswers'})
], ignore_index=True)

# Replace the NaN placeholders with empty lists so the columns can be flattened
for column in ('CommentersOnQuestion', 'CommentersOnAnswers'):
    comments_combined[column] = comments_combined[column].apply(
        lambda users: users if isinstance(users, list) else []
    )

# One row per question: all commenters on the question itself and all
# commenters on any of its answers
comments_combined = comments_combined.groupby('QuestionId').agg(
    CommentersOnQuestion=('CommentersOnQuestion', _flatten),
    CommentersOnAnswers=('CommentersOnAnswers', _flatten)
).reset_index()
comments_combined.head(1)

Unnamed: 0,QuestionId,CommentersOnQuestion,CommentersOnAnswers
0,1,"[28, 18, 8018]","[7014, 9921]"


In [195]:
# Keep only the answers whose parent question is one of the collected questions
df_int = answers.loc[answers['ParentId'].isin(parent_list)]
df_int.head(1)

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,AnswerTopic,Body_Processed,BodySentiment
0,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1,7,first past post voting tends result smaller nu...,0.9788


In [196]:
# One row per question (ParentId) with the OwnerUserId of each of its answers, in row order
df_subpost = (
    df_int
    .groupby('ParentId')['OwnerUserId']
    .apply(list)
    .rename('users_list')
    .reset_index()
)
df_subpost.head(1)

Unnamed: 0,ParentId,users_list
0,1,"[26, 8, 4666]"


In [197]:
# Restrict to questions that received at least one answer, then collect the
# owner(s) of each question into a list, indexed by question Id
answered_questions = questions.loc[questions['Id'].isin(df_int['ParentId'])]
df_original_poster = (
    answered_questions
    .groupby('Id')['OwnerUserId']
    .apply(list)
    .to_frame('original_poster')
)
df_original_poster.head(1)

Unnamed: 0_level_0,original_poster
Id,Unnamed: 1_level_1
1,[18]


In [198]:
# Pair every question's author with the users who answered it.
# 'Id' is the index of df_original_poster; 'ParentId' is the matching column of df_subpost.
df_graph = pd.merge(
    left=df_original_poster,
    right=df_subpost,
    left_on='Id',
    right_on='ParentId'
)

# 'original_poster' holds a list per question; unwrap it to its first element
df_graph['original_poster'] = df_graph['original_poster'].apply(lambda x: x[0] if x else None)

# Attach the commenters. This is a left merge, so questions without any comment
# are kept but get NaN in the columns coming from comments_combined.
df_graph = pd.merge(df_graph, comments_combined, left_on='ParentId', right_on='QuestionId', how='left')

# Turn those NaN cells into empty lists so that every row can be iterated uniformly
for column in ('CommentersOnQuestion', 'CommentersOnAnswers'):
    df_graph[column] = df_graph[column].apply(
        lambda users: users if isinstance(users, list) else []
    )

#### If you want to see how to get each of the values in df_graph's columns expand below rows: 

In [199]:
# Show the first row of df_graph; the following cells trace each of its columns back to the raw frames
df_graph.head(1)

Unnamed: 0,original_poster,ParentId,users_list,QuestionId,CommentersOnQuestion,CommentersOnAnswers
0,18,1,"[26, 8, 4666]",1.0,"[28, 18, 8018]","[7014, 9921]"


In [148]:
# Question 1: its OwnerUserId (18) is the value of df_graph['original_poster'] for this question
questions.loc[questions['Id'] == 1]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,CombinedText,Topic,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
0,1,1,-1,5,2012-12-04 21:40:29.743,42,8309,know situation could arise u one candidate win...,18,2019-06-29 09:18:38.430,disadvantage first past post electoral system,,3,3,disadvantage first past post electoral system ...,7,know situation could arise u one candidate win...,disadvantage first past post electoral system,0.886,-0.4215


In [150]:
# Answers to question 1: their OwnerUserId values are the users in
# df_graph['users_list'] == [26, 8, 4666]
answers.loc[answers['ParentId'] == 1]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,AnswerTopic,Body_Processed,BodySentiment
0,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1,7,first past post voting tends result smaller nu...,0.9788
1,5,2,1,-1,2012-12-04 21:58:39.037,47,-1,<p>Simple plurality voting has very little in ...,8,2012-12-04 22:04:42.767,Comment: N/A,Comment: N/A,-1,1,7,simple plurality voting little favor election ...,0.9782
3077,10983,2,1,-1,2016-05-21 05:28:21.770,5,-1,<p>First-past-the-post systems are polarizing....,4666,2016-05-21 05:28:21.770,Comment: N/A,Comment: N/A,-1,0,7,first past post system polarizing advantage ta...,0.9669


In [149]:
# Comments left directly on question 1: their UserId values are the users in
# df_graph['CommentersOnQuestion'] == [28, 18, 8018]
comments.loc[comments['PostId'] == 1]

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,CommentTopic,Text_Processed,TextSentiment
0,1,1,9,Is it fair to inquire about the disadvantages ...,2012-12-04 22:00:00.933,28,7,fair inquire disadvantage without taking accou...,0.2732
1,3,1,3,"I could have reformulated the question, but at...",2012-12-04 22:02:37.737,18,7,could reformulated question least disadvantage...,0.3269
7646,36295,1,0,You may find this cat's explanation interestin...,2016-05-19 12:09:10.397,8018,7,may find cat explanation interesting http www ...,0.4019


In [152]:
# Comments on answer 4, the first of the three answers to question 1: its commenter (7014)
# is one of the users in df_graph['CommentersOnAnswers'] == [7014, 9921]
comments.loc[comments['PostId'] == 4]

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,CommentTopic,Text_Processed,TextSentiment
95611,97897,4,2,The primary system used by America makes the p...,2017-12-12 04:26:11.517,7014,7,primary system used america make polarization ...,0.7906


### Network Generation [2:2]: The graph

In [162]:
import networkx as nx

# Start from an empty undirected graph and add one node per question author
G = nx.Graph()
G.add_nodes_from(df_graph['original_poster'].values)

# Number of distinct authors now present as nodes
G.number_of_nodes()

912

In [None]:
def _as_user_list(value):
    """Return ``value`` if it is a list, otherwise an empty list.

    A cell can hold a missing value (e.g. the NaN a left merge produces for
    questions without comments) instead of a list; NaN is a float and cannot
    be iterated.
    """
    return value if isinstance(value, list) else []


# Connect every question author to each user who answered the question,
# commented on the question, or commented on one of its answers.
# G is an undirected simple graph, so adding an existing edge again is a no-op.
# NOTE(review): an author who comments on their own question (user 18 on
# question 1) yields a self-loop - confirm this is wanted before computing
# degree-based measures.
for _, row in df_graph.iterrows():
    original_poster = row['original_poster']
    G.add_node(original_poster)

    for column in ('users_list', 'CommentersOnQuestion', 'CommentersOnAnswers'):
        for user in _as_user_list(row[column]):
            G.add_edge(original_poster, user)

G.edges()
# nx.write_graphml(G, 'graph.graphml')

### Network Analysis
> *"The "label shuffling technique" is incredibly useful. It may turn out to be a good tool to apply for your independent project. Keep it in mind."* - **Week8.ipynb**

Use the "label shuffling test" (Week 5 and 8) to test if the coast with the highest Wikipedia page sentiment has a page sentiment that is significantly higher (at the 5% significance level) than a randomly selected group of rappers of the same size.


**Prompt**: "During our lectures we worked with a different dataset. We did a permutation test. This is how they taught it to us: "
Compute the average node centrality for west-coast and east-coast rappers respectively. You are free to choose your favourite centrality measure or compare different ones. Which coast do the more central rappers have?
Create a randomized version of the full graph, where the association between nodes and coast is shuffled. Compute the average centrality of west-coast and east-coast rappers for this randomized graph.
Hint: To create this shuffling, simply give everyone a random coast assignment (but with the same number of west-coast and east-coast rappers as in the real data). The way I usually create random labels is by using either random.shuffle or numpy.random.permutation.
Repeat the shuffling process above 1000 times (at least) and plot two histograms, each histogram with the 1000 corresponding average values of centrality obtained for west-coast and east-coast rappers in the shuffled networks.
Compare the value you have obtained in the real graph, with the randomized cases. Are rappers from one coast more central in the network (on average)?
Think about your results, what could be possible explanations for what you observe?
The shuffling procedure you just carried out is often called a permutation test. It is a super powerful tool that we use all the time. Once you start to think about it, you can answer many questions of the type "Is [property] higher/lower in this network than one might have expected by chance?" with permutation tests.
"

My friend did his permutation test like so: "
Permutation Test on Features from the K-means Groups in the Network
A permutation test [55] (also called re-randomization test) is an exact statistical hypothesis
test [56] making use of proof by contradiction [57]. A permutation test
involves two or more samples. The null hypothesis is that all samples come from the
same distribution H0 : F = G. Under the null hypothesis, the distribution of the
test statistic is obtained by calculating all possible values of the test statistic under
possible rearrangements of the observed data. Permutation tests are, therefore, a
form of resampling [58].
The Permutation Test used in this work is a version of [55], where we shuffle properties
of nodes to create ensembles of networks to which we can compare our observed
network [59]. In this work, we shuffle the cluster labels. The steps of the permutation
test conducted in this work is outlined below:
1. Compute the average node assortativity (for a selected feature) for each of the
clusters derived by the k-means clustering algorithm (see Section 6.2). We call
these $C_1, \dots, C_K$ for clusters $k \in \{1, \dots, K\}$, where
$$C_k = \frac{1}{k_N} \sum_{n \,\in\, \text{cluster } k} c_{kn},$$
$k_N$ is the number of nodes in cluster $k$, and $c_{kn}$ is the assortativity of node $n$ in cluster
$k$. These values $C_1, \dots, C_K$ are the ones we will compare to the shuffled/permuted
networks below.
2. Now, we build a single comparison graph. We create a version of the full graph,
R, where the association between nodes and clusters is shuffled, hence in R it
is arbitrary which cluster a node belongs to, but we maintain the same number
of nodes per cluster as in the original graph, G. Now, we compute the average
assortativity (for the selected feature) for each of the clusters k ∈ {1, . . . ,K}
for the R-graph. We will call these C1(R), . . . ,Ck(R).
3. We repeat the shuffling process above 1000 times (resulting in 1000 graphs,
R1, . . . ,R1000). Now, we plot the two histograms; one with Ck(R1), . . . ,Ck(R1000)
for k ∈ {1, . . . ,K}, which we then compare with our real values C1, . . . ,Ck for
each cluster k ∈ {1, . . . ,K}.
4. We compare the distribution of averages from the shuffled networks to our real
value. We can now answer questions like “are the real values typical”, as in
Figure 7.3 Plot A, or rare, as in Figure 7.3 Plot B. If, for a cluster, the real
value is outside the distribution of averages, we can say that the cluster has a
statistically significant different real value from what is expected from the data
in that cluster.
A note on the measures computed: One can choose to run the above permutation
test with any derived metric or measure, for instance feature assortativity, closeness
centrality, or betweenness centrality (see Section 7.5.6). From the distribution of what
you would expect to see from the data points we work with (the typical histogram),
we can see whether specific clusters have significantly different real values from what
is expected (real value outside the typical histogram):
"

How can I perform this label reshuffling/permutation test on clusters based on user level features I found in my data?"

**Answer**
1. Initial Cluster Analysis
Cluster Your Data: Use a clustering algorithm (like k-means) to cluster your users based on the features you're interested in. Assign each user to a cluster.
Compute Initial Statistics: For each cluster, compute the average (or another statistic of interest) of a specific feature or set of features. These are your observed values.
2. Permutation Test Procedure
Shuffle Cluster Labels: Randomly shuffle the cluster labels assigned to the users, ensuring that the number of users in each cluster remains the same as in the original classification.
Recompute Statistics for Shuffled Data: For each shuffled configuration, recompute the same statistics as in your initial analysis for each cluster.
Repeat the Process: Perform this shuffling and recomputing process a large number of times (typically 1000 or more) to build a distribution of the statistic under the null hypothesis.
3. Analysis and Comparison
Create Histograms: For each cluster, create histograms of the computed statistics from the shuffled data.
Compare Observed Values with Distributions: Compare the initially observed values for each cluster with the distributions obtained from the shuffled data. If your observed value lies outside the bulk of the distribution for shuffled data, it suggests that the observed value is not simply due to random chance.
4. Statistical Significance
P-Value Calculation: For each cluster, you can calculate a p-value, which is the proportion of the shuffled datasets where the computed statistic was at least as extreme as the observed statistic. A small p-value indicates that the observed statistic would be unusual under the null hypothesis that the cluster labels are unrelated to the feature.
5. Interpretation
Draw Conclusions: Based on where your observed statistics fall in relation to the distributions from the shuffled data, draw conclusions about whether the features in each cluster are significantly different from what would be expected by chance.