In [315]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from networkx.algorithms.community import greedy_modularity_communities
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import dask.dataframe as dd
import html
import networkx as nx
import psutil
import pandas as pd
import pickle as pkl
import re
import time

### Network Generation [1:2]: The DataFrame

In [322]:
# Read the data
questions = pd.read_pickle('./pickle_dataframes/questions_with_sentiment.pkl')
answers = pd.concat([pd.read_pickle('./pickle_dataframes/answers_with_sentiment1.pkl'), 
                     pd.read_pickle('./pickle_dataframes/answers_with_sentiment2.pkl')]).reset_index(drop=True)
comments = pd.read_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

users = pd.read_pickle('./pickle_dataframes/users_with_all_attributes.pkl')

In [323]:
questions.OwnerUserId.unique().shape

(924,)

In [324]:
# Preparing a list of question IDs
parent_list = questions.Id.tolist()

# Identifying Comments Associated with Questions and Answers
comments_on_questions = comments[comments['PostId'].isin(questions['Id'])]
comments_on_answers = comments[comments['PostId'].isin(answers['Id'])]

In [325]:
# Aggregating commenters by the post they commented on
comments_on_questions_agg = comments_on_questions.groupby('PostId')['UserId'].apply(list).reset_index()
comments_on_answers_agg = comments_on_answers.groupby('PostId')['UserId'].apply(list).reset_index()
comments_on_questions_agg.head(1)

Unnamed: 0,PostId,UserId
0,1,"[28, 18, 8018]"


In [326]:
# Mapping Answer IDs to their corresponding Question IDs
answer_to_question_map = answers.set_index('Id')['ParentId'].to_dict()
comments_on_answers_agg['MappedPostId'] = comments_on_answers_agg['PostId'].map(lambda x: answer_to_question_map.get(x, None))

# Filtering out None values which have no corresponding question
comments_on_answers_agg = comments_on_answers_agg[comments_on_answers_agg['MappedPostId'].notnull()]

comments_on_answers_agg.head(1)

Unnamed: 0,PostId,UserId,MappedPostId
0,4,[7014],1


In [327]:
# Creating a unified DataFrame for comment data
comments_combined = pd.concat([
    comments_on_questions_agg.rename(columns={'PostId': 'QuestionId', 'UserId': 'CommentOnQuestionUserId_list'}),
    comments_on_answers_agg.rename(columns={'MappedPostId': 'QuestionId', 'UserId': 'CommentOnAnswersUserId_list'})
], ignore_index=True)

# Replacing NaN values with empty lists
comments_combined['CommentOnQuestionUserId_list'] = comments_combined['CommentOnQuestionUserId_list'].apply(lambda x: x if isinstance(x, list) else [])
comments_combined['CommentOnAnswersUserId_list'] = comments_combined['CommentOnAnswersUserId_list'].apply(lambda x: x if isinstance(x, list) else [])

# Grouping and combining lists
comments_combined = comments_combined.groupby('QuestionId').agg(
    CommentOnQuestionUserId_list=('CommentOnQuestionUserId_list', lambda x: sum(x, [])),
    CommentOnAnswersUserId_list=('CommentOnAnswersUserId_list', lambda x: sum(x, []))
).reset_index()
comments_combined.head(1)

Unnamed: 0,QuestionId,CommentOnQuestionUserId_list,CommentOnAnswersUserId_list
0,1,"[28, 18, 8018]","[7014, 9921]"


In [328]:
# Filtering answers that are related to the collected questions
df_int = answers[answers.ParentId.isin(parent_list)]
df_int.head(1)

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,AnswerTopic,Body_Processed,BodySentiment
0,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1,7,first past post voting tends result smaller nu...,0.9788


In [329]:
# Grouping answers by their parent question and aggregating answerer user IDs
df_subpost = df_int.groupby('ParentId').agg(
    answers_UserId_list=('OwnerUserId', lambda x: list(x))
).reset_index(drop=False)
df_subpost.head(1)

Unnamed: 0,ParentId,answers_UserId_list
0,1,"[26, 8, 4666]"


In [330]:
# Identifying original posters for each question
df_original_poster = questions[questions.Id.isin(df_int.ParentId.tolist())].copy()
df_original_poster = df_original_poster.groupby('Id').agg(
    original_poster_UserId=('OwnerUserId', lambda x: list(x))
)
df_original_poster.head(1)

Unnamed: 0_level_0,original_poster_UserId
Id,Unnamed: 1_level_1
1,[18]


In [331]:
# Merging to form a comprehensive DataFrame for graph construction
df_graph = pd.merge(
    left=df_original_poster,
    right=df_subpost,
    left_on='Id',
    right_on='ParentId'
)

# Cleaning up the 'original_poster' column
df_graph['original_poster_UserId'] = df_graph['original_poster_UserId'].apply(lambda x: x[0] if x else None)

# Integrating Comment Data with the Graph Data
df_graph = pd.merge(df_graph, comments_combined, left_on='ParentId', right_on='QuestionId', how='left')

In [332]:
df_graph['CommentOnQuestionUserId_list'] = df_graph['CommentOnQuestionUserId_list'].apply(lambda x: x if isinstance(x, list) else [])
df_graph['CommentOnAnswersUserId_list'] = df_graph['CommentOnAnswersUserId_list'].apply(lambda x: x if isinstance(x, list) else [])

print(df_graph.shape)
df_graph = df_graph[df_graph['original_poster_UserId'] != -1]
print(df_graph.shape)

(10476, 6)
(9669, 6)


#### If you want to see how to get each of the values in df_graph's columns expand below rows: 

In [333]:
# displaying the different columns of row 1 manually
df_graph.head(1)

Unnamed: 0,original_poster_UserId,ParentId,answers_UserId_list,QuestionId,CommentOnQuestionUserId_list,CommentOnAnswersUserId_list
0,18,1,"[26, 8, 4666]",1.0,"[28, 18, 8018]","[7014, 9921]"


In [334]:
# We look at question 1
# We see that it belongs to questions['OwnerUserId']==18 (original_poster_UserId==18)
questions[questions['Id']==1]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,CombinedText,Topic,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
0,1,1,-1,5,2012-12-04 21:40:29.743,42,8309,know situation could arise u one candidate win...,18,2019-06-29 09:18:38.430,disadvantage first past post electoral system,,3,3,disadvantage first past post electoral system ...,7,know situation could arise u one candidate win...,disadvantage first past post electoral system,0.886,-0.4215


In [335]:
# We look at the answers to question 1
# We see that the answers['OwnerUserId'] corresponds to the users in df_graph['answers_UserId_list'] == [26, 8, 4666]	
answers[answers['ParentId']==1]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,AnswerTopic,Body_Processed,BodySentiment
0,4,2,1,-1,2012-12-04 21:58:11.187,7,-1,<p>First-past-the-post voting tends to result ...,26,2012-12-04 21:58:11.187,Comment: N/A,Comment: N/A,-1,1,7,first past post voting tends result smaller nu...,0.9788
1,5,2,1,-1,2012-12-04 21:58:39.037,47,-1,<p>Simple plurality voting has very little in ...,8,2012-12-04 22:04:42.767,Comment: N/A,Comment: N/A,-1,1,7,simple plurality voting little favor election ...,0.9782
3077,10983,2,1,-1,2016-05-21 05:28:21.770,5,-1,<p>First-past-the-post systems are polarizing....,4666,2016-05-21 05:28:21.770,Comment: N/A,Comment: N/A,-1,0,7,first past post system polarizing advantage ta...,0.9669


In [336]:
# We look at the comments on the questions (CommentersOnQuestion)
# We see that the comments['UserId'] corresponds to the users in df_graph['CommentersOnQuestion'] == [28, 18, 8018]	
comments[comments['PostId']==1]

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,CommentTopic,Text_Processed,TextSentiment
0,1,1,9,Is it fair to inquire about the disadvantages ...,2012-12-04 22:00:00.933,28,7,fair inquire disadvantage without taking accou...,0.2732
1,3,1,3,"I could have reformulated the question, but at...",2012-12-04 22:02:37.737,18,7,could reformulated question least disadvantage...,0.3269
7646,36295,1,0,You may find this cat's explanation interestin...,2016-05-19 12:09:10.397,8018,7,may find cat explanation interesting http www ...,0.4019


In [337]:
# We look at the comments on the first/3 of the question answers
# We see that the comments['UserId'] corresponds to one of the user in df_graph['CommentersOnAnswers'] == [7014, 9921]	 
comments[comments['PostId']==4]

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,CommentTopic,Text_Processed,TextSentiment
95611,97897,4,2,The primary system used by America makes the p...,2017-12-12 04:26:11.517,7014,7,primary system used america make polarization ...,0.7906


### Network Generation [2:2]: The graph

**We have to consider how we connect** 
- original_poster_UserId to CommentOnAnswersUserId_list
- answers_UserId_list to CommentOnAnswersUserId_list

Do we connect both or only one of them?
- In `G1` we connect original_poster_UserId to CommentOnAnswersUserId_list
- In `G2` we connect answers_UserId_list to CommentOnAnswersUserId_list
- In `G3` we connect both

In [338]:
G1 = nx.Graph()
# - original_poster_UserId to CommentOnAnswersUserId_list

# Adding nodes and edges for original posters, answerers, and commenters
for idx, row in df_graph.iterrows():
    original_poster_UserId = row['original_poster_UserId']
    G1.add_node(original_poster_UserId)

    # Add edges from original poster to answerers
    for user in row['answers_UserId_list']:
        G1.add_edge(original_poster_UserId, user)

    # Add edges from original poster to commenters on the question
    if isinstance(row['CommentOnQuestionUserId_list'], list):
        for commenter in row['CommentOnQuestionUserId_list']:
            G1.add_edge(original_poster_UserId, commenter)

    # Add edges from original poster to commenters on the answers
    if isinstance(row['CommentOnAnswersUserId_list'], list):
        for commenter in row['CommentOnAnswersUserId_list']:
            G1.add_edge(original_poster_UserId, commenter)

print(f'len(G1.nodes(): {len(G1.nodes(data=True))}', '\t', G1.nodes(data=True))
print(f'len(G1.edges()): {len(G1.edges())}', '\t', G1.edges())

len(G1.nodes(): 1463 	 [(18, {}), (26, {}), (8, {}), (4666, {}), (28, {}), (8018, {}), (7014, {}), (9921, {}), (52, {}), (14237, {}), (115, {}), (4800, {}), (69, {}), (174, {}), (175, {}), (232, {}), (300, {}), (4533, {}), (9418, {}), (29, {}), (1654, {}), (12464, {}), (21163, {}), (7303, {}), (78, {}), (22860, {}), (91, {}), (130, {}), (345, {}), (-1, {}), (11019, {}), (5534, {}), (1584, {}), (103, {}), (149, {}), (50, {}), (7434, {}), (2130, {}), (9801, {}), (342, {}), (22936, {}), (7460, {}), (16582, {}), (22982, {}), (6837, {}), (2571, {}), (5787, {}), (101, {}), (57, {}), (1471, {}), (2623, {}), (2833, {}), (5279, {}), (17279, {}), (16724, {}), (4789, {}), (4767, {}), (7265, {}), (23294, {}), (23, {}), (6738, {}), (2127, {}), (22137, {}), (6890, {}), (6927, {}), (15897, {}), (303, {}), (2951, {}), (1483, {}), (8260, {}), (5285, {}), (2721, {}), (5511, {}), (4482, {}), (1470, {}), (8784, {}), (14700, {}), (1370, {}), (12853, {}), (15531, {}), (8647, {}), (2747, {}), (14788, {}), (1

In [339]:
G2 = nx.Graph()

# - answers_UserId_list to CommentOnAnswersUserId_list
# - original_poster_UserId to CommentOnAnswersUserId_list

# Adding nodes and edges for original posters, answerers, and commenters
for idx, row in df_graph.iterrows():
    original_poster_UserId = row['original_poster_UserId']
    G2.add_node(original_poster_UserId)

    # Add edges from original poster to answerers
    for user in row['answers_UserId_list']:
        G2.add_edge(original_poster_UserId, user)

    # Add edges from original poster to commenters on the question
    if isinstance(row['CommentOnQuestionUserId_list'], list):
        for commenter in row['CommentOnQuestionUserId_list']:
            G2.add_edge(original_poster_UserId, commenter)

    # New Code: Add edges between answerers and users who commented on their answers
    if isinstance(row['answers_UserId_list'], list) and isinstance(row['CommentOnAnswersUserId_list'], list):
        for answerer in row['answers_UserId_list']:
            for commenter in row['CommentOnAnswersUserId_list']:
                G2.add_edge(answerer, commenter)

print(f'len(G2.nodes(): {len(G2.nodes(data=True))}', '\t', G2.nodes(data=True))
print(f'len(G2.edges()): {len(G2.edges())}', '\t', G2.edges())

len(G2.nodes(): 1463 	 [(18, {}), (26, {}), (8, {}), (4666, {}), (28, {}), (8018, {}), (7014, {}), (9921, {}), (52, {}), (14237, {}), (115, {}), (4800, {}), (69, {}), (174, {}), (175, {}), (232, {}), (300, {}), (4533, {}), (9418, {}), (29, {}), (1654, {}), (12464, {}), (21163, {}), (7303, {}), (78, {}), (22860, {}), (91, {}), (130, {}), (345, {}), (-1, {}), (11019, {}), (5534, {}), (1584, {}), (103, {}), (149, {}), (50, {}), (7434, {}), (2130, {}), (9801, {}), (342, {}), (22936, {}), (7460, {}), (16582, {}), (22982, {}), (6837, {}), (2571, {}), (5787, {}), (101, {}), (57, {}), (1471, {}), (2623, {}), (2833, {}), (5279, {}), (17279, {}), (16724, {}), (4789, {}), (4767, {}), (7265, {}), (23294, {}), (23, {}), (6738, {}), (2127, {}), (22137, {}), (6890, {}), (6927, {}), (15897, {}), (303, {}), (2951, {}), (1483, {}), (8260, {}), (5285, {}), (2721, {}), (5511, {}), (4482, {}), (1470, {}), (8784, {}), (14700, {}), (1370, {}), (12853, {}), (15531, {}), (8647, {}), (2747, {}), (14788, {}), (1

In [340]:
G3 = nx.Graph()

# - answers_UserId_list to CommentOnAnswersUserId_list
# - original_poster_UserId to CommentOnAnswersUserId_list

# Adding nodes and edges for original posters, answerers, and commenters
for idx, row in df_graph.iterrows():
    original_poster_UserId = row['original_poster_UserId']
    G3.add_node(original_poster_UserId)

    # Add edges from original poster to answerers
    for user in row['answers_UserId_list']:
        G3.add_edge(original_poster_UserId, user)

    # Add edges from original poster to commenters on the question
    if isinstance(row['CommentOnQuestionUserId_list'], list):
        for commenter in row['CommentOnQuestionUserId_list']:
            G3.add_edge(original_poster_UserId, commenter)

    # Add edges from original poster to commenters on the answers
    if isinstance(row['CommentOnAnswersUserId_list'], list):
        for commenter in row['CommentOnAnswersUserId_list']:
            G3.add_edge(original_poster_UserId, commenter)

    # New Code: Add edges between answerers and users who commented on their answers
    if isinstance(row['answers_UserId_list'], list) and isinstance(row['CommentOnAnswersUserId_list'], list):
        for answerer in row['answers_UserId_list']:
            for commenter in row['CommentOnAnswersUserId_list']:
                G3.add_edge(answerer, commenter)

print(f'len(G3.nodes(): {len(G3.nodes(data=True))}', '\t', G3.nodes(data=True))
print(f'len(G3.edges()): {len(G3.edges())}', '\t', G3.edges())

len(G3.nodes(): 1463 	 [(18, {}), (26, {}), (8, {}), (4666, {}), (28, {}), (8018, {}), (7014, {}), (9921, {}), (52, {}), (14237, {}), (115, {}), (4800, {}), (69, {}), (174, {}), (175, {}), (232, {}), (300, {}), (4533, {}), (9418, {}), (29, {}), (1654, {}), (12464, {}), (21163, {}), (7303, {}), (78, {}), (22860, {}), (91, {}), (130, {}), (345, {}), (-1, {}), (11019, {}), (5534, {}), (1584, {}), (103, {}), (149, {}), (50, {}), (7434, {}), (2130, {}), (9801, {}), (342, {}), (22936, {}), (7460, {}), (16582, {}), (22982, {}), (6837, {}), (2571, {}), (5787, {}), (101, {}), (57, {}), (1471, {}), (2623, {}), (2833, {}), (5279, {}), (17279, {}), (16724, {}), (4789, {}), (4767, {}), (7265, {}), (23294, {}), (23, {}), (6738, {}), (2127, {}), (22137, {}), (6890, {}), (6927, {}), (15897, {}), (303, {}), (2951, {}), (1483, {}), (8260, {}), (5285, {}), (2721, {}), (5511, {}), (4482, {}), (1470, {}), (8784, {}), (14700, {}), (1370, {}), (12853, {}), (15531, {}), (8647, {}), (2747, {}), (14788, {}), (1

In [320]:
# Save Results

#nx.write_graphml(G1, 'graph1.graphml')
#nx.write_graphml(G2, 'graph2.graphml')

### Network Analysis


In [348]:
# Calculate different network statistics
avg_degree_centrality = sum(nx.degree_centrality(G1).values()) / len(G1)
avg_clustering_coefficient = nx.average_clustering(G1)
connected_components = list(nx.connected_components(G1))
avg_betweenness_centrality = sum(nx.betweenness_centrality(G1).values()) / len(G1)
communities = greedy_modularity_communities(G1)

# Analysis - You might print these or plot them using matplotlib or similar
print("Average Degree Centrality:", avg_degree_centrality)
print("Average Clustering Coefficient:", avg_clustering_coefficient)
print("Number of Connected Components:", len(connected_components))
print("Average Betweenness Centrality:", avg_betweenness_centrality)
print("Number of Communities detected:", len(communities))

Average Degree Centrality: 0.03243994827262136
Average Clustering Coefficient: 0.3731202489653536
Number of Connected Components: 1
Average Betweenness Centrality: 0.0007910656467275
Number of Communities detected: 12


In [349]:
# Calculate different network statistics
avg_degree_centrality = sum(nx.degree_centrality(G2).values()) / len(G2)
avg_clustering_coefficient = nx.average_clustering(G2)
connected_components = list(nx.connected_components(G2))
avg_betweenness_centrality = sum(nx.betweenness_centrality(G2).values()) / len(G2)
communities = greedy_modularity_communities(G2)

# Analysis - You might print these or plot them using matplotlib or similar
print("Average Degree Centrality:", avg_degree_centrality)
print("Average Clustering Coefficient:", avg_clustering_coefficient)
print("Number of Connected Components:", len(connected_components))
print("Average Betweenness Centrality:", avg_betweenness_centrality)
print("Number of Communities detected:", len(communities))

Average Degree Centrality: 0.06903061658623631
Average Clustering Coefficient: 0.5390484157854091
Number of Connected Components: 1
Average Betweenness Centrality: 0.0006547757426202162
Number of Communities detected: 12


In [350]:
# Calculate different network statistics
avg_degree_centrality = sum(nx.degree_centrality(G3).values()) / len(G3)
avg_clustering_coefficient = nx.average_clustering(G3)
connected_components = list(nx.connected_components(G3))
avg_betweenness_centrality = sum(nx.betweenness_centrality(G3).values()) / len(G3)
communities = greedy_modularity_communities(G3)

# Analysis - You might print these or plot them using matplotlib or similar
print("Average Degree Centrality:", avg_degree_centrality)
print("Average Clustering Coefficient:", avg_clustering_coefficient)
print("Number of Connected Components:", len(connected_components))
print("Average Betweenness Centrality:", avg_betweenness_centrality)
print("Number of Communities detected:", len(communities))

Average Degree Centrality: 0.07672239920781958
Average Clustering Coefficient: 0.5374046079320877
Number of Connected Components: 1
Average Betweenness Centrality: 0.0006442757066173029
Number of Communities detected: 16


> *"The "label shuffling technique" is incredibly useful. It may turn out to be a good tool to apply for your independent project. Keep it in mind."* - **Week8.ipynb**

Use the "label shuffling test" (Week 5 and 8) to test if the coast with the highest wikipedia page sentiment has a page sentiment that is significantly higher (5% confidence bound) than a randomly selected group of rappers of the same size.

**Prompt**: "During our lectures we worked with a different dataset. We did a permutation test. This is how they taught it to us: "
Compute the average node centrality for west-coast and east-coast rappers respectively. You are free to choose your favourite centrality measure or compare different ones. Which coast do the more central rappers have?
Create a randomized version of the full graph, where the association between nodes and coast is shuffled. Compute the average centrality of west-coast and east-coast rappers for this randomized graph.
Hint: To create this shuffling, simply give everyone a random coast assignment (but with the same number of west-coast and east-coast rappers as in the real data). The way I usually create random lables is by using either random.shuffle or numpy.random.permutation.
Repeat the shuffling process above 1000 times (at least) and plot two histograms, each histogram with the 1000 corresponding average values of centrality obtained for west-coast and east-coast rappers in the shuffeled netowrks.
Compare the value you have obtained in the real graph, with the randomized cases. Are rappers from one coast more central in the network (on average)?
Think about your results, what could be possible explanations for what you observe?
The shuffling procedure you just carried out is often called a permutation test. It is a super powerful tool that we use all the time. Once you start to think about it, you can answer many questions of the type "Is [property] higher/lower in this network than one might have expected by chance?" with permutation tests.
"

How can I perform this label reshuffling/permutation test on clusters based on user level features I found in my data?"

**Answer**
1. Initial Cluster Analysis
Cluster Your Data: Use a clustering algorithm (like k-means) to cluster your users based on the features you're interested in. Assign each user to a cluster.
Compute Initial Statistics: For each cluster, compute the average (or another statistic of interest) of a specific feature or set of features. These are your observed values.
2. Permutation Test Procedure
Shuffle Cluster Labels: Randomly shuffle the cluster labels assigned to the users, ensuring that the number of users in each cluster remains the same as in the original classification.
Recompute Statistics for Shuffled Data: For each shuffled configuration, recompute the same statistics as in your initial analysis for each cluster.
Repeat the Process: Perform this shuffling and recomputing process a large number of times (typically 1000 or more) to build a distribution of the statistic under the null hypothesis.
3. Analysis and Comparison
Create Histograms: For each cluster, create histograms of the computed statistics from the shuffled data.
Compare Observed Values with Distributions: Compare the initially observed values for each cluster with the distributions obtained from the shuffled data. If your observed value lies outside the bulk of the distribution for shuffled data, it suggests that the observed value is not simply due to random chance.
4. Statistical Significance
P-Value Calculation: For each cluster, you can calculate a p-value, which is the proportion of the shuffled datasets where the computed statistic was as extreme as the observed statistic. A small p-value indicates that the observed statistic is unusual under the null hypothesis of random distribution of features.
5. Interpretation
Draw Conclusions: Based on where your observed statistics fall in relation to the distributions from the shuffled data, draw conclusions about whether the features in each cluster are significantly different from what would be expected by chance.