In [None]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from networkx.algorithms.community import greedy_modularity_communities
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import community as community_louvain
import dask.dataframe as dd
import html
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pickle as pkl
import psutil
import re
import seaborn as sns
import time

### Network Generation [1:2]: The DataFrame

In [None]:
# Load the pickled question, answer and comment frames
questions = pd.read_pickle('./pickle_dataframes/questions_with_sentiment.pkl')

# The answers are stored in two pickle files; stack them under a fresh 0..n-1 index
answers = pd.concat(
    [
        pd.read_pickle(part_path)
        for part_path in (
            './pickle_dataframes/answers_with_sentiment1.pkl',
            './pickle_dataframes/answers_with_sentiment2.pkl',
        )
    ],
    ignore_index=True,
)
comments = pd.read_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

# The user table is read twice, so `users` and `users_with_all_attributes`
# are two independent DataFrames built from the same file
users = pd.read_pickle('./pickle_dataframes/users_with_all_attributes.pkl')
users_with_all_attributes = pd.read_pickle('./pickle_dataframes/users_with_all_attributes.pkl')

users_with_all_attributes.head(2)

In [None]:
# IDs of all questions in the data set
parent_list = questions['Id'].tolist()

# Split the comments by the kind of post they were left on:
# a comment belongs to a question (answer) when its PostId is a question (answer) Id
comments_on_questions = comments.loc[comments['PostId'].isin(questions['Id'])]
comments_on_answers = comments.loc[comments['PostId'].isin(answers['Id'])]

In [None]:
# One row per commented post, holding the list of UserIds that commented on it
comments_on_questions_agg = (
    comments_on_questions
    .groupby('PostId')['UserId']
    .apply(list)
    .reset_index()
)
comments_on_answers_agg = (
    comments_on_answers
    .groupby('PostId')['UserId']
    .apply(list)
    .reset_index()
)

In [None]:
# Lookup table: answer Id -> Id of the question it answers
answer_to_question_map = dict(zip(answers['Id'], answers['ParentId']))

# Translate every commented answer into its parent question (None when the answer is unknown)
comments_on_answers_agg['MappedPostId'] = comments_on_answers_agg['PostId'].map(answer_to_question_map.get)

# Drop answers for which no parent question could be resolved
comments_on_answers_agg = comments_on_answers_agg.loc[comments_on_answers_agg['MappedPostId'].notna()]

In [None]:
# Stack both comment sources into one frame keyed by QuestionId.
# Each source fills only one of the two list columns, so the other column is NaN after the concat.
# (The answer rows also keep their original 'PostId' column; it is dropped by the agg below.)
comments_combined = pd.concat([
    comments_on_questions_agg.rename(columns={'PostId': 'QuestionId', 'UserId': 'CommentOnQuestionUserId_list'}),
    comments_on_answers_agg.rename(columns={'MappedPostId': 'QuestionId', 'UserId': 'CommentOnAnswersUserId_list'})
], ignore_index=True)

# Replace the NaN placeholders with empty lists so every cell is a list
comments_combined['CommentOnQuestionUserId_list'] = comments_combined['CommentOnQuestionUserId_list'].apply(
    lambda value: value if isinstance(value, list) else []
)
comments_combined['CommentOnAnswersUserId_list'] = comments_combined['CommentOnAnswersUserId_list'].apply(
    lambda value: value if isinstance(value, list) else []
)

# Collapse to one row per question, concatenating the per-row lists in order.
# A flat comprehension is linear in the number of commenters, whereas sum(lists, [])
# re-copies the accumulated list on every addition (quadratic).
comments_combined = comments_combined.groupby('QuestionId').agg(
    CommentOnQuestionUserId_list=(
        'CommentOnQuestionUserId_list',
        lambda lists: [user_id for user_list in lists for user_id in user_list],
    ),
    CommentOnAnswersUserId_list=(
        'CommentOnAnswersUserId_list',
        lambda lists: [user_id for user_list in lists for user_id in user_list],
    ),
).reset_index()

In [None]:
# Keep only the answers whose parent question is part of the collected questions
df_int = answers.loc[answers['ParentId'].isin(parent_list)]

In [None]:
# One row per question (ParentId) with the list of users who answered it
df_subpost = (
    df_int
    .groupby('ParentId')
    .agg(answers_UserId_list=('OwnerUserId', list))
    .reset_index()
)

In [None]:
# For every question that received at least one answer, collect its owner(s) into a list.
# The result is indexed by question Id.
df_original_poster = (
    questions.loc[questions['Id'].isin(df_int['ParentId'])]
    .groupby('Id')
    .agg(original_poster_UserId=('OwnerUserId', list))
)

In [None]:
# Join each question's original poster (indexed by 'Id') with the list of its answerers
df_graph = df_original_poster.merge(df_subpost, left_on='Id', right_on='ParentId')

# The poster was aggregated into a list; keep its first element (None for an empty list)
df_graph['original_poster_UserId'] = df_graph['original_poster_UserId'].apply(
    lambda owners: owners[0] if owners else None
)

# Attach the commenter lists; the left join keeps questions that have no comments at all
df_graph = df_graph.merge(comments_combined, how='left', left_on='ParentId', right_on='QuestionId')

In [None]:
# Questions without comments have NaN in the list columns after the left join;
# normalise every non-list value to an empty list
df_graph['CommentOnQuestionUserId_list'] = df_graph['CommentOnQuestionUserId_list'].apply(
    lambda value: value if isinstance(value, list) else []
)
df_graph['CommentOnAnswersUserId_list'] = df_graph['CommentOnAnswersUserId_list'].apply(
    lambda value: value if isinstance(value, list) else []
)

# Discard questions whose original poster has the id -1
df_graph = df_graph.loc[df_graph['original_poster_UserId'].ne(-1)]

#### To see how each value in `df_graph`'s columns is derived, expand the cells below:

In [None]:
# displaying the different columns of row 1 manually
df_graph.head(1)

In [None]:
# We look at question 1
# We see that it belongs to questions['OwnerUserId']==18 (original_poster_UserId==18)
questions[questions['Id']==1]

In [None]:
# We look at the answers to question 1
# We see that the answers['OwnerUserId'] corresponds to the users in df_graph['answers_UserId_list'] == [26, 8, 4666]	
answers[answers['ParentId']==1]

In [None]:
# We look at the comments on the questions (CommentersOnQuestion)
# We see that comments['UserId'] corresponds to the users in df_graph['CommentOnQuestionUserId_list'] == [28, 18, 8018]
comments[comments['PostId']==1]

In [None]:
# We look at the comments on the first of the question's three answers (post 4)
# We see that comments['UserId'] corresponds to one of the users in df_graph['CommentOnAnswersUserId_list'] == [7014, 9921]
comments[comments['PostId']==4]

### Network Generation [2:2]: The graph

**We have to consider how we connect** 
- original_poster_UserId to CommentOnAnswersUserId_list
- answers_UserId_list to CommentOnAnswersUserId_list

Do we connect both or only one of them?
- In `G1` we connect original_poster_UserId to CommentOnAnswersUserId_list

In a more extensive study, we would've also defined `G2` and `G3`:
- In `G2` we connect answers_UserId_list to CommentOnAnswersUserId_list
- In `G3` we connect both

In [None]:
G1 = nx.Graph()
# G1 connects the original poster of each question to everyone who interacted with it:
#   - the users who answered the question
#   - the users who commented on the question
#   - the users who commented on one of its answers
# NOTE(review): a poster who answers or comments on their own question produces a self-loop
# (e.g. user 18 in the question-1 walkthrough) — confirm that this is intended.
# NOTE(review): only original posters with id -1 are filtered out of df_graph; answerers or
# commenters with id -1 (if any exist) would still become a node here — verify.

# Iterate the columns in lockstep instead of using iterrows(), which builds a Series per row
for original_poster_UserId, answerers, question_commenters, answer_commenters in zip(
    df_graph['original_poster_UserId'],
    df_graph['answers_UserId_list'],
    df_graph['CommentOnQuestionUserId_list'],
    df_graph['CommentOnAnswersUserId_list'],
):
    # Make sure the poster exists as a node even when nobody interacted with the question
    G1.add_node(original_poster_UserId)

    # Same insertion order as before: answerers, question commenters, answer commenters
    for participants in (answerers, question_commenters, answer_commenters):
        if isinstance(participants, list):  # skip NaN placeholders
            G1.add_edges_from((original_poster_UserId, user) for user in participants)

print(f'len(G1.nodes()): {G1.number_of_nodes()}')
print(f'len(G1.edges()): {G1.number_of_edges()}')

### Community Detection: Network

In [None]:
# Summary statistics of the interaction network G1
avg_degree_centrality = sum(nx.degree_centrality(G1).values()) / G1.number_of_nodes()
avg_clustering_coefficient = nx.average_clustering(G1)
connected_components = list(nx.connected_components(G1))
avg_betweenness_centrality = sum(nx.betweenness_centrality(G1).values()) / G1.number_of_nodes()
communities = greedy_modularity_communities(G1)

# Report the statistics
print(f"Average Degree Centrality: {avg_degree_centrality}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Number of Connected Components: {len(connected_components)}")
print(f"Average Betweenness Centrality: {avg_betweenness_centrality}")
print(f"Number of Communities detected: {len(communities)}")

In [None]:
# Detect communities with the Louvain method. The algorithm is randomised, so a fixed
# seed is passed to get the same partition on every Restart & Run All
# (42 matches the random_state used for the KMeans clusterings below).
partition = community_louvain.best_partition(G1, random_state=42)

# Store each node's community id in the 'community' node attribute
nx.set_node_attributes(G1, partition, name='community')

In [None]:
G1.nodes(data=True)

In [None]:
# Collect (node, community) pairs from the graph into a DataFrame
node_data = [
    {'UserId': user_id, 'Community': community_id}
    for user_id, community_id in G1.nodes(data='community')
]
community_df = pd.DataFrame(node_data)

# Size of every community
community_df['Community'].value_counts()

### Community Detection: K-Means Clustering

In [None]:
def _plot_elbow_and_silhouette(scaled_features, random_state, max_clusters=10):
    """Plot WCSS (elbow method) and silhouette scores for k = 1..max_clusters."""
    cluster_range = range(1, max_clusters + 1)
    wcss, silhouette_scores = [], []
    for k in cluster_range:
        kmeans_k = KMeans(n_clusters=k, random_state=random_state).fit(scaled_features)
        wcss.append(kmeans_k.inertia_)
        if k > 1:  # the silhouette score needs at least two clusters
            silhouette_scores.append(silhouette_score(scaled_features, kmeans_k.labels_))

    fig, axs = plt.subplots(1, 2, figsize=(8, 4))
    axs[0].plot(cluster_range, wcss)
    axs[0].set(title='Elbow Method', xlabel='Number of Clusters', ylabel='WCSS')
    axs[0].grid(True)

    axs[1].plot(cluster_range[1:], silhouette_scores)
    axs[1].set(title='Silhouette Analysis', xlabel='Number of Clusters', ylabel='Silhouette Score')
    axs[1].grid(True)

    plt.tight_layout()
    plt.show()


def _plot_feature_boxplots(df, clustering_features, clustering_name, max_cols=5):
    """Draw one boxplot per feature, grouped by cluster label, on a grid with at most max_cols columns."""
    num_features = len(clustering_features)
    num_rows = (num_features - 1) // max_cols + 1
    num_cols = min(num_features, max_cols)
    # squeeze=False always returns a 2-D array of axes, also for a single feature
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(4 * num_cols, 4 * num_rows), squeeze=False)
    axs = axs.flatten()
    for ax, feature in zip(axs, clustering_features):
        sns.boxplot(x=clustering_name, y=feature, data=df, ax=ax)
        # Label the y-axis with the feature itself: not every feature is a count
        ax.set(title=f'{feature} Distribution', xlabel='Cluster', ylabel=str(feature))
    # Hide the unused axes of the last grid row
    for ax in axs[num_features:]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()


def _plot_cluster_profiles(cluster_info, clustering_features, clustering_name, minmax):
    """Plot one line per cluster showing its (optionally min-max scaled) per-feature means."""
    if minmax:
        # Scale every '<feature>Mean' column to [0, 1] across clusters so that
        # features with different units share one axis
        mean_feature_names = [f'{feature}Mean' for feature in clustering_features]
        scaled = pd.DataFrame(
            MinMaxScaler().fit_transform(cluster_info[mean_feature_names]),
            columns=mean_feature_names,
            index=cluster_info.index,
        )
        scaled[clustering_name] = cluster_info[clustering_name]
        value_column = 'NormalizedValue'
        ylabel = "Per-Feature Normalised Value"
        title = "Per-Feature Normalised Values Across Clusters"
        melted = scaled.melt(id_vars=[clustering_name], var_name='Feature', value_name=value_column)
    else:
        value_column = 'Mean'
        ylabel = "Mean Value"
        title = "Mean Values of Features Across Clusters"
        melted = cluster_info.drop('Count', axis=1).melt(
            id_vars=[clustering_name], var_name='Feature', value_name=value_column
        )

    plt.figure(figsize=(10, 5))
    for cluster in melted[clustering_name].unique():
        data = melted[melted[clustering_name] == cluster]
        plt.plot(data['Feature'], data[value_column], label=f'Cluster {cluster}')
    plt.title(title)
    plt.xlabel("Feature")
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()


def custom_cluster(df, clustering_features, n_clusters, clustering_name, random_state=42, verbose=False, minmax=True):
    """Cluster the rows of `df` with KMeans and summarise the resulting clusters.

    NaNs in the selected features are replaced by 0 and the features are standardised
    before clustering. The cluster labels are written to `df[clustering_name]`,
    i.e. `df` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster; receives the new label column.
    clustering_features : list
        Names of the columns of `df` used as clustering features.
    n_clusters : int
        Number of KMeans clusters.
    clustering_name : str
        Name of the column in which the cluster labels are stored.
    random_state : int, default 42
        Seed passed to every KMeans fit.
    verbose : bool, default False
        If True, show the elbow/silhouette curves for k = 1..10, per-feature
        boxplots by cluster, and a line plot of the per-cluster feature means.
    minmax : bool, default True
        Only used when `verbose` is True: min-max scale each feature's cluster means
        for the line plot instead of plotting the raw means.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with the columns `clustering_name`, 'Count' and one
        '<feature>Mean' column per clustering feature (unscaled means).
    """
    # Impute NaNs and standardise the features
    features = df[clustering_features].fillna(0)
    scaled_features = StandardScaler().fit_transform(features)

    # Fit KMeans and store the labels on the caller's frame
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df[clustering_name] = kmeans.fit_predict(scaled_features)

    if verbose:
        _plot_elbow_and_silhouette(scaled_features, random_state)
        _plot_feature_boxplots(df, clustering_features, clustering_name)

    # Cluster sizes. rename_axis names the label column explicitly, so the merge below
    # does not depend on how the installed pandas version names the value_counts() index.
    cluster_counts = (
        df[clustering_name]
        .value_counts()
        .sort_index()
        .rename_axis(clustering_name)
        .reset_index(name='Count')
    )
    cluster_means = df.groupby(clustering_name)[clustering_features].mean().add_suffix('Mean')
    cluster_info = pd.merge(cluster_counts, cluster_means, on=clustering_name)

    if verbose:
        _plot_cluster_profiles(cluster_info, clustering_features, clustering_name, minmax)

    return cluster_info

In [None]:
def permutation_test_clustering(df, clustering_name, clustering_features, num_permutations=1000, verbose=False, random_state=42):
    """Permutation test of a clustering's silhouette score.

    The silhouette score of the labels stored in `df[clustering_name]` is compared
    with the scores obtained after randomly shuffling those labels across the rows
    (which keeps the cluster sizes unchanged).

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the features and the cluster label column.
    clustering_name : str
        Name of the column with the cluster labels.
    clustering_features : list
        Names of the feature columns; NaNs are replaced by 0 and the features are
        standardised, as in `custom_cluster`.
    num_permutations : int, default 1000
        Number of label shuffles.
    verbose : bool, default False
        If True, print the actual silhouette score and the p-value.
    random_state : int or None, default 42
        Seed for the label shuffling, so repeated runs give the same p-value.
        Pass None for a fresh, non-deterministic shuffle on every call.

    Returns
    -------
    tuple
        (actual_silhouette_score, p_value), where p_value is the share of shuffles
        scoring at least as high as the actual labels, with the +1 correction
        in numerator and denominator.
    """
    # Impute NaNs and scale features
    features = df[clustering_features].fillna(0)
    scaled_features = StandardScaler().fit_transform(features)

    # Silhouette score of the actual clustering
    actual_labels = df[clustering_name].values
    actual_silhouette_score = silhouette_score(scaled_features, actual_labels)

    # Null distribution: silhouette scores under shuffled labels.
    # A local generator keeps the test reproducible without touching NumPy's global RNG state.
    rng = np.random.default_rng(random_state)
    permuted_silhouette_scores = np.array([
        silhouette_score(scaled_features, rng.permutation(actual_labels))
        for _ in range(num_permutations)
    ])

    # The +1 terms count the observed labelling as one of the permutations,
    # so the p-value can never be exactly 0
    p_value = (np.sum(permuted_silhouette_scores >= actual_silhouette_score) + 1) / (num_permutations + 1)

    if verbose:
        print(f'Actual silhouette score: {actual_silhouette_score}')
        print(f'p-value: {p_value}')

    return actual_silhouette_score, p_value

### 1. Clustering by Activity Metrics

#### 1.1 Clustering by question count, answer count, and comment count

In [None]:
# Cluster the users into 4 groups on their activity counts;
# the labels are written to the 'ActivityCluster' column
clustering_name = 'ActivityCluster'
clustering_features = ['QuestionCount', 'AnswerCount', 'CommentCount']

custom_cluster(
    df=users_with_all_attributes,
    clustering_features=clustering_features,
    n_clusters=4,
    clustering_name=clustering_name,
    verbose=True,
)

In [None]:
permutation_test_clustering(users_with_all_attributes, clustering_name, clustering_features, verbose=True);

### 2. Clustering by Quality Metrics

#### 2.1 Clustering by reputation, average question score, average answer score, and average comment score

In [None]:
# Cluster the users into 4 groups on reputation and average post scores;
# the labels are written to the 'QualityCluster1' column
clustering_name = 'QualityCluster1'
clustering_features = ['Reputation', 'AvgQuestionScore', 'AvgAnswerScore', 'AvgCommentScore']

custom_cluster(
    df=users_with_all_attributes,
    clustering_features=clustering_features,
    n_clusters=4,
    clustering_name=clustering_name,
    verbose=True,
)

In [None]:
permutation_test_clustering(users_with_all_attributes, clustering_name, clustering_features, verbose=True);

#### 2.2 Clustering by accepted answer count and accepted answer fraction

In [None]:
# Cluster the users into 3 groups on their accepted-answer count and fraction;
# the labels are written to the 'QualityCluster2' column
clustering_name = 'QualityCluster2'
clustering_features = ['AcceptedAnswerCount', 'AcceptedAnswerFraction']

custom_cluster(
    df=users_with_all_attributes,
    clustering_features=clustering_features,
    n_clusters=3,
    clustering_name=clustering_name,
    verbose=True,
)

In [None]:
permutation_test_clustering(users_with_all_attributes, clustering_name, clustering_features, verbose=True);

### 3. Clustering by Sentiment Metrics

#### 3.1 Clustering by average question body sentiment, average question title sentiment, average answer sentiment, average comment sentiment

In [None]:
# Cluster the users into 4 groups on their average sentiment columns;
# the labels are written to the 'SentimentCluster' column
clustering_name = 'SentimentCluster'
clustering_features = [
    'AvgQuestionBodySentiment',
    'AvgQuestionTitleSentiment',
    'AvgAnswerSentiment',
    'AvgCommentSentiment',
]

custom_cluster(
    df=users_with_all_attributes,
    clustering_features=clustering_features,
    n_clusters=4,
    clustering_name=clustering_name,
    verbose=True,
)

In [None]:
permutation_test_clustering(users_with_all_attributes, clustering_name, clustering_features, verbose=True);

### 4. Clustering by Engagement Metrics

#### 4.1 Clustering by views, upvotes, and downvotes

In [None]:
# Cluster the users into 3 groups on views, upvotes and downvotes;
# the labels are written to the 'EngagementCluster' column
clustering_name = 'EngagementCluster'
clustering_features = ['Views', 'UpVotes', 'DownVotes']

custom_cluster(
    df=users_with_all_attributes,
    clustering_features=clustering_features,
    n_clusters=3,
    clustering_name=clustering_name,
    verbose=True,
)

In [None]:
permutation_test_clustering(users_with_all_attributes, clustering_name, clustering_features, verbose=True);

### 5. Clustering by Topic Engagement Metrics

#### 5.1 Clustering by topic engagement columns

In [None]:
# Cluster the users into 7 groups on the columns labelled 0..24;
# the labels are written to the 'TopicEngagementCluster' column.
# minmax=False plots the raw cluster means instead of per-feature normalised values.
clustering_name = 'TopicEngagementCluster'
clustering_features = list(range(25))

custom_cluster(
    df=users_with_all_attributes,
    clustering_features=clustering_features,
    n_clusters=7,
    clustering_name=clustering_name,
    verbose=True,
    minmax=False,
)

In [None]:
permutation_test_clustering(users_with_all_attributes, clustering_name, clustering_features, verbose=True);

#### Assigning Node Attributes

In [None]:
users_with_all_attributes.columns

In [None]:
# Columns exported as node attributes of G1.
# The two quality clusterings computed in this notebook are stored as
# 'QualityCluster1' and 'QualityCluster2', so those names are selected here.
# NOTE(review): 'MostEngagedTopic', 'EngagementSentimentCluster' and
# 'Engagement_MostEngaged_Cluster' are not created in this notebook — presumably they
# already exist in the pickled user table; confirm against users_with_all_attributes.columns.
UserAttributesForNetwork = users_with_all_attributes[[
    'Id',
    'MostEngagedTopic',
    'EngagementCluster',
    'ActivityCluster',
    'QualityCluster1',
    'QualityCluster2',
    'SentimentCluster',
    'EngagementSentimentCluster',
    'Engagement_MostEngaged_Cluster',
]]

In [None]:
# Build a nested dict {user Id: {attribute name: value}}: after set_index('Id') and the
# transpose, every column is one user, so to_dict() yields one inner dict per user
user_attrs = UserAttributesForNetwork.set_index('Id').T.to_dict()
# Show the first two entries as a sanity check
list(user_attrs.items())[:2]

In [None]:
# Iterate over the user attributes dictionary and add each to the corresponding node
for user_id, attrs in user_attrs.items():
    if user_id in G1.nodes:
        nx.set_node_attributes(G1, {user_id: attrs})

In [None]:
# Check attributes of a specific user node
print(G1.nodes[5])

In [None]:
# Save Network to Gephi
# nx.write_graphml(G1, './graphml/graph2.graphml')

### Results

#### Clusters vs. Communities

#### User Attribute Analysis

### Label Reshuffling

> *"The "label shuffling technique" is incredibly useful. It may turn out to be a good tool to apply for your independent project. Keep it in mind."* - **Week8.ipynb**

Use the "label shuffling test" (Week 5 and 8) to test if the coast with the highest wikipedia page sentiment has a page sentiment that is significantly higher (5% confidence bound) than a randomly selected group of rappers of the same size.

1. **Initial Cluster Analysis**
   - **Cluster your data:** Use a clustering algorithm (like k-means) to cluster your users based on the features you're interested in. Assign each user to a cluster.
   - **Compute initial statistics:** For each cluster, compute the average (or another statistic of interest) of a specific feature or set of features. These are your observed values.
2. **Permutation Test Procedure**
   - **Shuffle cluster labels:** Randomly shuffle the cluster labels assigned to the users, ensuring that the number of users in each cluster remains the same as in the original classification.
   - **Recompute statistics for shuffled data:** For each shuffled configuration, recompute the same statistics as in your initial analysis for each cluster.
   - **Repeat the process:** Perform this shuffling and recomputing process a large number of times (typically 1000 or more) to build a distribution of the statistic under the null hypothesis.
3. **Analysis and Comparison**
   - **Create histograms:** For each cluster, create histograms of the computed statistics from the shuffled data.
   - **Compare observed values with distributions:** Compare the initially observed values for each cluster with the distributions obtained from the shuffled data. If your observed value lies outside the bulk of the distribution for shuffled data, it suggests that the observed value is not simply due to random chance.
4. **Statistical Significance**
   - **P-value calculation:** For each cluster, you can calculate a p-value, which is the proportion of the shuffled datasets where the computed statistic was at least as extreme as the observed statistic. A small p-value indicates that the observed statistic is unusual under the null hypothesis of random distribution of features.
5. **Interpretation**
   - **Draw conclusions:** Based on where your observed statistics fall in relation to the distributions from the shuffled data, draw conclusions about whether the features in each cluster are significantly different from what would be expected by chance.