In [4]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import dask.dataframe as dd
import html
import psutil
import pandas as pd
import pickle as pkl
import re
import time



In [5]:
# Read the data
# Load and merge dataframes
comments = pd.concat([pd.read_pickle('./pickle_dataframes/comments1.pkl'),
                      pd.read_pickle('./pickle_dataframes/comments2.pkl')]).reset_index(drop=True)

posts = pd.concat([pd.read_pickle('./pickle_dataframes/posts1.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts2.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts3.pkl')]).reset_index(drop=True)

users = pd.read_pickle('./pickle_dataframes/users.pkl')

In [6]:
comments = comments.sample(frac=0.1, random_state=0)
posts = posts.sample(frac=0.1, random_state=0)

### Filter Users on Min. Activity

In [7]:
# Filter posts for questions and answers
posts_qa = posts[posts['PostTypeId'].isin([1, 2])]

# Calculate user activity counts
post_count = posts_qa[posts_qa['PostTypeId'] == 1].groupby('OwnerUserId').size().rename('PostCount')
answer_count = posts_qa[posts_qa['PostTypeId'] == 2].groupby('OwnerUserId').size().rename('AnswerCount')
comment_count = comments.groupby('UserId').size().rename('CommentCount')

In [8]:
# Calculate number of accepted answers per user
accepted_answers = set(posts_qa[(posts_qa['PostTypeId'] == 1) & (posts_qa['OwnerUserId'] > -1)]['AcceptedAnswerId'])
accepted_answers_count = posts_qa[posts_qa['Id'].isin(accepted_answers)].groupby('OwnerUserId').size().rename('AcceptedAnswerCount')

# Merge activity counts with user data
user_activity = users.merge(post_count, left_on='Id',  right_index=True, how='left') \
                     .merge(comment_count, left_on='Id', right_index=True, how='left') \
                     .merge(accepted_answers_count, left_on='Id', right_index=True, how='left') \
                     .merge(answer_count, left_on='Id', right_index=True, how='left') \
                     .fillna({'PostCount': 0, 'CommentCount': 0, 'AcceptedAnswerCount': 0, 'AnswerCount': 0})

In [9]:
# Identify and process active users
active_users = user_activity.assign(TotalActivity=lambda x: x['PostCount'] + x['CommentCount'] + x['AnswerCount'])
active_users = active_users[active_users['TotalActivity'] > 15]
active_user_ids = set(active_users['Id'])

In [10]:
active_users

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,PostCount,CommentCount,AcceptedAnswerCount,AnswerCount,TotalActivity
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,88.0,863.0,5.0,157.0,1108.0
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,2.0,28.0,1.0,8.0,38.0
96,101,9601,2012-12-05 05:03:38.487,2023-08-30 10:27:39.800,2195,2259,3113,4.0,43.0,0.0,7.0,54.0
109,115,91134,2012-12-05 11:40:42.477,2023-09-02 00:57:59.273,5329,5504,2715,18.0,458.0,3.0,67.0,543.0
123,130,47394,2012-12-05 21:07:06.017,2023-08-31 06:00:47.800,3958,2344,338,34.0,223.0,1.0,15.0,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...
34261,42320,7475,2022-03-03 00:24:15.180,2023-08-22 07:22:22.197,447,484,516,0.0,41.0,0.0,8.0,49.0
34626,42710,2048,2022-03-26 18:18:39.627,2023-08-25 03:17:14.093,103,62,104,0.0,17.0,0.0,4.0,21.0
34677,42766,3416,2022-03-30 12:20:39.217,2023-09-01 18:00:55.157,75,115,59,0.0,34.0,0.0,9.0,43.0
36033,44276,4246,2022-09-05 14:11:52.263,2023-09-03 01:01:26.820,153,326,201,4.0,14.0,0.0,3.0,21.0


In [11]:
# Filter posts and comments by active users
active_posts = posts_qa[posts_qa['OwnerUserId'].isin(active_user_ids)]
parent_posts = posts_qa[posts_qa['ParentId'].isin(active_posts['Id'])]
active_posts = pd.concat([active_posts, parent_posts]).drop_duplicates()
active_posts

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount
26602,42820,2,42791,-1,2019-07-09 16:34:06.980,0,-1,"<p>In addition to the other answers, it is pos...",13141,2019-07-10 08:09:41.230,Comment: N/A,Comment: N/A,-1,6
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1
28067,45617,1,-1,-19917,2019-09-17 10:49:05.153,2,234,<p>Noted social scientist Charles Murray has <...,-1,2019-09-17 13:15:58.513,Yang's UBI Plan: Does it Differ From Charles M...,<united-states><social-welfare>,1,1
35607,57753,2,57746,-1,2020-10-04 14:00:19.883,4,-1,<blockquote>&#xA;<p>In the event of a Presiden...,26455,2020-10-04 14:00:19.883,Comment: N/A,Comment: N/A,-1,0
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47366,75413,2,75355,-1,2022-09-13 06:02:50.137,-2,-1,<p>The premise of the question seems flawed. T...,44350,2022-09-13 06:02:50.137,Comment: N/A,Comment: N/A,-1,0
32630,52549,2,52537,-1,2020-04-07 12:01:20.917,53,-1,<p>Republicans prevented this change because a...,7597,2020-04-07 14:31:13.023,Comment: N/A,Comment: N/A,-1,1
23699,39158,2,904,-1,2019-03-01 22:47:31.477,-1,-1,<p>There are many political parties in Europe ...,17380,2019-03-01 22:47:31.477,Comment: N/A,Comment: N/A,-1,1
728,855,2,406,-1,2013-01-22 17:44:49.277,10,-1,"<p>It may yet be too early to address the ""moo...",175,2013-01-24 15:36:02.337,Comment: N/A,Comment: N/A,-1,6


In [12]:
# Calculate average scores
avg_answer_score = active_posts[active_posts['PostTypeId'] == 2].groupby('OwnerUserId')['Score'].mean().rename('AvgAnswerScore')
avg_post_score = active_posts[active_posts['PostTypeId'] == 1].groupby('OwnerUserId')['Score'].mean().rename('AvgPostScore')

In [13]:
active_users = active_users.merge(avg_answer_score, left_on='Id', right_index=True, how='left') \
                           .merge(avg_post_score, left_on='Id', right_index=True, how='left') \
                           .fillna({'AvgAnswerScore': 0, 'AvgPostScore': 0})

In [14]:
active_users['AcceptedAnswerFraction'] = active_users['AcceptedAnswerCount'] / active_users['AnswerCount']

In [15]:
comments

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
28476,56951,15807,0,@DavidGrinberg Yep (not a native speaker).,2017-02-21 20:19:02.350,1384
7773,15133,3448,0,"I think these can all be argued as valid, but ...",2014-07-10 00:26:10.203,-1
7744,15052,3436,0,This is a very simplistic answer. I think this...,2014-07-05 01:58:42.137,169
180896,346006,80011,1,@JanKanis This is why I didn't use the article...,2023-06-23 09:34:36.730,42309
117656,226354,57061,2,"BBC summary, in lieu of an answer: https://www...",2020-09-09 16:08:22.537,300
...,...,...,...,...,...,...
65821,127602,33533,0,@gerrit I think they're along the right lines....,2018-09-10 13:58:38.373,22332
156880,304320,73364,3,@user366312 Stack Politics is not a political ...,2022-05-29 17:27:54.767,41835
56624,110746,1828,1,Because many people will disagree with you and...,2018-04-15 03:48:09.943,5894
95011,179351,45804,0,I've returned the book to the library already....,2019-09-23 14:28:12.830,20953


In [16]:
# Filter and preprocess comments
active_comments = comments[comments['PostId'].isin(active_user_ids)].drop_duplicates()
active_comments

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
49227,98065,26824,0,"@Royal Canadian Bandit Counting of ""conventio...",2017-12-13 16:58:59.867,15671
21341,44242,12844,0,You should block-quote the quotes with citatio...,2016-10-30 03:05:40.207,9579
57567,112458,29927,0,@origimbo yes leftists supporting Arab votes i...,2018-04-27 19:58:21.623,7434
23995,48936,14026,3,I fundamentally think you have the right answe...,2016-12-18 03:48:40.807,3169
68849,132680,34520,1,There are still a small number of dissident Lo...,2018-10-18 12:40:30.833,21901
37207,74533,20514,0,@Relaxed surely there would be at least some n...,2017-06-29 18:58:09.743,7434
37208,74534,20514,0,@Relaxed Basically I'm trying to understand if...,2017-06-29 18:59:48.293,7434
45731,91446,25684,0,"Also, what's the source for the Wallonia separ...",2017-10-25 13:50:10.210,7732
87253,164768,42121,0,"@jpmc26 """"home invasion"""" is an extremely rare...",2019-06-13 14:52:11.533,7887
5865,12182,2579,0,"@Chad It's not about predicting the future, it...",2014-01-11 10:23:58.887,1953


### Preprocess text

In [17]:
# Modify preprocess_text function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>|[^a-zA-Z0-9]', ' ', text.lower())
    words = [WordNetLemmatizer().lemmatize(word) for word in text.split() if word not in stopwords.words('english')]
    text = ' '.join(words)

    return text

In [22]:
from tqdm import tqdm

tqdm.pandas()

active_comments['Text_Processed'] = ''
active_comments['Text_Processed'] = active_comments['Text'].progress_apply(lambda text: preprocess_text(text))

100%|██████████| 34/34 [00:00<00:00, 146.48it/s]


In [24]:
# Apply preprocessing
# Comments
# ddf_comments = dd.from_pandas(active_comments, npartitions=8)
# ddf_comments['Text_Processed'] = ddf_comments['Text'].map_partitions(lambda df: df.apply(preprocess_text))
# comments_processed = ddf_comments.compute()

In [47]:
# # Posts
# ddf_posts = dd.from_pandas(active_posts, npartitions=8)
# ddf_posts['Body_Processed'] = ddf_posts['Body'].map_partitions(lambda df: df.apply(preprocess_text))
# ddf_posts['Title_Processed'] = ddf_posts['Title'].map_partitions(lambda df: df.apply(preprocess_text))
# posts_processed = ddf_posts.compute()

In [28]:
from tqdm import tqdm

tqdm.pandas()

active_posts['Body_Processed'] = ''
active_posts['Body_Processed'] = active_posts['Body'].progress_apply(lambda text: preprocess_text(text))
active_posts['Title_Processed'] = ''
active_posts['Title_Processed'] = active_posts['Title'].progress_apply(lambda text: preprocess_text(text))

100%|██████████| 3286/3286 [02:08<00:00, 25.57it/s]
100%|██████████| 3286/3286 [00:02<00:00, 1201.06it/s]


In [29]:
active_posts

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed
26602,42820,2,42791,-1,2019-07-09 16:34:06.980,0,-1,"<p>In addition to the other answers, it is pos...",13141,2019-07-10 08:09:41.230,Comment: N/A,Comment: N/A,-1,6,addition answer possible looked happened uk de...,comment n
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1,formulated seems like odd question ask unless ...,comment n
28067,45617,1,-1,-19917,2019-09-17 10:49:05.153,2,234,<p>Noted social scientist Charles Murray has <...,-1,2019-09-17 13:15:58.513,Yang's UBI Plan: Does it Differ From Charles M...,<united-states><social-welfare>,1,1,noted social scientist charles murray long adv...,yang ubi plan differ charles murray
35607,57753,2,57746,-1,2020-10-04 14:00:19.883,4,-1,<blockquote>&#xA;<p>In the event of a Presiden...,26455,2020-10-04 14:00:19.883,Comment: N/A,Comment: N/A,-1,0,event presidential candidate death printing ba...,comment n
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0,politician often appear v called talking head ...,comment n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47366,75413,2,75355,-1,2022-09-13 06:02:50.137,-2,-1,<p>The premise of the question seems flawed. T...,44350,2022-09-13 06:02:50.137,Comment: N/A,Comment: N/A,-1,0,premise question seems flawed king expressing ...,comment n
32630,52549,2,52537,-1,2020-04-07 12:01:20.917,53,-1,<p>Republicans prevented this change because a...,7597,2020-04-07 14:31:13.023,Comment: N/A,Comment: N/A,-1,1,republican prevented change core beneficial el...,comment n
23699,39158,2,904,-1,2019-03-01 22:47:31.477,-1,-1,<p>There are many political parties in Europe ...,17380,2019-03-01 22:47:31.477,Comment: N/A,Comment: N/A,-1,1,many political party europe see amnesty intern...,comment n
728,855,2,406,-1,2013-01-22 17:44:49.277,10,-1,"<p>It may yet be too early to address the ""moo...",175,2013-01-24 15:36:02.337,Comment: N/A,Comment: N/A,-1,6,may yet early address moon base scenario direc...,comment n


### Sentiment Analysis Time

In [33]:
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/christianr/nltk_data...


True

In [34]:
# Initialize SentimentIntensityAnalyzer once
sia = SentimentIntensityAnalyzer()

# Function to apply sentiment analysis
def analyze_sentiment(text):
    # Check if the text is missing or NaN, return 0.0 in such cases
    if pd.isna(text):
        return 0.0
    # Ensure the text is encoded as a string
    text = str(text)
    return sia.polarity_scores(text)['compound']

In [49]:
# # Convert pandas DataFrame to Dask DataFrame
# comments_dask = dd.from_pandas(comments_processed, npartitions=8)  # Adjust npartitions based on available memory
# posts_dask = dd.from_pandas(posts_processed, npartitions=8)  # Adjust npartitions based on available memory

# # Apply sentiment analysis to comments and posts
# comments_dask['Sentiment'] = comments_dask['Text'].map(analyze_sentiment)
# posts_dask['BodySentiment'] = posts_dask['Body'].map(analyze_sentiment)
# posts_dask['TitleSentiment'] = posts_dask['Title'].map(analyze_sentiment)

# # Compute results with progress bar
# with ProgressBar():
#     comments_result = comments_dask.compute()
#     posts_result = posts_dask.compute()

[########################################] | 100% Completed | 120.05 ms
[########################################] | 100% Completed | 10.69 ss


In [35]:
active_comments['Sentiment'] = active_comments['Text'].progress_apply(lambda text: analyze_sentiment(text))
active_posts['BodySentiment'] = active_posts['Body'].progress_apply(lambda text: analyze_sentiment(text))
active_posts['TitleSentiment'] = active_posts['Title'].progress_apply(lambda text: analyze_sentiment(text))

100%|██████████| 34/34 [00:00<00:00, 1992.38it/s]
100%|██████████| 3286/3286 [00:07<00:00, 438.06it/s]
100%|██████████| 3286/3286 [00:00<00:00, 15945.19it/s]


In [42]:
comments_result = active_comments.copy()
posts_result = active_posts.copy()

In [40]:
posts_result.head(5)

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
26602,42820,2,42791,-1,2019-07-09 16:34:06.980,0,-1,"<p>In addition to the other answers, it is pos...",13141,2019-07-10 08:09:41.230,Comment: N/A,Comment: N/A,-1,6,addition answer possible looked happened uk de...,comment n,0.5343,0.0
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1,formulated seems like odd question ask unless ...,comment n,0.8555,0.0
28067,45617,1,-1,-19917,2019-09-17 10:49:05.153,2,234,<p>Noted social scientist Charles Murray has <...,-1,2019-09-17 13:15:58.513,Yang's UBI Plan: Does it Differ From Charles M...,<united-states><social-welfare>,1,1,noted social scientist charles murray long adv...,yang ubi plan differ charles murray,0.6249,0.0
35607,57753,2,57746,-1,2020-10-04 14:00:19.883,4,-1,<blockquote>&#xA;<p>In the event of a Presiden...,26455,2020-10-04 14:00:19.883,Comment: N/A,Comment: N/A,-1,0,event presidential candidate death printing ba...,comment n,0.9907,0.0
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0,politician often appear v called talking head ...,comment n,0.3612,0.0


In [53]:
posts_result[
    posts_result.ParentId.isin(posts_result.Id.tolist())
    ].head(5)

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1,formulated seems like odd question ask unless ...,comment n,0.8555,0.0
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0,politician often appear v called talking head ...,comment n,0.3612,0.0
21639,35467,2,35460,-1,2018-11-19 15:42:50.583,2,-1,<p>In simple terms it means being part of a la...,6095,2018-11-19 15:42:50.583,Comment: N/A,Comment: N/A,-1,12,simple term mean part larger democratic organi...,comment n,0.9612,0.0
41218,67954,2,67900,-1,2021-08-07 01:27:15.867,2,-1,<p>Your specific Q makes more sense focusing o...,33430,2021-08-07 01:27:15.867,Comment: N/A,Comment: N/A,-1,2,specific q make sense focusing news calling rt...,comment n,0.4472,0.0
22539,37667,2,37645,-1,2019-01-04 10:14:36.640,9,-1,<p>In the event of a no-deal crash out of the ...,6095,2019-01-04 10:14:36.640,Comment: N/A,Comment: N/A,-1,2,event deal crash eu france belgium netherlands...,comment n,0.1779,0.0


In [54]:
posts_result[posts_result.Id == 79027]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
50318,79027,1,-1,13493,2023-04-03 18:40:34.400,2,4053,"<p>Both political parties use <a href=""https:/...",6105,2023-04-06 14:31:19.757,"Why does the right seem to rely on ""communism""...",<united-states><parties><rhetoric>,5,1,political party use snarl word denigrate polic...,right seem rely communism snarl word left,-0.4049,0.0


In [95]:
parent_list = posts_result.Id.tolist()

In [96]:
parent_list = posts_result.Id.tolist()
df_int = posts_result[
    posts_result.ParentId.isin(parent_list)
    ]
df_int

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1,formulated seems like odd question ask unless ...,comment n,0.8555,0.0
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0,politician often appear v called talking head ...,comment n,0.3612,0.0
21639,35467,2,35460,-1,2018-11-19 15:42:50.583,2,-1,<p>In simple terms it means being part of a la...,6095,2018-11-19 15:42:50.583,Comment: N/A,Comment: N/A,-1,12,simple term mean part larger democratic organi...,comment n,0.9612,0.0
41218,67954,2,67900,-1,2021-08-07 01:27:15.867,2,-1,<p>Your specific Q makes more sense focusing o...,33430,2021-08-07 01:27:15.867,Comment: N/A,Comment: N/A,-1,2,specific q make sense focusing news calling rt...,comment n,0.4472,0.0
22539,37667,2,37645,-1,2019-01-04 10:14:36.640,9,-1,<p>In the event of a no-deal crash out of the ...,6095,2019-01-04 10:14:36.640,Comment: N/A,Comment: N/A,-1,2,event deal crash eu france belgium netherlands...,comment n,0.1779,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47366,75413,2,75355,-1,2022-09-13 06:02:50.137,-2,-1,<p>The premise of the question seems flawed. T...,44350,2022-09-13 06:02:50.137,Comment: N/A,Comment: N/A,-1,0,premise question seems flawed king expressing ...,comment n,0.2538,0.0
32630,52549,2,52537,-1,2020-04-07 12:01:20.917,53,-1,<p>Republicans prevented this change because a...,7597,2020-04-07 14:31:13.023,Comment: N/A,Comment: N/A,-1,1,republican prevented change core beneficial el...,comment n,0.9451,0.0
23699,39158,2,904,-1,2019-03-01 22:47:31.477,-1,-1,<p>There are many political parties in Europe ...,17380,2019-03-01 22:47:31.477,Comment: N/A,Comment: N/A,-1,1,many political party europe see amnesty intern...,comment n,-0.7016,0.0
728,855,2,406,-1,2013-01-22 17:44:49.277,10,-1,"<p>It may yet be too early to address the ""moo...",175,2013-01-24 15:36:02.337,Comment: N/A,Comment: N/A,-1,6,may yet early address moon base scenario direc...,comment n,0.8273,0.0


In [98]:
df_subpost = df_int.groupby('ParentId').agg(
    users_list=('OwnerUserId', lambda x: list(x))
).reset_index(drop=False)
df_subpost

Unnamed: 0,ParentId,users_list
0,33,"[115, 73]"
1,307,[115]
2,323,"[36659, 5224]"
3,406,"[193, 175]"
4,904,"[220, 17380]"
...,...,...
140,79055,[45750]
141,79918,[21949]
142,80260,[32479]
143,80384,"[17668, 42582, 46306]"


In [103]:
df_original_poster = posts_result[posts_result.Id.isin(df_int.ParentId.tolist())].copy()
df_original_poster = df_original_poster.groupby('Id').agg(
    original_poster=('OwnerUserId', lambda x: list(x))
)
df_original_poster

Unnamed: 0_level_0,original_poster
Id,Unnamed: 1_level_1
33,[-1]
307,[130]
323,[130]
406,[115]
904,[130]
...,...
79055,[16047]
79918,[22967]
80260,[28660]
80384,[21531]


In [111]:
df_graph = pd.merge(
    left=df_original_poster,
    right=df_subpost,
    left_on='Id',
    right_on='ParentId'
)

df_graph['original_poster'] = df_graph['original_poster'].apply(lambda x: x[0] if x else None)
df_graph

Unnamed: 0,original_poster,ParentId,users_list
0,-1,33,"[115, 73]"
1,130,307,[115]
2,130,323,"[36659, 5224]"
3,115,406,"[193, 175]"
4,130,904,"[220, 17380]"
...,...,...,...
140,16047,79055,[45750]
141,22967,79918,[21949]
142,28660,80260,[32479]
143,21531,80384,"[17668, 42582, 46306]"


In [113]:
import networkx as nx

G = nx.Graph()

G.add_nodes_from(df_graph['original_poster'].values)


In [114]:
G.nodes()

NodeView((-1, 130, 115, 193, 1654, 23, 5279, 2984, 5046, 2430, 270, 8808, 8647, 9016, 10373, 2130, 3169, 7434, 12027, 6738, 11278, 5894, 20263, 9638, 18373, 12531, 7476, 6116, 22967, 24065, 19165, 5534, 18862, 15671, 25684, 6890, 29927, 6105, 28994, 19301, 22118, 29035, 1953, 5571, 3164, 32479, 27084, 22860, 16047, 16957, 28067, 28660, 18367, 21531))

In [121]:
for idx, row in df_graph.iterrows():
    user_to = row['original_poster']
    

    users_from = row['users_list']
    if len(users_from) == 0:
        continue
    for user_from in users_from:
        G.add_edges_from([(user_to, user_from)])
    

In [123]:
G.nodes(data=True)

NodeDataView({-1: {}, 130: {}, 115: {}, 193: {}, 1654: {}, 23: {}, 5279: {}, 2984: {}, 5046: {}, 2430: {}, 270: {}, 8808: {}, 8647: {}, 9016: {}, 10373: {}, 2130: {}, 3169: {}, 7434: {}, 12027: {}, 6738: {}, 11278: {}, 5894: {}, 20263: {}, 9638: {}, 18373: {}, 12531: {}, 7476: {}, 6116: {}, 22967: {}, 24065: {}, 19165: {}, 5534: {}, 18862: {}, 15671: {}, 25684: {}, 6890: {}, 29927: {}, 6105: {}, 28994: {}, 19301: {}, 22118: {}, 29035: {}, 1953: {}, 5571: {}, 3164: {}, 32479: {}, 27084: {}, 22860: {}, 16047: {}, 16957: {}, 28067: {}, 28660: {}, 18367: {}, 21531: {}, 73: {}, 36659: {}, 5224: {}, 175: {}, 220: {}, 17380: {}, 15897: {}, 4330: {}, 3093: {}, 17668: {}, 6034: {}, 300: {}, 6259: {}, 7089: {}, 1827: {}, 10422: {}, 4533: {}, 6095: {}, 6659: {}, 9459: {}, 4666: {}, 7400: {}, 6927: {}, 1470: {}, 20728: {}, 10873: {}, 6598: {}, 169: {}, 11511: {}, 9801: {}, 13377: {}, 11429: {}, 11725: {}, 13893: {}, 10023: {}, 13296: {}, 2969: {}, 16581: {}, 13689: {}, 17151: {}, 2670: {}, 22936: 

In [124]:
G.edges()

EdgeView([(-1, 115), (-1, 73), (-1, 4330), (-1, 23), (-1, 2984), (-1, 10422), (-1, 4533), (-1, 6095), (-1, 4666), (-1, -1), (-1, 7400), (-1, 8647), (-1, 10873), (-1, 10373), (-1, 11511), (-1, 10023), (-1, 130), (-1, 2969), (-1, 13689), (-1, 17151), (-1, 7434), (-1, 14788), (-1, 5894), (-1, 18373), (-1, 19301), (-1, 2951), (-1, 23014), (-1, 26455), (-1, 6105), (-1, 28994), (-1, 22118), (-1, 11278), (-1, 2670), (130, 115), (130, 36659), (130, 5224), (130, 220), (130, 17380), (130, 15897), (130, 300), (130, 6927), (130, 1470), (130, 22936), (130, 20371), (130, 18942), (130, 17715), (130, 24082), (130, 9638), (130, 34662), (130, 28067), (115, 193), (115, 175), (115, 3093), (115, 2130), (115, 4666), (193, 2130), (1654, 2130), (5279, 17668), (5279, 6095), (5279, 2477), (5279, 6606), (2984, 6259), (5046, 6034), (5046, 13296), (2430, 7089), (270, 1827), (270, 6659), (8808, 9459), (9016, 20728), (10373, 6598), (10373, 169), (3169, 9801), (3169, 7434), (3169, 14237), (7434, 4666), (7434, 13377),

In [125]:
nx.write_graphml(G, 'graph.graphml')

In [43]:
comments_result.head(5)

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,Text_Processed,Sentiment
49227,98065,26824,0,"@Royal Canadian Bandit Counting of ""conventio...",2017-12-13 16:58:59.867,15671,royal canadian bandit counting conventional ba...,0.0
21341,44242,12844,0,You should block-quote the quotes with citatio...,2016-10-30 03:05:40.207,9579,block quote quote citation possible,0.0
57567,112458,29927,0,@origimbo yes leftists supporting Arab votes i...,2018-04-27 19:58:21.623,7434,origimbo yes leftist supporting arab vote isra...,0.6808
23995,48936,14026,3,I fundamentally think you have the right answe...,2016-12-18 03:48:40.807,3169,fundamentally think right answer downvote argu...,-0.7102
68849,132680,34520,1,There are still a small number of dissident Lo...,2018-10-18 12:40:30.833,21901,still small number dissident loyalist well,0.5423


In [45]:
active_users

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,PostCount,CommentCount,AcceptedAnswerCount,AnswerCount,TotalActivity,AvgAnswerScore,AvgPostScore,AcceptedAnswerFraction
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,88.0,863.0,5.0,157.0,1108.0,6.401274,8.886364,0.031847
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,2.0,28.0,1.0,8.0,38.0,7.125000,10.000000,0.125000
96,101,9601,2012-12-05 05:03:38.487,2023-08-30 10:27:39.800,2195,2259,3113,4.0,43.0,0.0,7.0,54.0,18.000000,18.000000,0.000000
109,115,91134,2012-12-05 11:40:42.477,2023-09-02 00:57:59.273,5329,5504,2715,18.0,458.0,3.0,67.0,543.0,8.880597,8.444444,0.044776
123,130,47394,2012-12-05 21:07:06.017,2023-08-31 06:00:47.800,3958,2344,338,34.0,223.0,1.0,15.0,272.0,20.266667,12.500000,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34261,42320,7475,2022-03-03 00:24:15.180,2023-08-22 07:22:22.197,447,484,516,0.0,41.0,0.0,8.0,49.0,0.750000,0.000000,0.000000
34626,42710,2048,2022-03-26 18:18:39.627,2023-08-25 03:17:14.093,103,62,104,0.0,17.0,0.0,4.0,21.0,20.500000,0.000000,0.000000
34677,42766,3416,2022-03-30 12:20:39.217,2023-09-01 18:00:55.157,75,115,59,0.0,34.0,0.0,9.0,43.0,8.555556,0.000000,0.000000
36033,44276,4246,2022-09-05 14:11:52.263,2023-09-03 01:01:26.820,153,326,201,4.0,14.0,0.0,3.0,21.0,7.666667,7.500000,0.000000


In [48]:
posts_result[posts_result.OwnerUserId.isin(active_users.Id)]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
26602,42820,2,42791,-1,2019-07-09 16:34:06.980,0,-1,"<p>In addition to the other answers, it is pos...",13141,2019-07-10 08:09:41.230,Comment: N/A,Comment: N/A,-1,6,addition answer possible looked happened uk de...,comment n,0.5343,0.0000
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1,formulated seems like odd question ask unless ...,comment n,0.8555,0.0000
28067,45617,1,-1,-19917,2019-09-17 10:49:05.153,2,234,<p>Noted social scientist Charles Murray has <...,-1,2019-09-17 13:15:58.513,Yang's UBI Plan: Does it Differ From Charles M...,<united-states><social-welfare>,1,1,noted social scientist charles murray long adv...,yang ubi plan differ charles murray,0.6249,0.0000
35607,57753,2,57746,-1,2020-10-04 14:00:19.883,4,-1,<blockquote>&#xA;<p>In the event of a Presiden...,26455,2020-10-04 14:00:19.883,Comment: N/A,Comment: N/A,-1,0,event presidential candidate death printing ba...,comment n,0.9907,0.0000
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0,politician often appear v called talking head ...,comment n,0.3612,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46510,74410,2,74399,-1,2022-07-25 16:31:54.583,20,-1,"<p>(Kyoto is <a href=""https://treaties.un.org/...",21531,2022-07-27 00:15:44.047,Comment: N/A,Comment: N/A,-1,28,kyoto superseded paris protocol fwiw china rus...,comment n,-0.4346,0.0000
2980,4737,1,-1,4748,2014-08-22 00:15:11.813,0,248,<p>SNAP decreased quite a bit nation wide. Wha...,-1,2014-08-24 06:00:59.557,What was the rationale behind cutting SNAP pay...,<united-states><social-welfare>,1,5,snap decreased quite bit nation wide rational ...,rationale behind cutting snap payment increase,0.3270,0.2869
25103,40915,2,40914,-1,2019-04-22 05:06:11.817,2,-1,<p>Apparently there were none; from the <a hre...,18373,2019-04-22 05:06:11.817,Comment: N/A,Comment: N/A,-1,0,apparently none official report referendum rec...,comment n,0.9371,0.0000
31945,50622,2,13817,-1,2020-03-02 20:02:27.517,-2,-1,<p>Ignoring the obvious reason for the militar...,20728,2020-03-02 20:02:27.517,Comment: N/A,Comment: N/A,-1,0,ignoring obvious reason military existence one...,comment n,-0.1796,0.0000


### Save Results

In [None]:
#comments_result.to_pickle('comments_result.pkl')
#posts_result.to_pickle('posts_result.pkl')

In [None]:
# Calculate average sentiment for answers
avg_answer_sentiment = comments_result.groupby('PostId')['Sentiment'].mean().rename('AvgAnswerSentiment')
active_user_answers = active_users.merge(avg_answer_sentiment, left_on='Id', right_on='PostId', how='left').fillna({'AvgAnswerSentiment': 0}) \
                                  .merge(df_topics, left_on='ParentId', right_on='Id', how='left').fillna({'Topic': 'None'})

In [None]:
def replace_sentiment(x):
    topic = x['Topic']
    if topic == 'None':
        return 0
    sentiment = x['AvgAnswerSentiment']
    return [sentiment if int(val) > 0 else 0 for val in topic]

active_user_answers['TopicSentiment'] = active_user_answers.apply(replace_sentiment, axis=1)


In [None]:
def calculate_topic_sentiment(group):
    # Extract the 'Topic' column as a list of lists
    transposed_topics_sentiment = group['TopicSentiment'].transpose()
    
    # Calculate the mean for each row
    mean_values = transposed_topics_sentiment.apply(lambda x: pd.to_numeric(x, errors='coerce')).mean(axis=0)
    
    return mean_values.tolist()

In [None]:
# Get average answer sentiment for each user
user_avg_answer_sentiment = active_user_answers.groupby('OwnerUserId_x')['AvgAnswerSentiment'].mean().rename('AnswerSentiment')
user_answers_drop_no_sent = active_user_answers.loc[active_user_answers['TopicSentiment'] != 0]
user_avg_answer_sentiment_topic = active_user_answers[active_user_answers['TopicSentiment'] != 0].groupby('OwnerUserId_x').apply(calculate_topic_sentiment).rename('AvgTopicSentiment')

active_users = active_users.merge(user_avg_answer_sentiment, left_on='Id', right_on='OwnerUserId_x', how='left').fillna({'AnswerSentiment': 0})
active_users = active_users.merge(user_avg_answer_sentiment_topic, left_on='Id', right_on='OwnerUserId_x', how='left').fillna({'AvgTopicSentiment': 0})
