In [1]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import dask.dataframe as dd
import html
import psutil
import pandas as pd
import pickle as pkl
import re
import time

In [2]:
# Read the data
# Load and merge dataframes
comments = pd.concat([pd.read_pickle('./pickle_dataframes/comments1.pkl'),
                      pd.read_pickle('./pickle_dataframes/comments2.pkl')]).reset_index(drop=True)

posts = pd.concat([pd.read_pickle('./pickle_dataframes/posts1.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts2.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts3.pkl')]).reset_index(drop=True)

users = pd.read_pickle('./pickle_dataframes/users.pkl')

In [3]:
comments = comments.sample(frac=0.1, random_state=0)
posts = posts.sample(frac=0.1, random_state=0)

### Filter Users on Min. Activity

In [4]:
# Filter posts for questions and answers
posts_qa = posts[posts['PostTypeId'].isin([1, 2])]

# Calculate user activity counts
post_count = posts_qa[posts_qa['PostTypeId'] == 1].groupby('OwnerUserId').size().rename('PostCount')
answer_count = posts_qa[posts_qa['PostTypeId'] == 2].groupby('OwnerUserId').size().rename('AnswerCount')
comment_count = comments.groupby('UserId').size().rename('CommentCount')

In [5]:
# Calculate number of accepted answers per user
accepted_answers = set(posts_qa[(posts_qa['PostTypeId'] == 1) & (posts_qa['OwnerUserId'] > -1)]['AcceptedAnswerId'])
accepted_answers_count = posts_qa[posts_qa['Id'].isin(accepted_answers)].groupby('OwnerUserId').size().rename('AcceptedAnswerCount')

# Merge activity counts with user data
user_activity = users.merge(post_count, left_on='Id',  right_index=True, how='left') \
                     .merge(comment_count, left_on='Id', right_index=True, how='left') \
                     .merge(accepted_answers_count, left_on='Id', right_index=True, how='left') \
                     .merge(answer_count, left_on='Id', right_index=True, how='left') \
                     .fillna({'PostCount': 0, 'CommentCount': 0, 'AcceptedAnswerCount': 0, 'AnswerCount': 0})

In [6]:
# Identify and process active users
active_users = user_activity.assign(TotalActivity=lambda x: x['PostCount'] + x['CommentCount'] + x['AnswerCount'])
active_users = active_users[active_users['TotalActivity'] > 15]
active_user_ids = set(active_users['Id'])

In [7]:
active_users

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,PostCount,CommentCount,AcceptedAnswerCount,AnswerCount,TotalActivity
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,88.0,863.0,5.0,157.0,1108.0
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,2.0,28.0,1.0,8.0,38.0
96,101,9601,2012-12-05 05:03:38.487,2023-08-30 10:27:39.800,2195,2259,3113,4.0,43.0,0.0,7.0,54.0
109,115,91134,2012-12-05 11:40:42.477,2023-09-02 00:57:59.273,5329,5504,2715,18.0,458.0,3.0,67.0,543.0
123,130,47394,2012-12-05 21:07:06.017,2023-08-31 06:00:47.800,3958,2344,338,34.0,223.0,1.0,15.0,272.0
...,...,...,...,...,...,...,...,...,...,...,...,...
34261,42320,7475,2022-03-03 00:24:15.180,2023-08-22 07:22:22.197,447,484,516,0.0,41.0,0.0,8.0,49.0
34626,42710,2048,2022-03-26 18:18:39.627,2023-08-25 03:17:14.093,103,62,104,0.0,17.0,0.0,4.0,21.0
34677,42766,3416,2022-03-30 12:20:39.217,2023-09-01 18:00:55.157,75,115,59,0.0,34.0,0.0,9.0,43.0
36033,44276,4246,2022-09-05 14:11:52.263,2023-09-03 01:01:26.820,153,326,201,4.0,14.0,0.0,3.0,21.0


In [8]:
# Filter posts and comments by active users
active_posts = posts_qa[posts_qa['OwnerUserId'].isin(active_user_ids)]
parent_posts = posts_qa[posts_qa['ParentId'].isin(active_posts['Id'])]
active_posts = pd.concat([active_posts, parent_posts]).drop_duplicates()
active_posts

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount
26602,42820,2,42791,-1,2019-07-09 16:34:06.980,0,-1,"<p>In addition to the other answers, it is pos...",13141,2019-07-10 08:09:41.230,Comment: N/A,Comment: N/A,-1,6
50338,79048,2,79027,-1,2023-04-04 23:04:18.057,9,-1,<p>As formulated this seems like an odd questi...,21531,2023-04-05 16:36:06.123,Comment: N/A,Comment: N/A,-1,1
28067,45617,1,-1,-19917,2019-09-17 10:49:05.153,2,234,<p>Noted social scientist Charles Murray has <...,-1,2019-09-17 13:15:58.513,Yang's UBI Plan: Does it Differ From Charles M...,<united-states><social-welfare>,1,1
35607,57753,2,57746,-1,2020-10-04 14:00:19.883,4,-1,<blockquote>&#xA;<p>In the event of a Presiden...,26455,2020-10-04 14:00:19.883,Comment: N/A,Comment: N/A,-1,0
24743,40396,2,40379,-1,2019-04-09 09:17:20.213,4,-1,<p>Politicians often appear on T.V. as so-call...,8554,2019-04-09 09:17:20.213,Comment: N/A,Comment: N/A,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47366,75413,2,75355,-1,2022-09-13 06:02:50.137,-2,-1,<p>The premise of the question seems flawed. T...,44350,2022-09-13 06:02:50.137,Comment: N/A,Comment: N/A,-1,0
32630,52549,2,52537,-1,2020-04-07 12:01:20.917,53,-1,<p>Republicans prevented this change because a...,7597,2020-04-07 14:31:13.023,Comment: N/A,Comment: N/A,-1,1
23699,39158,2,904,-1,2019-03-01 22:47:31.477,-1,-1,<p>There are many political parties in Europe ...,17380,2019-03-01 22:47:31.477,Comment: N/A,Comment: N/A,-1,1
728,855,2,406,-1,2013-01-22 17:44:49.277,10,-1,"<p>It may yet be too early to address the ""moo...",175,2013-01-24 15:36:02.337,Comment: N/A,Comment: N/A,-1,6


In [9]:
# Calculate average scores
avg_answer_score = active_posts[active_posts['PostTypeId'] == 2].groupby('OwnerUserId')['Score'].mean().rename('AvgAnswerScore')
avg_post_score = active_posts[active_posts['PostTypeId'] == 1].groupby('OwnerUserId')['Score'].mean().rename('AvgPostScore')

In [10]:
active_users = active_users.merge(avg_answer_score, left_on='Id', right_index=True, how='left') \
                           .merge(avg_post_score, left_on='Id', right_index=True, how='left') \
                           .fillna({'AvgAnswerScore': 0, 'AvgPostScore': 0})

In [11]:
active_users['AcceptedAnswerFraction'] = active_users['AcceptedAnswerCount'] / active_users['AnswerCount']

In [12]:
comments

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
28476,56951,15807,0,@DavidGrinberg Yep (not a native speaker).,2017-02-21 20:19:02.350,1384
7773,15133,3448,0,"I think these can all be argued as valid, but ...",2014-07-10 00:26:10.203,-1
7744,15052,3436,0,This is a very simplistic answer. I think this...,2014-07-05 01:58:42.137,169
180896,346006,80011,1,@JanKanis This is why I didn't use the article...,2023-06-23 09:34:36.730,42309
117656,226354,57061,2,"BBC summary, in lieu of an answer: https://www...",2020-09-09 16:08:22.537,300
...,...,...,...,...,...,...
65821,127602,33533,0,@gerrit I think they're along the right lines....,2018-09-10 13:58:38.373,22332
156880,304320,73364,3,@user366312 Stack Politics is not a political ...,2022-05-29 17:27:54.767,41835
56624,110746,1828,1,Because many people will disagree with you and...,2018-04-15 03:48:09.943,5894
95011,179351,45804,0,I've returned the book to the library already....,2019-09-23 14:28:12.830,20953


In [13]:
# Filter and preprocess comments
active_comments = comments[comments['PostId'].isin(active_user_ids)].drop_duplicates()
active_comments

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId
49227,98065,26824,0,"@Royal Canadian Bandit Counting of ""conventio...",2017-12-13 16:58:59.867,15671
21341,44242,12844,0,You should block-quote the quotes with citatio...,2016-10-30 03:05:40.207,9579
57567,112458,29927,0,@origimbo yes leftists supporting Arab votes i...,2018-04-27 19:58:21.623,7434
23995,48936,14026,3,I fundamentally think you have the right answe...,2016-12-18 03:48:40.807,3169
68849,132680,34520,1,There are still a small number of dissident Lo...,2018-10-18 12:40:30.833,21901
37207,74533,20514,0,@Relaxed surely there would be at least some n...,2017-06-29 18:58:09.743,7434
37208,74534,20514,0,@Relaxed Basically I'm trying to understand if...,2017-06-29 18:59:48.293,7434
45731,91446,25684,0,"Also, what's the source for the Wallonia separ...",2017-10-25 13:50:10.210,7732
87253,164768,42121,0,"@jpmc26 """"home invasion"""" is an extremely rare...",2019-06-13 14:52:11.533,7887
5865,12182,2579,0,"@Chad It's not about predicting the future, it...",2014-01-11 10:23:58.887,1953


### Preprocess text

In [14]:
# Modify preprocess_text function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>|[^a-zA-Z0-9]', ' ', text.lower())
    words = [WordNetLemmatizer().lemmatize(word) for word in text.split() if word not in stopwords.words('english')]
    text = ' '.join(words)

    return text

In [15]:
# Apply preprocessing
# Comments
ddf_comments = dd.from_pandas(active_comments, npartitions=8)
ddf_comments['Text_Processed'] = ddf_comments['Text'].map_partitions(lambda df: df.apply(preprocess_text))
comments_processed = ddf_comments.compute()

In [16]:
# Posts
ddf_posts = dd.from_pandas(active_posts, npartitions=8)
ddf_posts['Body_Processed'] = ddf_posts['Body'].map_partitions(lambda df: df.apply(preprocess_text))
ddf_posts['Title_Processed'] = ddf_posts['Title'].map_partitions(lambda df: df.apply(preprocess_text))
posts_processed = ddf_posts.compute()

### Sentiment Analysis Time

In [17]:
# Initialize SentimentIntensityAnalyzer once
sia = SentimentIntensityAnalyzer()

# Function to apply sentiment analysis
def analyze_sentiment(text):
    # Check if the text is missing or NaN, return 0.0 in such cases
    if pd.isna(text):
        return 0.0
    # Ensure the text is encoded as a string
    text = str(text)
    return sia.polarity_scores(text)['compound']

In [18]:
# Convert pandas DataFrame to Dask DataFrame
comments_dask = dd.from_pandas(comments_processed, npartitions=8)  # Adjust npartitions based on available memory
posts_dask = dd.from_pandas(posts_processed, npartitions=8)  # Adjust npartitions based on available memory

# Apply sentiment analysis to comments and posts
comments_dask['Sentiment'] = comments_dask['Text'].map(analyze_sentiment)
posts_dask['BodySentiment'] = posts_dask['Body'].map(analyze_sentiment)
posts_dask['TitleSentiment'] = posts_dask['Title'].map(analyze_sentiment)

# Compute results with progress bar
with ProgressBar():
    comments_result = comments_dask.compute()
    posts_result = posts_dask.compute()

[########################################] | 100% Completed | 106.65 ms
[########################################] | 100% Completed | 5.28 sms


In [19]:
posts_result

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Body_Processed,Title_Processed,BodySentiment,TitleSentiment
24,33,1,-1,44,2012-12-04 23:10:02.983,31,18830,"<p>As of 2012, the United States currently has...",-1,2019-02-19 16:28:12.673,Who was the last US President to oversee a red...,<united-states><president><economy><debt><policy>,4,0,2012 united state currently 16 trillion debt l...,last u president oversee reduction debt,-0.1531,-0.3612
28,38,2,33,-1,2012-12-04 23:35:27.170,6,-1,<p>The last time the United States was debt fr...,73,2012-12-04 23:35:27.170,Comment: N/A,Comment: N/A,-1,3,last time united state debt free andrew jackso...,comment n,-0.5106,0.0000
89,103,2,7,-1,2012-12-05 12:45:08.997,38,-1,"<p>To quote from my own <a href=""https://histo...",115,2017-06-22 16:58:06.440,Comment: N/A,Comment: N/A,-1,8,quote answer history se term left right object...,comment n,0.9438,0.0000
105,120,2,33,-1,2012-12-05 17:40:10.617,20,-1,"<p>To add to Lennart's already great answer, t...",115,2012-12-05 19:53:13.750,Comment: N/A,Comment: N/A,-1,9,add lennart already great answer fifth correct...,comment n,0.7003,0.0000
151,182,2,174,-1,2012-12-06 11:49:29.033,36,-1,"<p>The monarch of the United Kingdom is, as ar...",101,2012-12-06 13:06:32.973,Comment: N/A,Comment: N/A,-1,3,monarch united kingdom contemporary monarchy h...,comment n,0.7184,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52002,81023,1,-1,-1,2023-08-26 15:59:04.447,6,753,"<p><a href=""https://www.businessinsider.in/pol...",28660,2023-08-31 15:46:07.943,Why does China consider capital control necess...,<economy><china><finance>,3,5,http www businessinsider policy economy news 3...,china consider capital control necessary indep...,0.9643,0.0000
52027,81052,1,-1,-1,2023-08-28 21:36:04.270,0,48,"<p><a href=""https://en.wikipedia.org/wiki/New_...",28660,2023-08-28 21:36:04.270,What are operations that the New Development B...,<russian-federation><brics>,0,0,http en wikipedia org wiki new development ban...,operation new development bank carry russian u...,-0.9396,-0.5994
52072,81104,1,-1,15569,2023-09-01 10:24:50.747,6,1730,<p>I saw on Wikipedia when I was looking at th...,29035,2023-09-01 13:44:58.340,What if no Republican candidate gets a majorit...,<united-states><republican-party><republican-p...,2,0,saw wikipedia looking 2024 republican party pr...,republican candidate get majority delegate pre...,0.6767,-0.2960
52088,81121,1,-1,-1,2023-09-03 00:41:14.053,-1,19,<p>Why won't American cities use sovereign wea...,28660,2023-09-03 00:41:14.053,Why won't American cities use sovereign wealth...,<united-states><law><finance>,0,1,american city use sovereign wealth fund fund c...,american city use sovereign wealth fund fund city,-0.8116,0.4939


### Aggregate per OwnerUserId

### Save Results

In [None]:
#comments_result.to_pickle('comments_result.pkl')
#posts_result.to_pickle('posts_result.pkl')

In [None]:
# Calculate average sentiment for answers
avg_answer_sentiment = comments_result.groupby('PostId')['Sentiment'].mean().rename('AvgAnswerSentiment')
active_user_answers = active_users.merge(avg_answer_sentiment, left_on='Id', right_on='PostId', how='left').fillna({'AvgAnswerSentiment': 0}) \
                                  .merge(df_topics, left_on='ParentId', right_on='Id', how='left').fillna({'Topic': 'None'})

In [None]:
def replace_sentiment(x):
    topic = x['Topic']
    if topic == 'None':
        return 0
    sentiment = x['AvgAnswerSentiment']
    return [sentiment if int(val) > 0 else 0 for val in topic]

active_user_answers['TopicSentiment'] = active_user_answers.apply(replace_sentiment, axis=1)


In [None]:
def calculate_topic_sentiment(group):
    # Extract the 'Topic' column as a list of lists
    transposed_topics_sentiment = group['TopicSentiment'].transpose()
    
    # Calculate the mean for each row
    mean_values = transposed_topics_sentiment.apply(lambda x: pd.to_numeric(x, errors='coerce')).mean(axis=0)
    
    return mean_values.tolist()

In [None]:
# Get average answer sentiment for each user
user_avg_answer_sentiment = active_user_answers.groupby('OwnerUserId_x')['AvgAnswerSentiment'].mean().rename('AnswerSentiment')
user_answers_drop_no_sent = active_user_answers.loc[active_user_answers['TopicSentiment'] != 0]
user_avg_answer_sentiment_topic = active_user_answers[active_user_answers['TopicSentiment'] != 0].groupby('OwnerUserId_x').apply(calculate_topic_sentiment).rename('AvgTopicSentiment')

active_users = active_users.merge(user_avg_answer_sentiment, left_on='Id', right_on='OwnerUserId_x', how='left').fillna({'AnswerSentiment': 0})
active_users = active_users.merge(user_avg_answer_sentiment_topic, left_on='Id', right_on='OwnerUserId_x', how='left').fillna({'AvgTopicSentiment': 0})
