In [1]:
import html
import pickle as pkl
import re
import time

import dask.dataframe as dd
import nltk
import numpy as np
import pandas as pd
import psutil
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
# Load the raw inputs from the local pickle directory.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.

# The posts table is stored in three files; concatenate them and rebuild a clean 0..n-1 index.
posts = pd.concat(
    [
        pd.read_pickle(shard_path)
        for shard_path in (
            './pickle_dataframes/posts1.pkl',
            './pickle_dataframes/posts2.pkl',
            './pickle_dataframes/posts3.pkl',
        )
    ]
).reset_index(drop=True)

# Questions, answers and comments come from the "*_with_topics" pickles.
questions = pd.read_pickle('./pickle_dataframes/questions_with_topics.pkl')
answers = pd.read_pickle('./pickle_dataframes/answers_with_topics.pkl')
comments = pd.read_pickle('./pickle_dataframes/comments_with_topics.pkl')

# One row per user.
users = pd.read_pickle('./pickle_dataframes/users.pkl')

In [117]:
#comments = comments.sample(frac=0.1, random_state=0)
#posts = posts.sample(frac=0.1, random_state=0)

In [118]:
# Report the number of rows in each table, one table per output line.
print(
    f"Users: {users.shape[0]}",
    f"Questions: {questions.shape[0]}",
    f"Answers: {answers.shape[0]}",
    f"Comments: {comments.shape[0]}",
    sep="\n",
)

Users: 38788
Questions: 16002
Answers: 36090
Comments: 184620


### Filter Users on Min. Activity

In [9]:
# Count the posts and comments attributed to the placeholder user id -1
# (missing/deleted accounts): posts first, then comments.
print(
    len(posts.loc[posts['OwnerUserId'] == -1]),
    len(comments.loc[comments['UserId'] == -1]),
    sep="\n",
)

2201
8879


In [10]:
# Per-user contribution counts, indexed by user id.
# A user with no contributions of a given kind is absent from that Series.
question_count = (
    questions
    .groupby('OwnerUserId')
    .size()
    .rename('QuestionCount')
)
answer_count = (
    answers
    .groupby('OwnerUserId')
    .size()
    .rename('AnswerCount')
)
comment_count = (
    comments
    .groupby('UserId')
    .size()
    .rename('CommentCount')
)

In [11]:
# Minimum number of contributions (questions + answers + comments) a user needs
# to be kept as "active". Tune here rather than editing the filter below.
MIN_TOTAL_ACTIVITY = 15

# Attach the three activity counts to the user table. The left joins keep every
# user; users missing from a count Series get NaN, which is turned into 0.
user_activity = (
    users
    .merge(question_count, left_on='Id', right_index=True, how='left')
    .merge(answer_count, left_on='Id', right_index=True, how='left')
    .merge(comment_count, left_on='Id', right_index=True, how='left')
    .fillna({'QuestionCount': 0, 'AnswerCount': 0, 'CommentCount': 0})
)

# Total activity per user, then keep only users at or above the threshold.
# NOTE(review): the placeholder user Id -1 (missing/deleted accounts) pools many
# contributions and therefore passes this filter as one "active user" - confirm
# that this is intended.
active_users = user_activity.assign(
    TotalActivity=lambda frame: frame['QuestionCount'] + frame['AnswerCount'] + frame['CommentCount']
)
active_users = active_users[active_users['TotalActivity'] >= MIN_TOTAL_ACTIVITY]
active_user_ids = set(active_users['Id'])
print(f"Active users: {active_users.shape[0]}")

Active users: 1464


In [12]:
# Restrict each content table to rows authored by an active user.
active_questions = questions.loc[questions['OwnerUserId'].isin(active_user_ids)]
active_answers = answers.loc[answers['OwnerUserId'].isin(active_user_ids)]
active_comments = comments.loc[comments['UserId'].isin(active_user_ids)]

# Report how much content survives the filter.
print("Active Users' Questions: ", len(active_questions))
print("Active Users' Answers: ", len(active_answers))
print("Active Users' Comments: ", len(active_comments))

#active_posts = pd.concat([active_questions, active_answers]).drop_duplicates()
#print("Active Users' posts: ", active_posts.shape[0])

Active Users' Questions:  11435
Active Users' Answers:  31672
Active Users' Comments:  167141


Now that we have gathered all questions, answers, and comments from Active users, we proceed to our sentiment analysis. 

If you don't want to run the preprocessing, skip straight to the sentiment analysis, where we read the preprocessed dataframes from pickle files.

### Preprocess text

In [107]:
# Modify preprocess_text function
# Holds at most one (stop_words, lemmatizer) tuple, filled in on first use.
# A list with a single append keeps the initialisation atomic, so concurrent
# first calls can at worst build the pair twice - never observe it half-built.
_TEXT_TOOLS = []


def _text_tools():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS.append((frozenset(stopwords.words('english')), WordNetLemmatizer()))
    return _TEXT_TOOLS[0]


def preprocess_text(text):
    """Normalise one text value into a space-separated string of lemmatised tokens.

    Steps: decode HTML entities, lower-case, replace tag-like ``<...>`` spans and
    every non-alphanumeric character with a space, split on whitespace, drop
    English stop words, and lemmatise the remaining words.

    Parameters
    ----------
    text : object
        The raw value. Anything that is not a ``str`` (e.g. NaN/None) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Built once and reused: the original re-read the stop-word list and created
    # a new lemmatizer for every single word.
    stop_words, lemmatizer = _text_tools()

    text = html.unescape(text)
    text = re.sub(r'<[^>]+>|[^a-zA-Z0-9]', ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [10]:
# Questions
# Split the active users' questions into 8 dask partitions so the (slow) text
# cleaning can be applied partition by partition.
ddf_questions = dd.from_pandas(active_questions, npartitions=8)
# Despite its name, the lambda argument `df` is one partition of the selected
# column; preprocess_text is applied to each of its elements.
ddf_questions['Body_Processed'] = ddf_questions['Body'].map_partitions(lambda df: df.apply(preprocess_text))
ddf_questions['Title_Processed'] = ddf_questions['Title'].map_partitions(lambda df: df.apply(preprocess_text))
# compute() materialises the result as a regular pandas object.
questions_processed = ddf_questions.compute()

In [90]:
# Answers
# Same procedure as for the other content tables: 8 dask partitions, then
# preprocess_text applied to every element of the 'Body' column.
ddf_answers = dd.from_pandas(active_answers, npartitions=8)
# Despite its name, the lambda argument `df` is one partition of the column.
ddf_answers['Body_Processed'] = ddf_answers['Body'].map_partitions(lambda df: df.apply(preprocess_text))
# compute() materialises the result as a regular pandas object.
answers_processed = ddf_answers.compute()

In [11]:
# Apply preprocessing
# Comments
# 8 dask partitions, then preprocess_text applied to every element of 'Text'.
ddf_comments = dd.from_pandas(active_comments, npartitions=8)
# Despite its name, the lambda argument `df` is one partition of the column.
ddf_comments['Text_Processed'] = ddf_comments['Text'].map_partitions(lambda df: df.apply(preprocess_text))
# compute() materialises the result as a regular pandas object.
comments_processed = ddf_comments.compute()

In [92]:
# Save pre processed dataframes
# questions_processed.to_pickle('./pickle_dataframes/questions_preprocessed.pkl')
# answers_processed1, answers_processed2 = np.array_split(answers_processed, 2)
# 
# #answers_processed.to_pickle('./pickle_dataframes/answers_preprocessed.pkl')
# answers_processed1.to_pickle('./pickle_dataframes/answers_preprocessed1.pkl')
# answers_processed2.to_pickle('./pickle_dataframes/answers_preprocessed2.pkl')
# 
# comments_processed.to_pickle('./pickle_dataframes/comments_preprocessed.pkl')

### Sentiment Analysis Time

In [3]:
# Load the cached preprocessing results. This replaces questions_processed,
# answers_processed and comments_processed if the preprocessing cells were run.
questions_processed = pd.read_pickle('./pickle_dataframes/questions_preprocessed.pkl')

# The answers table is stored as two pickles because of its size; glue the
# halves back together under a fresh 0..n-1 index.
answers_processed = pd.concat(
    [
        pd.read_pickle(half_path)
        for half_path in (
            './pickle_dataframes/answers_preprocessed1.pkl',
            './pickle_dataframes/answers_preprocessed2.pkl',
        )
    ]
).reset_index(drop=True)

comments_processed = pd.read_pickle('./pickle_dataframes/comments_preprocessed.pkl')

# Remove unused columns
# questions_processed = questions_processed.drop(columns=['CreationDate', 'LastActivityDate'])
# answers_processed = answers_processed.drop(columns=['CreationDate', 'LastActivityDate', 'Title', 'Tags'])
# comments_processed = comments_processed.drop(columns=['CreationDate'])

In [4]:
# Row counts of the preprocessed tables: questions, answers, comments.
print(
    questions_processed.shape[0],
    answers_processed.shape[0],
    comments_processed.shape[0],
    sep="\n",
)

11435
31672
167141


In [125]:
# Fetch the lexicon used by SentimentIntensityAnalyzer; nltk reports the package
# as up-to-date when it is already present locally. (`nltk` itself is imported
# with the other dependencies in the first cell.)
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/phog/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [73]:
# Build a single shared analyzer up front; analyze_sentiment reuses this one
# instance for every text instead of constructing a new analyzer per call.
sia = SentimentIntensityAnalyzer()

# Function to apply sentiment analysis
def analyze_sentiment(text):
    """Return the 'compound' polarity score of ``text`` from the shared ``sia`` analyzer.

    Missing values (anything ``pd.isna`` flags, e.g. None or NaN) score 0.0.
    Every other value is converted with ``str()`` before being scored.
    """
    if not pd.isna(text):
        scores = sia.polarity_scores(str(text))
        return scores['compound']
    return 0.0

In [95]:
# Convert each preprocessed pandas DataFrame to a Dask DataFrame with 8 partitions.
questions_dask = dd.from_pandas(questions_processed, npartitions=8)  # Adjust npartitions based on available memory
answers_dask = dd.from_pandas(answers_processed, npartitions=8) 
comments_dask = dd.from_pandas(comments_processed, npartitions=8)  

# Apply sentiment analysis to questions, answers and comments.
# NOTE(review): the scores are computed from the raw 'Body' / 'Title' / 'Text'
# columns, not from the *_Processed columns built during preprocessing - confirm
# this is intentional (the raw bodies may still contain HTML markup).
questions_dask['BodySentiment'] = questions_dask['Body'].map(analyze_sentiment)
questions_dask['TitleSentiment'] = questions_dask['Title'].map(analyze_sentiment)

answers_dask['BodySentiment'] = answers_dask['Body'].map(analyze_sentiment)

comments_dask['TextSentiment'] = comments_dask['Text'].map(analyze_sentiment)

# Compute results with progress bar: one bar per compute() call, in the order
# questions, answers, comments. Each call returns a pandas DataFrame.
with ProgressBar():
    questions_result = questions_dask.compute()
    answers_result = answers_dask.compute()
    comments_result = comments_dask.compute()

[########################################] | 100% Completed | 5.06 sms
[########################################] | 100% Completed | 46.80 ss
[########################################] | 100% Completed | 34.46 ss


In [96]:
# Save dataframes with sentiment to pickle

# answers_result = answers_result.drop(columns=['Id_y'])
# answers_result.rename(columns={'Id_x': 'Id'}, inplace=True)
# 
# questions_result.to_pickle('./pickle_dataframes/questions_with_sentiment.pkl')
# 
# answers_result1, answers_result2 = np.array_split(answers_result, 2)
# answers_result1.to_pickle('./pickle_dataframes/answers_with_sentiment1.pkl')
# answers_result2.to_pickle('./pickle_dataframes/answers_with_sentiment2.pkl')
# 
# 
# comments_result.to_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

In [5]:
# Load the cached sentiment-scored tables from the pickle directory.
questions_with_sentiment = pd.read_pickle('./pickle_dataframes/questions_with_sentiment.pkl')

# The answers table is stored as two pickles; concatenate and re-index 0..n-1.
answers_with_sentiment = pd.concat(
    [
        pd.read_pickle(half_path)
        for half_path in (
            './pickle_dataframes/answers_with_sentiment1.pkl',
            './pickle_dataframes/answers_with_sentiment2.pkl',
        )
    ]
).reset_index(drop=True)

comments_with_sentiment = pd.read_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

In [37]:
# Topic
# AnswerTopic
# CommentTopic

Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,CommentTopic,Text_Processed,TextSentiment
0,1,1,9,Is it fair to inquire about the disadvantages ...,2012-12-04 22:00:00.933,28,7,fair inquire disadvantage without taking accou...,0.2732
1,3,1,3,"I could have reformulated the question, but at...",2012-12-04 22:02:37.737,18,7,could reformulated question least disadvantage...,0.3269
6,24,24,0,possible duplicate of [What challenges remain ...,2012-12-04 22:41:28.760,29,21,possible duplicate challenge remain online vot...,0.0772
10,30,9,0,[Wikipedia](http://en.wikipedia.org/wiki/Insta...,2012-12-04 22:49:14.307,52,21,wikipedia http en wikipedia org wiki instant r...,0.5095
11,32,26,0,I think this question needs to be made more sp...,2012-12-04 22:52:18.147,52,8,think question need made specific check balanc...,-0.4137
...,...,...,...,...,...,...,...,...,...
184613,351985,81107,0,But then you introduce the theory that you mus...,2023-09-03 03:29:26.147,43072,21,introduce theory must convince voter express s...,-0.7469
184614,351988,81024,0,@Steve your link to Pew https://www.pewresearc...,2023-09-03 04:02:18.527,8016,1,steve link pew http www pewresearch org short ...,0.7375
184615,351989,81024,0,@Steve Prime age (25-54) LFPR is much higher t...,2023-09-03 04:02:29.193,8016,1,steve prime age 25 54 lfpr much higher 1960s h...,-0.8000
184618,351994,81024,0,@Steve There has been some per capita GDP prog...,2023-09-03 04:14:38.957,8016,1,steve per caput gdp progress u k http fred stl...,0.4215


### Assign Attributes to Users

#### Sentiment Attributes

In [6]:
# Mean sentiment per user, one Series per content field, indexed by user id.
# Users without content of a given kind are absent from that Series.
avg_question_body_sentiment = (
    questions_with_sentiment
    .groupby('OwnerUserId')['BodySentiment']
    .mean()
    .rename('AvgQuestionBodySentiment')
)
avg_question_title_sentiment = (
    questions_with_sentiment
    .groupby('OwnerUserId')['TitleSentiment']
    .mean()
    .rename('AvgQuestionTitleSentiment')
)

avg_answer_body_sentiment = (
    answers_with_sentiment
    .groupby('OwnerUserId')['BodySentiment']
    .mean()
    .rename('AvgAnswerSentiment')
)

avg_comment_sentiment = (
    comments_with_sentiment
    .groupby('UserId')['TextSentiment']
    .mean()
    .rename('AvgCommentSentiment')
)

In [13]:
# Attach the four per-user average-sentiment Series to the active-user table.
# Left joins keep every active user; a user with no content of a given kind gets
# NaN for that average.
users_with_sentiments = (
    active_users
    .merge(avg_question_body_sentiment, left_on='Id', right_index=True, how='left')
    .merge(avg_question_title_sentiment, left_on='Id', right_index=True, how='left')
    .merge(avg_answer_body_sentiment, left_on='Id', right_index=True, how='left')
    .merge(avg_comment_sentiment, left_on='Id', right_index=True, how='left')
    # Zero-fill only the sentiment columns added above. A blanket fillna(0)
    # would also overwrite missing values in every other user column.
    # NOTE(review): 0 makes "no content of this kind" indistinguishable from
    # "neutral on average" - confirm that this default is wanted.
    .fillna({
        'AvgQuestionBodySentiment': 0,
        'AvgQuestionTitleSentiment': 0,
        'AvgAnswerSentiment': 0,
        'AvgCommentSentiment': 0,
    })
)

In [14]:
# Display the active-user table with the four average-sentiment columns attached.
users_with_sentiments
# users_with_sentiments[users_with_sentiments['AvgCommentSentiment'] != 0]

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.000000,0.000000,0.185433,0.469406
8,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,23.0,0.000000,0.000000,0.689277,0.049030
18,18,5325,2012-12-04 21:37:27.683,2013-12-18 15:57:41.670,427,110,11,16.0,20.0,65.0,101.0,0.661300,0.212294,0.287115,0.189645
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,36.0,124.0,277.0,437.0,0.381967,0.141828,0.374322,0.199965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37644,46025,141,2023-03-19 12:16:59.447,2023-06-27 09:27:37.477,50,0,0,2.0,0.0,16.0,18.0,0.159400,0.000000,0.000000,-0.111800
37863,46253,251,2023-04-07 18:35:35.903,2023-07-22 21:32:54.697,6,3,9,0.0,6.0,10.0,16.0,0.000000,0.000000,0.666883,-0.105290
38111,46524,448,2023-05-17 19:13:21.650,2023-08-19 21:16:59.250,100,37,31,8.0,0.0,15.0,23.0,0.181063,0.299013,0.000000,0.189967
38246,46665,597,2023-06-07 21:14:53.120,2023-08-17 09:54:23.023,3,0,0,2.0,6.0,11.0,19.0,0.620450,0.255300,-0.036367,0.034573


#### Score Attributes

In [17]:
# Mean 'Score' per user for each content type, indexed by user id.
avg_question_score = (
    questions_with_sentiment
    .groupby('OwnerUserId')['Score']
    .mean()
    .rename('AvgQuestionScore')
)
avg_answer_score = (
    answers_with_sentiment
    .groupby('OwnerUserId')['Score']
    .mean()
    .rename('AvgAnswerScore')
)
avg_comment_score = (
    comments_with_sentiment
    .groupby('UserId')['Score']
    .mean()
    .rename('AvgCommentScore')
)

In [20]:
# Attach the three average-score Series to the user table via left joins.
# Users without content of a given kind keep NaN for that average score.
users_with_sentiments_and_scores = (
    users_with_sentiments
    .merge(avg_question_score, left_on='Id', right_index=True, how='left')
    .merge(avg_answer_score, left_on='Id', right_index=True, how='left')
    .merge(avg_comment_score, left_on='Id', right_index=True, how='left')
)

users_with_sentiments_and_scores

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment,AvgQuestionScore,AvgAnswerScore,AvgCommentScore
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863,6.820658,7.738636,1.578669
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.000000,0.000000,0.185433,0.469406,,7.000000,1.193548
8,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,23.0,0.000000,0.000000,0.689277,0.049030,,20.230769,1.500000
18,18,5325,2012-12-04 21:37:27.683,2013-12-18 15:57:41.670,427,110,11,16.0,20.0,65.0,101.0,0.661300,0.212294,0.287115,0.189645,13.562500,13.700000,0.769231
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,36.0,124.0,277.0,437.0,0.381967,0.141828,0.374322,0.199965,13.361111,7.782258,1.321300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37644,46025,141,2023-03-19 12:16:59.447,2023-06-27 09:27:37.477,50,0,0,2.0,0.0,16.0,18.0,0.159400,0.000000,0.000000,-0.111800,-2.000000,,0.000000
37863,46253,251,2023-04-07 18:35:35.903,2023-07-22 21:32:54.697,6,3,9,0.0,6.0,10.0,16.0,0.000000,0.000000,0.666883,-0.105290,,1.833333,1.200000
38111,46524,448,2023-05-17 19:13:21.650,2023-08-19 21:16:59.250,100,37,31,8.0,0.0,15.0,23.0,0.181063,0.299013,0.000000,0.189967,2.250000,,0.600000
38246,46665,597,2023-06-07 21:14:53.120,2023-08-17 09:54:23.023,3,0,0,2.0,6.0,11.0,19.0,0.620450,0.255300,-0.036367,0.034573,6.500000,5.333333,0.727273


### Topic distribution attribute (topic modelling vector)

Combine the DataFrames: Merge questions_with_sentiment, answers_with_sentiment, and comments_with_sentiment into a single DataFrame. Since the user IDs are in different columns (OwnerUserId in questions and answers, UserId in comments), you'll need to standardize this.

Aggregate Topics per User: Group the combined DataFrame by user ID and aggregate the topics they have participated in.

One-Hot Encode Topics: Convert the topics into one-hot encoded columns. This means creating a separate column for each topic and marking 1 if the user has participated in that topic and 0 otherwise.

Normalize the Topic Engagement: For each user, normalize their topic participation so that the sum across all topic columns equals 1. This represents the proportion of their participation in each topic.

### Miscellaneous User Attributes

In [25]:
# Ids of the answers accepted on questions whose owner id is above -1
# (i.e. not the placeholder for missing/deleted accounts).
accepted_answers = set(
    questions_with_sentiment.loc[
        questions_with_sentiment['OwnerUserId'] > -1,
        'AcceptedAnswerId',
    ]
)

# Number of those accepted answers written by each user, indexed by the answer's author.
accepted_answers_count = (
    answers_with_sentiment
    .loc[lambda frame: frame['Id'].isin(accepted_answers)]
    .groupby('OwnerUserId')
    .size()
    .rename('AcceptedAnswerCount')
)
accepted_answers_count

OwnerUserId
-1        135
 5          1
 8          7
 18        10
 23        44
         ... 
 20709      2
 20713      1
 21385      1
 21620      1
 46665      2
Name: AcceptedAnswerCount, Length: 326, dtype: int64

In [30]:
# Attach the accepted-answer count and derive the share of a user's answers
# that were accepted.
users_with_attributes = (
    users_with_sentiments_and_scores
    .merge(accepted_answers_count, left_on='Id', right_index=True, how='left')
    # A user missing from accepted_answers_count has zero accepted answers.
    # Without this fill the count stays NaN and the fraction becomes NaN
    # instead of 0 for users who answered but were never accepted.
    .fillna({'AcceptedAnswerCount': 0})
    .assign(
        # The fraction is undefined (NaN) only for users who wrote no answers.
        AcceptedAnswerFraction=lambda frame: (
            frame['AcceptedAnswerCount'] / frame['AnswerCount']
        ).where(frame['AnswerCount'] > 0)
    )
)
users_with_attributes.head()

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment,AvgQuestionScore,AvgAnswerScore,AvgCommentScore,AcceptedAnswerCount,AcceptedAnswerFraction
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863,6.820658,7.738636,1.578669,135.0,0.102273
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.0,0.0,0.185433,0.469406,,7.0,1.193548,1.0,0.333333
8,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,23.0,0.0,0.0,0.689277,0.04903,,20.230769,1.5,7.0,0.538462
18,18,5325,2012-12-04 21:37:27.683,2013-12-18 15:57:41.670,427,110,11,16.0,20.0,65.0,101.0,0.6613,0.212294,0.287115,0.189645,13.5625,13.7,0.769231,10.0,0.5
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,36.0,124.0,277.0,437.0,0.381967,0.141828,0.374322,0.199965,13.361111,7.782258,1.3213,44.0,0.354839


### Save Results

In [31]:
#comments_result.to_pickle('comments_result.pkl')
#posts_result.to_pickle('posts_result.pkl')
#users_with_attributes.to_pickle('./pickle_dataframes/users_with_attributes.pkl')

In [34]:
# Peek at the first two rows of the final per-user attribute table.
users_with_attributes.head(2)

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment,AvgQuestionScore,AvgAnswerScore,AvgCommentScore,AcceptedAnswerCount,AcceptedAnswerFraction
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863,6.820658,7.738636,1.578669,135.0,0.102273
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.0,0.0,0.185433,0.469406,,7.0,1.193548,1.0,0.333333
