In [115]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import dask.dataframe as dd
import html
import numpy as np
import psutil
import pandas as pd
import pickle as pkl
import re
import time

In [116]:
# Load the raw tables; posts are stored as three pickle shards that are stitched back together
post_shard_paths = (
    './pickle_dataframes/posts1.pkl',
    './pickle_dataframes/posts2.pkl',
    './pickle_dataframes/posts3.pkl',
)
posts = pd.concat([pd.read_pickle(path) for path in post_shard_paths], ignore_index=True)

# Topic-annotated questions, answers and comments
questions = pd.read_pickle('./pickle_dataframes/questions_with_topics.pkl')
answers = pd.read_pickle('./pickle_dataframes/answers_with_topics.pkl')
comments = pd.read_pickle('./pickle_dataframes/comments_with_topics.pkl')

# User table
users = pd.read_pickle('./pickle_dataframes/users.pkl')

In [117]:
#comments = comments.sample(frac=0.1, random_state=0)
#posts = posts.sample(frac=0.1, random_state=0)

In [118]:
# Report the number of rows in each loaded table
table_sizes = {
    "Users": len(users),
    "Questions": len(questions),
    "Answers": len(answers),
    "Comments": len(comments),
}
for table_name, n_rows in table_sizes.items():
    print(f"{table_name}: {n_rows}")

Users: 38788
Questions: 16002
Answers: 36090
Comments: 184620


### Filter Users on Min. Activity

In [120]:
# Count the posts and comments attributed to user id -1 (missing/deleted users)
orphan_posts = posts[posts['OwnerUserId'] == -1]
orphan_comments = comments[comments['UserId'] == -1]
print(len(orphan_posts))
print(len(orphan_comments))

2201
8879


In [119]:
# Per-user row counts: one Series per content type, indexed by the author's user id
question_count = (
    questions
    .groupby('OwnerUserId')
    .size()
    .rename('QuestionCount')
)
answer_count = (
    answers
    .groupby('OwnerUserId')
    .size()
    .rename('AnswerCount')
)
# Comments identify their author with 'UserId' rather than 'OwnerUserId'
comment_count = (
    comments
    .groupby('UserId')
    .size()
    .rename('CommentCount')
)

In [121]:
# Minimum combined number of questions, answers and comments for a user to count as "active"
MIN_TOTAL_ACTIVITY = 15

# Attach the three activity counts to the user table.
# Left joins keep every user; a user absent from a count table gets 0 for that count.
user_activity = users.merge(question_count, left_on='Id',  right_index=True, how='left') \
                     .merge(answer_count, left_on='Id', right_index=True, how='left') \
                     .merge(comment_count, left_on='Id', right_index=True, how='left') \
                     .fillna({'QuestionCount': 0, 'AnswerCount': 0, 'CommentCount': 0})

# Keep the users whose total activity reaches the threshold.
# NOTE(review): the placeholder id -1 (missing/deleted users) is not excluded here, so it is
# treated as a single user if it appears in `users` - confirm this is intended.
active_users = user_activity.assign(TotalActivity=lambda x: x['QuestionCount'] + x['AnswerCount'] + x['CommentCount'])
active_users = active_users[active_users['TotalActivity'] >= MIN_TOTAL_ACTIVITY]
active_user_ids = set(active_users['Id'])
print(f"Active users: {active_users.shape[0]}")

Active users: 1464


In [122]:
# Restrict each content table to rows authored by an active user
by_active_question_owner = questions['OwnerUserId'].isin(active_user_ids)
by_active_answer_owner = answers['OwnerUserId'].isin(active_user_ids)
by_active_commenter = comments['UserId'].isin(active_user_ids)

active_questions = questions[by_active_question_owner]
active_answers = answers[by_active_answer_owner]
active_comments = comments[by_active_commenter]

print("Active Users' Questions: ", len(active_questions))
print("Active Users' Answers: ", len(active_answers))
print("Active Users' Comments: ", len(active_comments))

#active_posts = pd.concat([active_questions, active_answers]).drop_duplicates()
#print("Active Users' posts: ", active_posts.shape[0])

Active Users' Questions:  11435
Active Users' Answers:  31672
Active Users' Comments:  167141


Now that we have gathered all questions, answers, and comments from Active users, we proceed to our sentiment analysis. 

If you don't want to run the preprocessing, skip straight to the sentiment analysis section, where we read the preprocessed dataframes from pickle files.

### Preprocess text

In [107]:
# Text preprocessing. The NLTK resources are built once and cached: the original code
# re-read the stop-word corpus and created a new lemmatizer for every single word.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    resources = _PREPROCESS_CACHE.get('resources')
    if resources is None:
        # Build the pair fully before publishing it under a single key, so a
        # concurrent caller never observes a half-initialised cache.
        resources = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _PREPROCESS_CACHE['resources'] = resources
    return resources


def preprocess_text(text):
    """Normalise one text for downstream analysis.

    Steps: decode HTML entities, lowercase, replace HTML tags and every
    non-alphanumeric character with a space, drop English stop words, and
    lemmatize the remaining tokens.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` (e.g. NaN/None) is
        treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>|[^a-zA-Z0-9]', ' ', text.lower())

    stop_words, lemmatizer = _preprocess_resources()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

In [10]:
# Questions: clean the body and the title text, one Dask partition at a time
ddf_questions = dd.from_pandas(active_questions, npartitions=8)
for source_col, cleaned_col in (('Body', 'Body_Processed'), ('Title', 'Title_Processed')):
    ddf_questions[cleaned_col] = ddf_questions[source_col].map_partitions(
        lambda part: part.apply(preprocess_text)
    )
questions_processed = ddf_questions.compute()

In [90]:
# Answers: clean the body text, one Dask partition at a time
ddf_answers = dd.from_pandas(active_answers, npartitions=8)
cleaned_answer_bodies = ddf_answers['Body'].map_partitions(
    lambda part: part.apply(preprocess_text)
)
ddf_answers['Body_Processed'] = cleaned_answer_bodies
answers_processed = ddf_answers.compute()

In [11]:
# Comments: clean the comment text, one Dask partition at a time
ddf_comments = dd.from_pandas(active_comments, npartitions=8)
cleaned_comment_text = ddf_comments['Text'].map_partitions(
    lambda part: part.apply(preprocess_text)
)
ddf_comments['Text_Processed'] = cleaned_comment_text
comments_processed = ddf_comments.compute()

In [92]:
# Persist the preprocessed frames so the preprocessing cells can be skipped on later runs
questions_processed.to_pickle('./pickle_dataframes/questions_preprocessed.pkl')

# The answers frame is written as two halves. Slice by position instead of calling
# np.array_split on a DataFrame, which goes through the deprecated DataFrame.swapaxes.
# The first half takes the extra row when the length is odd, matching np.array_split.
split_at = (len(answers_processed) + 1) // 2
answers_processed1 = answers_processed.iloc[:split_at]
answers_processed2 = answers_processed.iloc[split_at:]

#answers_processed.to_pickle('./pickle_dataframes/answers_preprocessed.pkl')
answers_processed1.to_pickle('./pickle_dataframes/answers_preprocessed1.pkl')
answers_processed2.to_pickle('./pickle_dataframes/answers_preprocessed2.pkl')

comments_processed.to_pickle('./pickle_dataframes/comments_preprocessed.pkl')

### Sentiment Analysis Time

In [123]:
# Reload the preprocessed frames from disk (the answers frame is stored in two parts because of its size)
questions_processed = pd.read_pickle('./pickle_dataframes/questions_preprocessed.pkl')

answer_parts = [
    pd.read_pickle('./pickle_dataframes/answers_preprocessed1.pkl'),
    pd.read_pickle('./pickle_dataframes/answers_preprocessed2.pkl'),
]
answers_processed = pd.concat(answer_parts, ignore_index=True)

comments_processed = pd.read_pickle('./pickle_dataframes/comments_preprocessed.pkl')

# Remove unused columns
# questions_processed = questions_processed.drop(columns=['CreationDate', 'LastActivityDate'])
# answers_processed = answers_processed.drop(columns=['CreationDate', 'LastActivityDate', 'Title', 'Tags'])
# comments_processed = comments_processed.drop(columns=['CreationDate'])

In [124]:
# Sanity check: row counts of the reloaded questions, answers and comments frames
for reloaded in (questions_processed, answers_processed, comments_processed):
    print(len(reloaded))

11435
31672
167141


In [125]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import nltk
# Fetch the VADER lexicon; the recorded output shows NLTK skips it when it is already up-to-date.
# presumably this is the lexicon SentimentIntensityAnalyzer loads - confirm against the NLTK docs
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/phog/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [73]:
# Build one SentimentIntensityAnalyzer and reuse it for every text;
# analyze_sentiment reads this module-level `sia`.
sia = SentimentIntensityAnalyzer()

# Scores a single text with the shared analyzer
def analyze_sentiment(text):
    """Return the 'compound' polarity score of `text`.

    Missing values (None / NaN / NA) score 0.0. Any other value is converted
    with ``str()`` before being passed to the module-level ``sia`` analyzer.
    """
    if not pd.isna(text):
        scores = sia.polarity_scores(str(text))
        return scores['compound']
    return 0.0

In [95]:
# Wrap the preprocessed frames as Dask DataFrames (adjust npartitions based on available memory)
questions_dask = dd.from_pandas(questions_processed, npartitions=8)
answers_dask = dd.from_pandas(answers_processed, npartitions=8)
comments_dask = dd.from_pandas(comments_processed, npartitions=8)

# (frame, text column to score, column receiving the compound score)
# NOTE(review): the scores are computed on the raw 'Body' / 'Title' / 'Text' columns, not on
# the '*_Processed' columns produced by the preprocessing cells - confirm this is intended.
sentiment_jobs = (
    (questions_dask, 'Body', 'BodySentiment'),
    (questions_dask, 'Title', 'TitleSentiment'),
    (answers_dask, 'Body', 'BodySentiment'),
    (comments_dask, 'Text', 'TextSentiment'),
)
for dask_frame, text_col, score_col in sentiment_jobs:
    dask_frame[score_col] = dask_frame[text_col].map(analyze_sentiment)

# Materialise the three frames one after another, each with its own progress bar
with ProgressBar():
    questions_result = questions_dask.compute()
    answers_result = answers_dask.compute()
    comments_result = comments_dask.compute()

[########################################] | 100% Completed | 5.06 sms
[########################################] | 100% Completed | 46.80 ss
[########################################] | 100% Completed | 34.46 ss


In [96]:
# Collapse the duplicated id columns of the answers frame into a single 'Id'.
# errors='ignore' keeps the cell re-runnable: on a second run 'Id_y' is already gone,
# and rename() silently skips the missing 'Id_x'.
answers_result = (
    answers_result
    .drop(columns=['Id_y'], errors='ignore')
    .rename(columns={'Id_x': 'Id'})
)

questions_result.to_pickle('./pickle_dataframes/questions_with_sentiment.pkl')

# The answers frame is written as two halves. Slice by position instead of calling
# np.array_split on a DataFrame, which goes through the deprecated DataFrame.swapaxes.
# The first half takes the extra row when the length is odd, matching np.array_split.
split_at = (len(answers_result) + 1) // 2
answers_result1 = answers_result.iloc[:split_at]
answers_result2 = answers_result.iloc[split_at:]
answers_result1.to_pickle('./pickle_dataframes/answers_with_sentiment1.pkl')
answers_result2.to_pickle('./pickle_dataframes/answers_with_sentiment2.pkl')


comments_result.to_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

In [126]:
# Reload the sentiment-annotated frames from disk (the answers frame is stored in two parts)
questions_with_sentiment = pd.read_pickle('./pickle_dataframes/questions_with_sentiment.pkl')

answer_sentiment_parts = [
    pd.read_pickle('./pickle_dataframes/answers_with_sentiment1.pkl'),
    pd.read_pickle('./pickle_dataframes/answers_with_sentiment2.pkl'),
]
answers_with_sentiment = pd.concat(answer_sentiment_parts, ignore_index=True)

comments_with_sentiment = pd.read_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

### Assign Attributes to Users

#### Sentiment Attributes

In [None]:
# Per-user mean sentiment for questions (body and title), answers and comments.
# The frames reloaded from pickle (*_with_sentiment) are used so this cell also works
# when the sentiment computation cell was skipped.
avg_question_body_sentiment = questions_with_sentiment.groupby('OwnerUserId')['BodySentiment'].mean().rename('AvgQuestionBodySentiment')
avg_question_title_sentiment = questions_with_sentiment.groupby('OwnerUserId')['TitleSentiment'].mean().rename('AvgQuestionTitleSentiment')

# Answers: previously computed from the questions frame and named 'AvgQuestionBodySentiment',
# which duplicated the question average and collided with it when merged.
avg_answer_body_sentiment = answers_with_sentiment.groupby('OwnerUserId')['BodySentiment'].mean().rename('AvgAnswerBodySentiment')

# Comments: keyed by 'UserId', with the score stored in 'TextSentiment'
# (previously read a 'Sentiment' column from the answers frame).
avg_comment_sentiment = comments_with_sentiment.groupby('UserId')['TextSentiment'].mean().rename('AvgCommentSentiment')

In [92]:
# Left-join each per-user sentiment average onto the user table, in this order:
# question body, question title, answer body, comment
users_with_sentiments = users
for per_user_average in (
    avg_question_body_sentiment,
    avg_question_title_sentiment,
    avg_answer_body_sentiment,
    avg_comment_sentiment,
):
    users_with_sentiments = users_with_sentiments.merge(
        per_user_average, left_on='Id', right_index=True, how='left'
    )

# Every remaining missing value, in any column, becomes 0
users_with_sentiments = users_with_sentiments.fillna(0)

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0
8,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,23.0
18,18,5325,2012-12-04 21:37:27.683,2013-12-18 15:57:41.670,427,110,11,16.0,20.0,65.0,101.0
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,36.0,124.0,277.0,437.0
...,...,...,...,...,...,...,...,...,...,...,...
37644,46025,141,2023-03-19 12:16:59.447,2023-06-27 09:27:37.477,50,0,0,2.0,0.0,16.0,18.0
37863,46253,251,2023-04-07 18:35:35.903,2023-07-22 21:32:54.697,6,3,9,0.0,6.0,10.0,16.0
38111,46524,448,2023-05-17 19:13:21.650,2023-08-19 21:16:59.250,100,37,31,8.0,0.0,15.0,23.0
38246,46665,597,2023-06-07 21:14:53.120,2023-08-17 09:54:23.023,3,0,0,2.0,6.0,11.0,19.0


In [None]:
# Calculate deviation from mean sentiment


#### Score Attributes

In [None]:
# Per-user mean score for questions, answers and comments.
# The frames reloaded from pickle (*_with_sentiment) are used so this cell also works
# when the sentiment computation cell was skipped.
avg_question_score = questions_with_sentiment.groupby('OwnerUserId')['Score'].mean().rename('AvgQuestionScore')
# Was `answer_result` (undefined name)
avg_answer_score = answers_with_sentiment.groupby('OwnerUserId')['Score'].mean().rename('AvgAnswerScore')
# Comments are keyed by 'UserId', not 'OwnerUserId'.
# NOTE(review): assumes the comments frame has a 'Score' column - confirm.
avg_comment_score = comments_with_sentiment.groupby('UserId')['Score'].mean().rename('AvgCommentScore')

# Merge the avg scores into the users DataFrame
#users = users.merge(avg_question_score

In [None]:
# Calculate average scores per user for answers (PostTypeId == 2) and questions (PostTypeId == 1).
# `active_posts` was only defined in commented-out code, so it is rebuilt here from the
# active users' questions and answers.
# NOTE(review): the commented-out definition also called .drop_duplicates(); re-add it if
# the two frames can overlap (it requires every column to hold hashable values).
# NOTE(review): assumes both frames carry a 'PostTypeId' column - confirm.
active_posts = pd.concat([active_questions, active_answers])
avg_answer_score = active_posts[active_posts['PostTypeId'] == 2].groupby('OwnerUserId')['Score'].mean().rename('AvgAnswerScore')
avg_post_score = active_posts[active_posts['PostTypeId'] == 1].groupby('OwnerUserId')['Score'].mean().rename('AvgPostScore')

In [None]:
# Attach the per-user score averages; users without answers/questions get 0 for that average
score_defaults = {'AvgAnswerScore': 0, 'AvgPostScore': 0}
active_users = (
    active_users
    .merge(avg_answer_score, left_on='Id', right_index=True, how='left')
    .merge(avg_post_score, left_on='Id', right_index=True, how='left')
    .fillna(score_defaults)
)

In [None]:
# Fraction of each user's answers that were accepted.
# NOTE(review): no visible cell adds an 'AcceptedAnswerCount' column to active_users
# (accepted_answers_count is computed in a later cell but never merged) - confirm it
# exists before this cell runs.
# NOTE(review): users with AnswerCount == 0 divide by zero here (NaN/inf) - confirm how
# those users should be handled.
active_users['AcceptedAnswerFraction'] = active_users['AcceptedAnswerCount'] / active_users['AnswerCount']

In [None]:
# Calculate number of accepted answers per user (from the questions table).
# The first line was a syntax error (unbalanced brackets); the row filter is restored to
# mirror the OwnerUserId > -1 filter used in the second computation below.
# NOTE(review): neither `questions_with_topics` nor `posts_qa` is assigned in the visible
# cells - confirm where they come from (`questions` is read from questions_with_topics.pkl).
accepted_answers = set(questions_with_topics[questions_with_topics['OwnerUserId'] > -1]['AcceptedAnswerId'])
accepted_answers_count = posts_qa[posts_qa['Id'].isin(accepted_answers)].groupby('OwnerUserId').size().rename('AcceptedAnswerCount')

# Calculate number of accepted answers per user (from the combined posts table).
# NOTE(review): this overwrites both results computed just above - keep only one variant.
accepted_answers = set(posts_qa[(posts_qa['PostTypeId'] == 1) & (posts_qa['OwnerUserId'] > -1)]['AcceptedAnswerId'])
accepted_answers_count = posts_qa[posts_qa['Id'].isin(accepted_answers)].groupby('OwnerUserId').size().rename('AcceptedAnswerCount')

### Save Results

In [None]:
#comments_result.to_pickle('comments_result.pkl')
#posts_result.to_pickle('posts_result.pkl')

In [None]:
def replace_sentiment(x):
    """Spread a row's average answer sentiment over its topic flags.

    `x` is indexed by 'Topic' and 'AvgAnswerSentiment'. When 'Topic' equals
    the string 'None' the result is the scalar 0. Otherwise the result is a
    list with one entry per element of 'Topic': the sentiment where
    ``int(element) > 0``, and 0 elsewhere.
    """
    topic_flags = x['Topic']
    if topic_flags == 'None':
        return 0

    avg_sentiment = x['AvgAnswerSentiment']
    per_topic = []
    for flag in topic_flags:
        if int(flag) > 0:
            per_topic.append(avg_sentiment)
        else:
            per_topic.append(0)
    return per_topic

# Row-wise: store the result of replace_sentiment for every row in a new 'TopicSentiment' column.
# NOTE(review): active_user_answers is not assigned in the visible cells - confirm where it is created.
active_user_answers['TopicSentiment'] = active_user_answers.apply(replace_sentiment, axis=1)


In [None]:
def calculate_topic_sentiment(group):
    """Average the 'TopicSentiment' entries of `group` and return the result as a list.

    `group` must be indexable by 'TopicSentiment' - presumably a groupby
    sub-frame whose entries are the lists built by replace_sentiment (TODO confirm).
    """
    # Take the 'TopicSentiment' column (not 'Topic').
    # NOTE(review): if this is a pandas Series, .transpose() returns the Series itself - confirm the intent.
    transposed_topics_sentiment = group['TopicSentiment'].transpose()
    
    # Coerce each entry to numeric (unparseable values become NaN), then average along axis 0
    mean_values = transposed_topics_sentiment.apply(lambda x: pd.to_numeric(x, errors='coerce')).mean(axis=0)
    
    return mean_values.tolist()