In [114]:
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import dask.dataframe as dd
import html
import numpy as np
import psutil
import pandas as pd
import pickle as pkl
import re
import time

In [115]:
# Read the data
posts = pd.concat([pd.read_pickle('./pickle_dataframes/posts1.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts2.pkl'),
                   pd.read_pickle('./pickle_dataframes/posts3.pkl')]).reset_index(drop=True)

questions = pd.read_pickle('./pickle_dataframes/questions_with_topics.pkl')
answers = pd.read_pickle('./pickle_dataframes/answers_with_topics.pkl')
comments = pd.read_pickle('./pickle_dataframes/comments_with_topics.pkl')

users = pd.read_pickle('./pickle_dataframes/users.pkl')

### Filter Users on Min. Activity

In [119]:
# Calculate user activity counts
question_count = questions.groupby('OwnerUserId').size().rename('QuestionCount')
answer_count = answers.groupby('OwnerUserId').size().rename('AnswerCount')
comment_count = comments.groupby('UserId').size().rename('CommentCount')

In [120]:
# Merge activity counts with user data
user_activity = users.merge(question_count, left_on='Id',  right_index=True, how='left') \
                     .merge(answer_count, left_on='Id', right_index=True, how='left') \
                     .merge(comment_count, left_on='Id', right_index=True, how='left') \
                     .fillna({'QuestionCount': 0, 'AnswerCount': 0, 'CommentCount': 0})

# Identify and process active users
active_users = user_activity.assign(TotalActivity=lambda x: x['QuestionCount'] + x['AnswerCount'] + x['CommentCount'])
active_users = active_users[active_users['TotalActivity'] >= 15]
active_user_ids = set(active_users['Id'])
print(f"Active users: {active_users.shape[0]}")

Active users: 1464


In [121]:
active_questions = questions[questions['OwnerUserId'].isin(active_user_ids)]
active_answers = answers[answers['OwnerUserId'].isin(active_user_ids)]
active_comments = comments[comments['UserId'].isin(active_user_ids)]

print("Active Users' Questions: ", active_questions.shape[0])
print("Active Users' Answers: ", active_answers.shape[0])
print("Active Users' Comments: ", active_comments.shape[0])

#active_posts = pd.concat([active_questions, active_answers]).drop_duplicates()
#print("Active Users' posts: ", active_posts.shape[0])

Active Users' Questions:  11435
Active Users' Answers:  31672
Active Users' Comments:  167141


Now that we have gathered all questions, answers, and comments from Active users, we proceed to our sentiment analysis. 

If you don't to run the preprocessing, skip straight to the sentiment analysis where we read the preprocessed dataframes from pickle files.

### Preprocess text

In [122]:
# Modify preprocess_text function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>|[^a-zA-Z0-9]', ' ', text.lower())
    words = [WordNetLemmatizer().lemmatize(word) for word in text.split() if word not in stopwords.words('english')]
    text = ' '.join(words)

    return text

In [10]:
# Questions
ddf_questions = dd.from_pandas(active_questions, npartitions=8)
ddf_questions['Body_Processed'] = ddf_questions['Body'].map_partitions(lambda df: df.apply(preprocess_text))
ddf_questions['Title_Processed'] = ddf_questions['Title'].map_partitions(lambda df: df.apply(preprocess_text))
questions_processed = ddf_questions.compute()

In [90]:
# Answers
ddf_answers = dd.from_pandas(active_answers, npartitions=8)
ddf_answers['Body_Processed'] = ddf_answers['Body'].map_partitions(lambda df: df.apply(preprocess_text))
answers_processed = ddf_answers.compute()

In [11]:
# Apply preprocessing
# Comments
ddf_comments = dd.from_pandas(active_comments, npartitions=8)
ddf_comments['Text_Processed'] = ddf_comments['Text'].map_partitions(lambda df: df.apply(preprocess_text))
comments_processed = ddf_comments.compute()

In [92]:
# Save pre processed dataframes
# questions_processed.to_pickle('./pickle_dataframes/questions_preprocessed.pkl')
# answers_processed1, answers_processed2 = np.array_split(answers_processed, 2)
# 
# #answers_processed.to_pickle('./pickle_dataframes/answers_preprocessed.pkl')
# answers_processed1.to_pickle('./pickle_dataframes/answers_preprocessed1.pkl')
# answers_processed2.to_pickle('./pickle_dataframes/answers_preprocessed2.pkl')
# 
# comments_processed.to_pickle('./pickle_dataframes/comments_preprocessed.pkl')

### Sentiment Analysis Time

In [123]:
# Read in the preprocessed dataframes (answers df is split because of size)

questions_processed = pd.read_pickle('./pickle_dataframes/questions_preprocessed.pkl')
answers_processed = pd.concat([pd.read_pickle('./pickle_dataframes/answers_preprocessed1.pkl'),
                   pd.read_pickle('./pickle_dataframes/answers_preprocessed2.pkl')]).reset_index(drop=True)
comments_processed = pd.read_pickle('./pickle_dataframes/comments_preprocessed.pkl')

In [124]:
print(questions_processed.shape[0])
print(answers_processed.shape[0])
print(comments_processed.shape[0])

11435
31672
167141


In [125]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/phog/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [126]:
# Initialize SentimentIntensityAnalyzer once
sia = SentimentIntensityAnalyzer()

# Function to apply sentiment analysis
def analyze_sentiment(text):
    # Check if the text is missing or NaN, return 0.0 in such cases
    if pd.isna(text):
        return 0.0
    # Ensure the text is encoded as a string
    text = str(text)
    return sia.polarity_scores(text)['compound']

In [95]:
# # Convert pandas DataFrame to Dask DataFrame
questions_dask = dd.from_pandas(questions_processed, npartitions=8)  # Adjust npartitions based on available memory
answers_dask = dd.from_pandas(answers_processed, npartitions=8) 
comments_dask = dd.from_pandas(comments_processed, npartitions=8)  

# Apply sentiment analysis to questions, answers and comments
questions_dask['BodySentiment'] = questions_dask['Body'].map(analyze_sentiment)
questions_dask['TitleSentiment'] = questions_dask['Title'].map(analyze_sentiment)

answers_dask['BodySentiment'] = answers_dask['Body'].map(analyze_sentiment)

comments_dask['TextSentiment'] = comments_dask['Text'].map(analyze_sentiment)

# Compute results with progress bar
with ProgressBar():
    questions_result = questions_dask.compute()
    answers_result = answers_dask.compute()
    comments_result = comments_dask.compute()

[########################################] | 100% Completed | 5.06 sms
[########################################] | 100% Completed | 46.80 ss
[########################################] | 100% Completed | 34.46 ss


In [96]:
# Save dataframes with sentiment to pickle

# answers_result = answers_result.drop(columns=['Id_y'])
# answers_result.rename(columns={'Id_x': 'Id'}, inplace=True)
# 
# questions_result.to_pickle('./pickle_dataframes/questions_with_sentiment.pkl')
# 
# answers_result1, answers_result2 = np.array_split(answers_result, 2)
# answers_result1.to_pickle('./pickle_dataframes/answers_with_sentiment1.pkl')
# answers_result2.to_pickle('./pickle_dataframes/answers_with_sentiment2.pkl')
# 
# 
# comments_result.to_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

In [127]:
questions_with_sentiment = pd.read_pickle('./pickle_dataframes/questions_with_sentiment.pkl')
answers_with_sentiment = pd.concat([pd.read_pickle('./pickle_dataframes/answers_with_sentiment1.pkl'), pd.read_pickle('./pickle_dataframes/answers_with_sentiment2.pkl')]).reset_index(drop=True)
comments_with_sentiment = pd.read_pickle('./pickle_dataframes/comments_with_sentiment.pkl')

### Assign Attributes to Users
1. Sentiment Attributes
2. Score Attributes
3. Accepted Answer Attributes
4. Topic Engagement Normalized Vector

#### Sentiment Attributes

In [128]:
# Calculate average sentiment for questions, answers, and comments (per user)

# Questions
avg_question_body_sentiment = questions_with_sentiment.groupby('OwnerUserId')['BodySentiment'].mean().rename('AvgQuestionBodySentiment')
avg_question_title_sentiment = questions_with_sentiment.groupby('OwnerUserId')['TitleSentiment'].mean().rename('AvgQuestionTitleSentiment')

# Answers
avg_answer_body_sentiment = answers_with_sentiment.groupby('OwnerUserId')['BodySentiment'].mean().rename('AvgAnswerSentiment')

# Comments
avg_comment_sentiment = comments_with_sentiment.groupby('UserId')['TextSentiment'].mean().rename('AvgCommentSentiment')

In [129]:
# Merge avg_question_body_sentiment onto active_users dataframe
users_with_sentiments = active_users.merge(avg_question_body_sentiment, left_on='Id', right_index=True, how='left')

# Merge avg_question_title_sentiment
users_with_sentiments = users_with_sentiments.merge(avg_question_title_sentiment, left_on='Id', right_index=True, how='left')

# Merge avg_answer_body_sentiment
users_with_sentiments = users_with_sentiments.merge(avg_answer_body_sentiment, left_on='Id', right_index=True, how='left')

# Merge avg_comment_sentiment
users_with_sentiments = users_with_sentiments.merge(avg_comment_sentiment, left_on='Id', right_index=True, how='left')

# Fill the ~1300 missing sentiment values with 0
users_with_sentiments.fillna(0, inplace=True)

users_with_sentiments.head(2)

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.0,0.0,0.185433,0.469406


In [None]:
# Calculate sentiment deviation from mean?


#### Score Attributes

In [89]:
# Calculate average scores

# Average Question Score
avg_question_score = questions_with_sentiment.groupby('OwnerUserId')['Score'].mean().rename('AvgQuestionScore')

# Average Answer Score
avg_answer_score = answers_with_sentiment.groupby('OwnerUserId')['Score'].mean().rename('AvgAnswerScore')

# Average Comment Score
avg_comment_score = comments_with_sentiment.groupby('UserId')['Score'].mean().rename('AvgCommentScore')

In [90]:
users_with_sentiments_and_scores = users_with_sentiments.merge(avg_question_score, left_on='Id', right_index=True, how='left')
users_with_sentiments_and_scores = users_with_sentiments_and_scores.merge(avg_answer_score, left_on='Id', right_index=True, how='left')
users_with_sentiments_and_scores = users_with_sentiments_and_scores.merge(avg_comment_score, left_on='Id', right_index=True, how='left')

users_with_sentiments_and_scores.head(2)

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment,AvgQuestionScore,AvgAnswerScore,AvgCommentScore
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863,6.820658,7.738636,1.578669
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.0,0.0,0.185433,0.469406,,7.0,1.193548


### Accepted Answer Attributes

In [91]:
accepted_answers = set(questions_with_sentiment[questions_with_sentiment['OwnerUserId'] > -1]['AcceptedAnswerId'])
accepted_answers_count = answers_with_sentiment[answers_with_sentiment['Id'].isin(accepted_answers)].groupby('OwnerUserId').size().rename('AcceptedAnswerCount')
accepted_answers_count

OwnerUserId
-1        135
 5          1
 8          7
 18        10
 23        44
         ... 
 20709      2
 20713      1
 21385      1
 21620      1
 46665      2
Name: AcceptedAnswerCount, Length: 326, dtype: int64

In [92]:
users_with_attributes = users_with_sentiments_and_scores.merge(accepted_answers_count, left_on='Id', right_index=True, how='left')
users_with_attributes['AcceptedAnswerFraction'] = users_with_attributes['AcceptedAnswerCount'] / users_with_attributes['AnswerCount']
users_with_attributes.head()

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,TotalActivity,AvgQuestionBodySentiment,AvgQuestionTitleSentiment,AvgAnswerSentiment,AvgCommentSentiment,AvgQuestionScore,AvgAnswerScore,AvgCommentScore,AcceptedAnswerCount,AcceptedAnswerFraction
0,-1,1,2012-12-04 20:12:06.337,2012-12-04 20:12:06.337,267,5442,6725,881.0,1320.0,8879.0,11080.0,0.182188,0.013791,0.178233,0.085863,6.820658,7.738636,1.578669,135.0,0.102273
5,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,34.0,0.0,0.0,0.185433,0.469406,,7.0,1.193548,1.0,0.333333
8,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,23.0,0.0,0.0,0.689277,0.04903,,20.230769,1.5,7.0,0.538462
18,18,5325,2012-12-04 21:37:27.683,2013-12-18 15:57:41.670,427,110,11,16.0,20.0,65.0,101.0,0.6613,0.212294,0.287115,0.189645,13.5625,13.7,0.769231,10.0,0.5
23,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,36.0,124.0,277.0,437.0,0.381967,0.141828,0.374322,0.199965,13.361111,7.782258,1.3213,44.0,0.354839


### Topic distribution attribute (topic modelling vector)

- Combine the DataFrames
- Aggregate Topics per User
- One-Hot Encode Topics
- Normalize the Topic Engagement: For each user, normalize their topic participation so that the sum across all topic columns equals 1. This represents the proportion of their participation in each topic.

In [93]:
questions_topic_distribution_vector = questions_with_sentiment[['OwnerUserId', 'Topic']].copy()
answers_topic_distribution_vector = answers_with_sentiment[['OwnerUserId', 'AnswerTopic']].copy()
comments_topic_distribution_vector = comments_with_sentiment[['UserId', 'CommentTopic']].copy()

questions_topic_distribution_vector['UserId'] = questions_topic_distribution_vector['OwnerUserId']
answers_topic_distribution_vector['UserId'] = answers_topic_distribution_vector['OwnerUserId']
answers_topic_distribution_vector['Topic'] = answers_topic_distribution_vector['AnswerTopic']
comments_topic_distribution_vector['Topic'] = comments_topic_distribution_vector['CommentTopic']

questions_topic_distribution_vector.head(2)

Unnamed: 0,OwnerUserId,Topic,UserId
0,18,7,18
4,18,21,18


In [94]:
combined_topic_distribution_df = pd.concat([
    questions_topic_distribution_vector[['UserId', 'Topic']],
    answers_topic_distribution_vector[['UserId', 'Topic']],
    comments_topic_distribution_vector[['UserId', 'Topic']]
])

combined_topic_distribution_df = combined_topic_distribution_df[combined_topic_distribution_df['UserId'] != -1]

In [95]:
# Step 1: One-Hot Encoding
one_hot_encoded_topics = pd.get_dummies(combined_topic_distribution_df['Topic'])
one_hot_encoded_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
26,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
184614,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
184615,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
184618,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
# Add UserId back for aggregation
one_hot_encoded_topics['UserId'] = combined_topic_distribution_df['UserId']
one_hot_encoded_topics.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,UserId
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,18
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,18


In [97]:
# Step 2: Aggregation
user_topic_engagement = one_hot_encoded_topics.groupby('UserId').sum()
user_topic_engagement

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0,7,0,8,0,0,0,0,7,1,...,0,0,1,1,0,0,2,0,0,6
8,0,0,0,3,4,0,0,4,0,0,...,0,0,0,0,0,0,9,0,0,2
18,0,6,0,10,3,1,0,4,8,2,...,0,0,2,0,0,0,30,4,0,24
23,8,66,0,78,33,8,0,19,55,0,...,0,7,7,19,0,1,42,2,4,53
26,0,1,0,1,0,0,0,4,1,0,...,0,0,0,0,0,0,4,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,14
46253,0,1,0,0,0,0,0,0,2,0,...,0,0,0,0,1,0,0,0,0,12
46524,6,0,0,2,0,0,1,0,0,3,...,0,0,0,0,0,0,5,0,0,6
46665,1,1,0,2,0,0,0,0,0,2,...,0,0,0,0,0,0,4,0,0,4


In [98]:
# Step 3: Normalization
user_topic_engagement_normalized = user_topic_engagement.div(user_topic_engagement.sum(axis=1), axis=0)
user_topic_engagement_normalized

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.000000,0.205882,0.0,0.235294,0.000000,0.000000,0.000000,0.000000,0.205882,0.029412,...,0.000000,0.000000,0.029412,0.029412,0.000000,0.000000,0.058824,0.000000,0.000000,0.176471
8,0.000000,0.000000,0.0,0.130435,0.173913,0.000000,0.000000,0.173913,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.391304,0.000000,0.000000,0.086957
18,0.000000,0.059406,0.0,0.099010,0.029703,0.009901,0.000000,0.039604,0.079208,0.019802,...,0.000000,0.000000,0.019802,0.000000,0.000000,0.000000,0.297030,0.039604,0.000000,0.237624
23,0.018307,0.151030,0.0,0.178490,0.075515,0.018307,0.000000,0.043478,0.125858,0.000000,...,0.000000,0.016018,0.016018,0.043478,0.000000,0.002288,0.096110,0.004577,0.009153,0.121281
26,0.000000,0.066667,0.0,0.066667,0.000000,0.000000,0.000000,0.266667,0.066667,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.266667,0.000000,0.000000,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46025,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.111111,0.000000,0.000000,0.777778
46253,0.000000,0.062500,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.750000
46524,0.260870,0.000000,0.0,0.086957,0.000000,0.000000,0.043478,0.000000,0.000000,0.130435,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.217391,0.000000,0.000000,0.260870
46665,0.052632,0.052632,0.0,0.105263,0.000000,0.000000,0.000000,0.000000,0.000000,0.105263,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.210526,0.000000,0.000000,0.210526


In [99]:
# Step 4: Preparation for Clustering
user_topic_engagement_normalized['MostEngagedTopic'] = user_topic_engagement_normalized.idxmax(axis=1)
user_topic_distribution = user_topic_engagement_normalized.reset_index()
user_topic_distribution

Unnamed: 0,UserId,0,1,2,3,4,5,6,7,8,...,16,17,18,19,20,21,22,23,24,MostEngagedTopic
0,5,0.000000,0.205882,0.0,0.235294,0.000000,0.000000,0.000000,0.000000,0.205882,...,0.000000,0.029412,0.029412,0.000000,0.000000,0.058824,0.000000,0.000000,0.176471,3
1,8,0.000000,0.000000,0.0,0.130435,0.173913,0.000000,0.000000,0.173913,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.391304,0.000000,0.000000,0.086957,21
2,18,0.000000,0.059406,0.0,0.099010,0.029703,0.009901,0.000000,0.039604,0.079208,...,0.000000,0.019802,0.000000,0.000000,0.000000,0.297030,0.039604,0.000000,0.237624,21
3,23,0.018307,0.151030,0.0,0.178490,0.075515,0.018307,0.000000,0.043478,0.125858,...,0.016018,0.016018,0.043478,0.000000,0.002288,0.096110,0.004577,0.009153,0.121281,3
4,26,0.000000,0.066667,0.0,0.066667,0.000000,0.000000,0.000000,0.266667,0.066667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.266667,0.000000,0.000000,0.066667,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458,46025,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.111111,0.000000,0.000000,0.777778,24
1459,46253,0.000000,0.062500,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,...,0.000000,0.000000,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.750000,24
1460,46524,0.260870,0.000000,0.0,0.086957,0.000000,0.000000,0.043478,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.217391,0.000000,0.000000,0.260870,0
1461,46665,0.052632,0.052632,0.0,0.105263,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.210526,0.000000,0.000000,0.210526,14


In [111]:
users_with_all_attributes = users_with_attributes.merge(user_topic_distribution, left_on='Id', right_on='UserId', how='left')
users_with_all_attributes = users_with_all_attributes[users_with_all_attributes['Id'] != -1]
users_with_all_attributes = users_with_all_attributes.drop(columns='UserId')
users_with_all_attributes.head(2)

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,...,16,17,18,19,20,21,22,23,24,MostEngagedTopic
1,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,...,0.0,0.029412,0.029412,0.0,0.0,0.058824,0.0,0.0,0.176471,3.0
2,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.391304,0.0,0.0,0.086957,21.0


### Save Results

In [113]:
#users_with_all_attributes.to_pickle('./pickle_dataframes/users_with_all_attributes.pkl')
users_with_all_attributes = pd.read_pickle('./pickle_dataframes/users_with_all_attributes.pkl')
users_with_all_attributes.head()

Unnamed: 0,Id,Reputation,CreationDate,LastAccessDate,Views,UpVotes,DownVotes,QuestionCount,AnswerCount,CommentCount,...,16,17,18,19,20,21,22,23,24,MostEngagedTopic
1,5,315,2012-12-04 20:36:06.517,2021-07-05 18:03:41.037,49,14,58,0.0,3.0,31.0,...,0.0,0.029412,0.029412,0.0,0.0,0.058824,0.0,0.0,0.176471,3.0
2,8,2777,2012-12-04 20:52:37.450,2022-07-31 22:39:03.850,832,37,5,0.0,13.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.391304,0.0,0.0,0.086957,21.0
3,18,5325,2012-12-04 21:37:27.683,2013-12-18 15:57:41.670,427,110,11,16.0,20.0,65.0,...,0.0,0.019802,0.0,0.0,0.0,0.29703,0.039604,0.0,0.237624,21.0
4,23,16227,2012-12-04 21:49:39.360,2015-02-14 02:38:09.917,1449,803,54,36.0,124.0,277.0,...,0.016018,0.016018,0.043478,0.0,0.002288,0.09611,0.004577,0.009153,0.121281,3.0
5,26,1794,2012-12-04 21:52:54.060,2020-11-19 03:57:56.387,24,34,3,1.0,7.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.266667,0.0,0.0,0.066667,7.0
