# Import data

In [1]:
import os
import pandas as pd
import numpy as np

# Folder holding the pickled datasets that this notebook reads and writes.
directory = './picklefiles'
# List all files that start with the specified pattern

def load_pickle_files(directory, file_pattern):
    """Load and concatenate every pickle in ``directory`` whose name starts with ``file_pattern``.

    Parameters
    ----------
    directory : str
        Folder that is scanned (non-recursively) for files.
    file_pattern : str
        File-name prefix a file must start with to be loaded.

    Returns
    -------
    The result of ``pd.concat`` over the loaded objects, in sorted file-name order.

    Raises
    ------
    ValueError
        If no file in ``directory`` starts with ``file_pattern``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # of the concatenated result reproducible across runs and machines.
    matching_files = sorted(
        name for name in os.listdir(directory) if name.startswith(file_pattern)
    )
    if not matching_files:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(
            f"No files starting with {file_pattern!r} found in {directory!r}"
        )
    frames = [pd.read_pickle(os.path.join(directory, name)) for name in matching_files]
    return pd.concat(frames)

import pickle
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
# The `with` blocks close each file handle as soon as its load finishes.
with open(f'{directory}/posts_with_topic.pkl', 'rb') as fh:
    posts = pickle.load(fh)
print("Posts loaded")

answers = load_pickle_files(directory, 'posts_typecasted_')
# Keep only the rows with PostTypeId == 2 (presumably the answer post type — confirm against the dump schema).
answers = answers[answers["PostTypeId"] == 2]
print("Answers loaded")

with open(f'{directory}/comments_typecasted.pkl', 'rb') as fh:
    comments = pickle.load(fh)
print("Comments loaded")


Posts loaded
Answers loaded
Comments loaded


In [2]:
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(f'{directory}/tag_counts.pkl', 'rb') as fh:
    TAG_COUNTS = pickle.load(fh)
print("Tag counts loaded")
# List indexed by topic; each entry is loaded as an iterable of (key, value) pairs.
with open(f'{directory}/communities_louvain.pkl', 'rb') as fh:
    TOPIC_TAGS = pickle.load(fh)
# Convert every topic to a dictionary so that membership tests (`tag in topic`)
# are hash lookups. Keys are presumably tag names — confirm against the pickle.
TOPIC_TAGS = [{key: value for (key, value) in topic} for topic in TOPIC_TAGS]
print("Topic tags loaded")

Tag counts loaded
Topic tags loaded


In [4]:
# Load the users table; the `with` block closes the file handle after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(f'{directory}/active_users_with_sentiment.pkl', 'rb') as fh:
    USERS = pickle.load(fh)

# Clean data

In [5]:
def percentage_of_nan(column):
    """Return the percentage (0-100, rounded to two decimals) of missing values in ``column``.

    Parameters
    ----------
    column : pandas.Series
        Any object exposing ``isna()`` and ``len()``.

    Returns
    -------
    float
        ``0.0`` for an empty column (instead of raising ZeroDivisionError).
    """
    total = len(column)
    if total == 0:
        return 0.0
    # Vectorised count of missing entries; avoids a Python-level sum over booleans.
    missing = int(column.isna().sum())
    return round(100 * missing / total, 2)

In [6]:
# Id -> ViewCount lookup tables, one built from `posts` and one from `answers`.
posts_mapping = {
    post_id: views for post_id, views in zip(posts['Id'], posts['ViewCount'])
}
answers_mapping = {
    answer_id: views for answer_id, views in zip(answers['Id'], answers['ViewCount'])
}

# Union of both tables; if an Id appears in both, the `answers` entry wins.
all_mapping = dict(posts_mapping)
all_mapping.update(answers_mapping)

In [7]:
# Keep only the answers whose ParentId maps to a non-null entry of posts_mapping.
# NOTE(review): the test looks at the mapped ViewCount value, so a parent that is
# present but has a null ViewCount is dropped as well.
# .copy() makes the result an independent frame, so the column assignments made
# in later cells do not raise SettingWithCopyWarning.
answers = answers[answers['ParentId'].map(posts_mapping).notna()].copy()

In [8]:
# Keep only the comments whose PostId maps to a non-null entry of all_mapping.
# NOTE(review): the test looks at the mapped ViewCount value, so a comment on a
# known post with a null ViewCount is dropped as well.
# .copy() makes the result an independent frame, so the column assignments made
# in later cells do not raise SettingWithCopyWarning.
comments = comments[comments['PostId'].map(all_mapping).notna()].copy()

In [9]:
# Sanity check: share (in %) of comments whose PostId has no usable entry in all_mapping.
percentage_of_nan(column=comments['PostId'].map(all_mapping))

0.0

In [10]:
# Number of comment rows currently held in `comments`.
comments.shape[0]

6435484

In [11]:
# Post Id -> Topic lookup built from `posts`.
question_topic_mapping = {
    post_id: topic for post_id, topic in zip(posts['Id'], posts['Topic'])
}
# Every answer takes the Topic of the post referenced by its ParentId.
answers['Topic'] = answers['ParentId'].map(question_topic_mapping)

In [12]:
# Answer Id -> Topic lookup built from `answers`.
answer_topic_mapping = {
    answer_id: topic for answer_id, topic in zip(answers['Id'], answers['Topic'])
}

# Combined lookup over posts and answers; on an Id collision the answer entry wins.
posts_topic_mapping = dict(question_topic_mapping)
posts_topic_mapping.update(answer_topic_mapping)

# Every comment takes the Topic of the post or answer referenced by its PostId.
comments['Topic'] = comments['PostId'].map(posts_topic_mapping)

In [15]:
# Persist the cleaned objects. The `with` blocks guarantee each file is flushed
# and closed, which a bare open() passed straight to pickle.dump does not.
with open(f'{directory}/answers_clean.pkl', 'wb') as fh:
    pickle.dump(answers, fh)
with open(f'{directory}/comments_clean.pkl', 'wb') as fh:
    pickle.dump(comments, fh)
with open(f'{directory}/topic_tags_clean.pkl', 'wb') as fh:
    pickle.dump(TOPIC_TAGS, fh)

# Create new columns

## Views and ViewCount

In [13]:
# Factor applied to a post's ViewCount before it is assigned to its answers.
# NOTE(review): the rationale for 0.5 is not documented in this notebook.
post_probability = 0.5

# Post Id -> scaled ViewCount.
posts_views_mapping = {
    post_id: views * post_probability
    for post_id, views in zip(posts['Id'], posts['ViewCount'])
}
# Every answer receives the scaled ViewCount of the post referenced by its ParentId.
answers['ViewCount'] = answers['ParentId'].map(posts_views_mapping)

In [14]:
# Factor applied to a post's or answer's ViewCount before it is assigned to its comments.
# NOTE(review): the rationale for 0.7 is not documented in this notebook.
comment_probability = 0.7
posts_views_mapping = {
    post_id: views * comment_probability
    for post_id, views in zip(posts['Id'], posts['ViewCount'])
}
answers_views_mapping = {
    answer_id: views * comment_probability
    for answer_id, views in zip(answers['Id'], answers['ViewCount'])
}
# Look up each comment's PostId in the combined table (answer entries win on collision).
comments['Views'] = comments['PostId'].map(
    {**posts_views_mapping, **answers_views_mapping}
)

In [None]:
# Re-save answers and comments. The `with` blocks guarantee each file is flushed
# and closed, which a bare open() passed straight to pickle.dump does not.
with open(f'{directory}/answers_clean.pkl', 'wb') as fh:
    pickle.dump(answers, fh)
with open(f'{directory}/comments_clean.pkl', 'wb') as fh:
    pickle.dump(comments, fh)

## Main topic

In [10]:
import os
import pandas as pd
import numpy as np
import pickle

# Folder holding the pickled datasets.
directory = './picklefiles'

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# Each `with` block closes its file handle as soon as the load finishes.
with open(f'{directory}/posts_with_topic.pkl', 'rb') as fh:
    posts = pickle.load(fh)
print("Posts loaded")
with open(f'{directory}/answers_clean.pkl', 'rb') as fh:
    answers = pickle.load(fh)
print("Answers loaded")
with open(f'{directory}/comments_clean.pkl', 'rb') as fh:
    comments = pickle.load(fh)
print("Comments loaded")
with open(f'{directory}/topic_tags_clean.pkl', 'rb') as fh:
    TOPIC_TAGS = pickle.load(fh)
print("Topic tags loaded")
with open(f'{directory}/tag_counts.pkl', 'rb') as fh:
    TAG_COUNTS = pickle.load(fh)
print("Tag counts loaded")
with open(f'{directory}/active_users_with_sentiment.pkl', 'rb') as fh:
    USERS = pickle.load(fh)

Posts loaded
Answers loaded
Comments loaded
Topic tags loaded
Tag counts loaded


Build a dictionary that links each tag to its corresponding topic.

In [4]:
def find_topic(tag, topic_tags=None):
    """Return the index of the first topic that contains ``tag``, or ``None``.

    Parameters
    ----------
    tag : hashable
        Tag to look up.
    topic_tags : sequence of containers, optional
        One container of tags per topic. Defaults to the module-level
        ``TOPIC_TAGS`` so existing ``find_topic(tag)`` calls keep working.

    Returns
    -------
    int or None
        Position of the first topic for which ``tag in topic`` holds; ``None``
        when no topic contains the tag.
    """
    if topic_tags is None:
        topic_tags = TOPIC_TAGS
    return next(
        (index for index, topic in enumerate(topic_tags) if tag in topic),
        None,
    )

# Dictionary TAG -> TOPIC (index into TOPIC_TAGS, or None when no topic contains the tag)
TAG_TOPICS = dict((tag, find_topic(tag)) for tag in TAG_COUNTS)
print("Tag topics built")

Tag topics built


Get the main topic of a post (the topic under which the post will be analyzed). We follow these criteria:
1. The main topic is the one that matches the most tags of the post.
2. If several topics tie, the topic whose matching tags have the fewest total instances is picked.

In [8]:
def get_main_topic(row, verbosity = 0, topic_tags=None, tag_counts=None):
    """Pick the main topic of a post from its per-topic tag counts.

    Parameters
    ----------
    row : mapping
        Must provide ``row["Topic"]`` (one tag count per topic, indexed by topic)
        and ``row["Tags"]`` (iterable of the post's tags).
    verbosity : int, optional
        When greater than 0, debug information is printed.
    topic_tags : sequence of containers, optional
        Tags belonging to each topic. Defaults to the module-level ``TOPIC_TAGS``.
    tag_counts : mapping, optional
        Tag -> number of instances. Defaults to the module-level ``TAG_COUNTS``.

    Returns
    -------
    int or None
        Index of the topic with the highest tag count. Ties are broken in favour
        of the topic whose matching tags have the smallest summed instance count
        (the lowest index wins if that is tied too). ``None`` is returned when
        the post has no tag in any topic or when ``row["Topic"]`` is empty.

    Raises
    ------
    KeyError
        If a tie-breaking tag is missing from ``tag_counts``.
    """
    if topic_tags is None:
        topic_tags = TOPIC_TAGS
    if tag_counts is None:
        tag_counts = TAG_COUNTS

    topic_tag_counts = row["Topic"]
    # max() on an empty sequence raises; treat it as "no topic".
    if len(topic_tag_counts) == 0:
        return None

    max_tag = max(topic_tag_counts)
    # Topics that share the highest count; max_tag always occurs at least once.
    drawn_topics = [i for (i, tag_count) in enumerate(topic_tag_counts) if tag_count == max_tag]
    count_max = len(drawn_topics)
    if verbosity > 0:
        print(f"Max: {max_tag} - Count: {count_max}")

    # A highest count of zero means no tag of the post belongs to any topic.
    # The previous `count_max == 0` guard could never fire, so such posts were
    # sent to the tie-break and ended up assigned to a topic.
    if max_tag <= 0:
        return None
    if count_max == 1:
        # Plain int (not a numpy integer) so both branches return the same type.
        return drawn_topics[0]

    if verbosity > 0:
        print("else")
    # Tie-break: for every drawn topic, sum the instance counts of the post's
    # tags that belong to that topic, then keep the topic with the smallest sum.
    instance_count = {
        i: sum(tag_counts[tag] for tag in row["Tags"] if tag in topic_tags[i])
        for i in drawn_topics
    }
    return min(instance_count, key=instance_count.get)

In [6]:
# Smoke test: topics 0 and 2 tie with one tag each, so the tie-break decides.
test = dict(
    Topic=[1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
    Tags=["do-while", "android"],
)
get_main_topic(test)

0

In [13]:
# import pandas as pd
# import multiprocessing as mp
# from tqdm import tqdm

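# # Disabled multiprocessing variant, kept for reference; the Dask cell below computes the same 'MainTopic' column.
# # Assumes 'posts' is the DataFrame and 'get_main_topic' is the function defined above.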

# # Function to apply in parallel
# def apply_parallel(df_chunk):
#     return df_chunk.apply(lambda row: get_main_topic(row), axis=1)

# # Split DataFrame into chunks
# def split_dataframe(df, num_partitions):
#     return np.array_split(df, num_partitions)

# # Parallelize the apply function
# def parallelize_dataframe(df, func, num_partitions):
#     with mp.Pool(mp.cpu_count()) as pool:
#         df_split = split_dataframe(df, num_partitions)
#         results = list(tqdm(pool.imap(func, df_split), total=len(df_split), desc="Processing"))
#     return pd.concat(results)

# # Apply the function in parallel
# num_partitions = mp.cpu_count()  # Number of partitions to split dataframe
# posts['MainTopic'] = parallelize_dataframe(posts, apply_parallel, num_partitions)


Processing:   0%|          | 0/12 [00:00<?, ?it/s]

In [11]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Convert the pandas DataFrame to a Dask DataFrame
dask_posts = dd.from_pandas(posts, npartitions=8) # Adjust the number of partitions based on your dataset

# Apply get_main_topic to every row. The function is passed directly (no lambda
# wrapper is needed) and is called with its default verbosity.
# meta declares the new column as object dtype: get_main_topic returns a topic
# index or None, not a list.
dask_posts['MainTopic'] = dask_posts.apply(get_main_topic, axis=1, meta=('MainTopic', 'object'))

# Compute the results to get back a pandas DataFrame with a progress bar
with ProgressBar():
    posts = dask_posts.compute()

[########################################] | 100% Completed | 71.25 s


In [14]:
# Display the frame to inspect the MainTopic column added by the cell above.
posts

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,Topic,MainTopic
160,337,1,-1,342,2008-08-02 03:35:55.697,82,10424,<p>I am about to build a piece of a project th...,111,2021-11-12 18:56:21.143,XML Processing in Python,"(xml,)",12,1,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",5
195,469,1,-1,3040,2008-08-02 15:11:16.430,48,4837,<p>I am using the Photoshop's javascript API t...,147,2022-12-15 07:51:30.287,How can I find the full path to a font from it...,"(macos, fonts, photoshop)",6,0,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",3
202,502,1,-1,7090,2008-08-02 17:01:58.500,59,17726,<p>I have a cross-platform (Python) applicatio...,147,2020-06-17 20:47:48.287,Get a preview JPEG of a PDF on Windows?,"(windows, image, pdf)",3,0,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",9
215,535,1,-1,541,2008-08-02 18:43:54.787,69,9590,<p>I am starting to work on a hobby project wi...,154,2018-05-14 17:46:14.650,Continuous Integration System for a Python Cod...,"(continuous-integration, extreme-programming)",7,0,"[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2
240,594,1,-1,595,2008-08-03 01:15:08.507,55,58973,<p>There are several ways to iterate over a re...,116,2016-10-15 20:47:11.027,cx_Oracle: How do I iterate over a result set?,"(sql, database, oracle, cx-oracle)",3,1,"[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59171855,77031662,1,-1,-1,2023-09-03 09:21:50.950,0,9,<p>Write a Python function named bookStore tha...,22491996,2023-09-03 09:21:50.950,How do I make sure I'm only taking a certain v...,"(function, dictionary)",0,1,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0
59171857,77031664,1,-1,-1,2023-09-03 09:22:27.673,-1,7,<p>i have a problem with updating the text box...,21080209,2023-09-03 09:22:27.673,Updating Textbox on a real time Chat,"(function, pycharm, chat)",0,0,"[1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2
59171873,77031681,1,-1,-1,2023-09-03 09:28:20.777,0,5,<p>I've setup a django cms site for a friend s...,5568154,2023-09-03 09:28:20.777,Django CMS cms_plugin setup for a message form...,"(django, django-cms)",0,0,"[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1
59171876,77031684,1,-1,-1,2023-09-03 09:30:08.627,0,11,<p>I think I'm having an integer overflow issu...,9884278,2023-09-03 09:34:06.857,Integer Overflow when Passing Arguments in C,"(c, integer-overflow)",1,1,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0


In [15]:
# posts['MainTopic'] = posts.apply(lambda row : get_main_topic(row), axis=1)

Assign to each answer the main topic of its parent post.

In [None]:
# Post Id -> MainTopic lookup built from `posts`.
question_maintopic_mapping = {
    post_id: main_topic
    for post_id, main_topic in zip(posts['Id'], posts['MainTopic'])
}
# Every answer takes the MainTopic of the post referenced by its ParentId.
answers['MainTopic'] = answers['ParentId'].map(question_maintopic_mapping)
print("Answers topic assigned")

Assign to each comment the main topic of the post or answer it was written on.

In [None]:
# Answer Id -> MainTopic lookup built from `answers`.
answer_maintopic_mapping = {
    answer_id: main_topic
    for answer_id, main_topic in zip(answers['Id'], answers['MainTopic'])
}

# Combined lookup over posts and answers; on an Id collision the answer entry wins.
posts_topic_mapping = dict(question_maintopic_mapping)
posts_topic_mapping.update(answer_maintopic_mapping)

# Every comment takes the MainTopic of the post or answer referenced by its PostId.
comments['MainTopic'] = comments['PostId'].map(posts_topic_mapping)
print("Comments topic assigned")

## Scoring

In [12]:
def get_sentiment(sentiment, i):
    """Return the ``i``-th entry of ``sentiment`` when it is a list, otherwise ``0``.

    Parameters
    ----------
    sentiment : list or any
        Per-index values; any non-list value (e.g. NaN or None) is treated as
        "no sentiment available".
    i : int
        Position to read.

    Returns
    -------
    ``sentiment[i]`` for a list (including list subclasses), else ``0``.

    Raises
    ------
    IndexError
        If ``sentiment`` is a list and ``i`` is out of range.
    """
    # isinstance also accepts list subclasses, unlike an exact type comparison.
    if isinstance(sentiment, list):
        return sentiment[i]
    return 0

In [13]:
#Original method

def scoring(topic, weights, row):
    """Score one user for one topic by filtering the full frames (reference implementation).

    NOTE: a grouped implementation defined later in this notebook reuses the name
    ``scoring`` and replaces this function once its cell has run.

    Parameters
    ----------
    topic : int
        Topic index compared against the ``MainTopic`` column.
    weights : sequence of three numbers
        Weights of the answer, comment and post terms, in that order.
    row : mapping
        A user row providing ``row['Id']`` and ``row['AnswerSentiment']``.

    Returns
    -------
    Weighted sum of the answer, comment and post terms. A term whose summed view
    count is not positive contributes 0.

    Requires the module-level frames ``answers``, ``comments`` and ``posts`` to
    carry a ``MainTopic`` column.
    """
    user_id = row['Id']
    # Element-wise conditions must be combined with `&`; the Python `and`
    # operator raises "truth value of a Series is ambiguous".
    userAnswers = answers[(answers["OwnerUserId"] == user_id) & (answers["MainTopic"] == topic)]
    userComments = comments[(comments["UserId"] == user_id) & (comments["MainTopic"] == topic)]
    userPosts = posts[(posts["OwnerUserId"] == user_id) & (posts["MainTopic"] == topic)]

    def ratio(score_total, view_total):
        # Guard against empty selections / zero views instead of dividing by zero.
        return score_total / view_total if view_total > 0 else 0

    # NOTE(review): 20 is an undocumented scaling constant of the answer term.
    answer_term = len(userAnswers) * (
        get_sentiment(row["AnswerSentiment"], topic)
        * 20 * ratio(userAnswers["Score"].sum(), userAnswers["ViewCount"].sum())
    ) * weights[0]
    # The score/view ratios of comments and posts are capped at 1.
    comment_term = len(userComments) * min(
        ratio(userComments["Score"].sum(), userComments["Views"].sum()), 1
    ) * weights[1]
    # `posts` exposes its views as "ViewCount"; only `comments` has a "Views" column.
    post_term = min(
        ratio(userPosts["Score"].sum(), userPosts["ViewCount"].sum()), 1
    ) * weights[2]
    return answer_term + comment_term + post_term

In [13]:
# Weights of the answer, comment and post terms of the score, in that order.
WEIGHTS = [0.5, 0.5, 0]


# Pre-aggregate once per (user, main topic): summed Score, summed views and row count.
# All three frames must already carry a 'MainTopic' column, otherwise groupby
# raises KeyError: 'MainTopic'.
answers_grouped = answers.groupby(['OwnerUserId', 'MainTopic']).agg({'Score': 'sum', 'ViewCount': 'sum', 'Id': 'size'})
comments_grouped = comments.groupby(['UserId', 'MainTopic']).agg({'Score': 'sum', 'Views': 'sum', 'Id': 'size'})
# `posts` exposes its views as 'ViewCount'; it has no 'Views' column.
posts_grouped = posts.groupby(['OwnerUserId', 'MainTopic']).agg({'Score': 'sum', 'ViewCount': 'sum', 'Id': 'size'})


def _group_totals(grouped, key, views_column):
    """Return (score_sum, views_sum, row_count) stored under ``key``, or zeros when ``key`` is absent."""
    if key in grouped.index:
        totals = grouped.loc[key]
        return totals['Score'], totals[views_column], totals['Id']
    return 0, 0, 0


def _safe_ratio(score_total, view_total):
    """Return ``score_total / view_total``, or 0 when ``view_total`` is not positive."""
    return score_total / view_total if view_total > 0 else 0


def scoring(topic, weights, row):
    """Score one user for one topic using the pre-aggregated frames.

    Parameters
    ----------
    topic : int
        Topic index (second level of the grouped indexes).
    weights : sequence of three numbers
        Weights of the answer, comment and post terms, in that order.
    row : mapping
        A user row providing ``row['Id']`` and ``row['AnswerSentiment']``.

    Returns
    -------
    Weighted sum of the answer, comment and post terms; a user without activity
    in ``topic`` contributes 0 for the corresponding term.
    """
    # The same user id keys all three lookups (USERS rows expose it as 'Id').
    key = (row['Id'], topic)
    answer_score, answer_views, n_answers = _group_totals(answers_grouped, key, 'ViewCount')
    comment_score, comment_views, n_comments = _group_totals(comments_grouped, key, 'Views')
    post_score, post_views, _ = _group_totals(posts_grouped, key, 'ViewCount')

    # NOTE(review): 20 is an undocumented scaling constant of the answer term.
    answer_term = n_answers * (
        get_sentiment(row["AnswerSentiment"], topic) * 20 * _safe_ratio(answer_score, answer_views)
    ) * weights[0]
    # Ratios are capped at 1 with min(), matching the original row-filtering
    # method; max() would turn the cap into a floor of 1.
    comment_term = n_comments * min(_safe_ratio(comment_score, comment_views), 1) * weights[1]
    post_term = min(_safe_ratio(post_score, post_views), 1) * weights[2]

    return answer_term + comment_term + post_term

KeyError: 'MainTopic'

In [None]:
# USERS["Score"] = USERS.apply(lambda row: [scoring(i, WEIGHTS, row) for i,_ in enumerate(TOPIC_TAGS)])

In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Wrap USERS in a Dask DataFrame so the row-wise scoring runs per partition.
dask_users = dd.from_pandas(USERS, npartitions=8) # Adjust the number of partitions based on your dataset

# Row-level scorer: one score per topic index, in topic order.
lambda_function = lambda row: [
    scoring(topic_index, WEIGHTS, row) for topic_index in range(len(TOPIC_TAGS))
]

# Each row yields a list of scores, so the new column is declared as object dtype.
dask_users['Score'] = dask_users.apply(lambda_function, axis=1, meta=('Score', 'object'))

# Materialise the result as a pandas DataFrame, showing a progress bar meanwhile.
with ProgressBar():
    USERS = dask_users.compute()