# Rec Sys of Answerers for StackOverflow
## Version 1.0 - NLP Approach

Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import torch

Loading Data

In [2]:
df_question = pd.read_csv("dataset/questions_2019.csv")
df_answer = pd.read_csv("dataset/answers_2019.csv")

In [3]:
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [4]:
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transforming the list of tags

In [5]:
# Parse the stringified tag lists read from the CSV (e.g. "['c#', 'winforms']") into Python lists.
# The column is overwritten in place, so values that are already lists are passed through
# untouched: re-running this cell no longer fails inside ast.literal_eval.
df_question['QuestionTags'] = df_question['QuestionTags'].apply(
    lambda tags: tags if isinstance(tags, list) else ast.literal_eval(tags)
)
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


In [6]:
type(df_question['QuestionTags'][0])

list

### Filtering Dataset to include only top 5 tags

In [7]:
# The five tags this prototype is restricted to (kept in a set for fast membership checks)
tags_to_filter = {'java', 'python', 'javascript', 'c#', 'android'}

# Keep a question as soon as at least one of its tags belongs to the target set
has_target_tag = df_question['QuestionTags'].apply(
    lambda tags: not tags_to_filter.isdisjoint(tags)
)
filtered_df = df_question[has_target_tag]


### Dividing the question dataset into Train and Test

In [8]:
# Hold out 10% of the filtered questions as the test set; the fixed random_state keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(filtered_df, test_size=0.1, random_state=42)

Merging question titles and tags with the answering user (AnswerOwnerId)

In [9]:
# Inner join on QuestionId: one row per (training question, answer), carrying the question's
# title and tags plus the id of the user who answered. Training questions that have no row in
# df_answer drop out of the result.
df_merged = train_data[["QuestionId","QuestionTitle","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )

In [10]:
# Map each AnswerOwnerId to the list of training questions that user answered; every entry is
# a dict with the keys 'QuestionTitle' and 'QuestionTags'.
dic_user_questions = df_merged.groupby("AnswerOwnerId").apply(lambda x: x[['QuestionTitle', 'QuestionTags']].to_dict(orient='records')).to_dict()

# Extract list of question of the same user
# user_question_list = df_merged.groupby("AnswerOwnerId")["QuestionTitle", "QuestionTags"].apply(list).to_dict()
# user_question_list = df_merged.iloc[:1000].groupby("AnswerOwnerId")["QuestionTitle"].apply(list).to_dict() #REMOVE: here for testing

### Extracting User Profile Vector

In [11]:
# # Load transformer model 

# # Check if GPU is available
# device = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {device}")
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# # Computing user vector 
# user_vectors = {}
# for user, questions in tqdm(user_question_list.items(), desc="Computing user profile vector"):
#     question_embeddings = model.encode(questions)  # Process in batch the questions of the same user
#     user_profile_vector = np.mean(question_embeddings, axis=0)  
#     user_vectors[user] = user_profile_vector 
    


In [12]:
# Pick the fastest available backend: CUDA GPU first, then Apple-Silicon MPS, then CPU.
# Previously only MPS was checked, so a machine with an NVIDIA GPU silently ran on the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Initialize the sentence-embedding model on the selected device
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

Using device: mps


Tags

In [13]:
# Gather every distinct tag that occurs on the training (question, answer) rows
unique_tags = list({tag for tags in df_merged['QuestionTags'] for tag in tags})

# Embed each tag string once with the sentence-transformer model
tags_embedding = model.encode(unique_tags, batch_size=64, show_progress_bar=True)

# Lookup table: tag -> its embedding vector
dic_tags_embedding = dict(zip(unique_tags, tags_embedding))


Batches:   0%|          | 0/296 [00:00<?, ?it/s]

Questions

In [28]:
# Flatten every user's question titles into one list so they can be embedded in a single
# encode() call; remember the [start, end) slice of that list that belongs to each user.
all_questions = []
user_question_indices = {}

index_counter = 0

for user, questions in dic_user_questions.items():
    titles = [question["QuestionTitle"] for question in questions]
    next_counter = index_counter + len(titles)
    user_question_indices[user] = (index_counter, next_counter)  # half-open slice bounds
    all_questions += titles
    index_counter = next_counter

In [None]:
# Encode every question title with one encode() call; batch_size=64 is passed, so the model
# still works through the titles in mini-batches rather than in one giant batch.
all_embeddings = model.encode(all_questions, batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/6392 [00:00<?, ?it/s]

In [91]:
all_embeddings.shape

(409027, 384)

In [92]:
# Number of distinct answerers in the training data. Counted from user_question_indices,
# because user_vectors is only built further down the notebook: referencing it here raised
# NameError on a fresh Restart & Run All. Both dicts are keyed by the same users.
len(user_question_indices)

127581

Tags: mean tag embedding per user

In [30]:
# Tag profile per user: the mean embedding of the distinct tags found on the questions
# that user answered (the concatenation with the title embedding happens in a later cell)
user_tag_embedding = {}
for user, questions in dic_user_questions.items():
    answered_tags = [tag for question in questions for tag in question["QuestionTags"]]
    distinct_tags = set(answered_tags)  # each tag counts once, however often it occurs

    user_tag_embedding[user] = np.mean(
        [dic_tags_embedding[tag] for tag in distinct_tags], axis=0
    )
    

In [31]:
# User profile = [mean title embedding of the user's questions | mean tag embedding of the user]
user_vectors = {
    user: np.concatenate(
        (np.mean(all_embeddings[start:end], axis=0), user_tag_embedding[user])
    )
    for user, (start, end) in user_question_indices.items()
}

In [32]:
# Freeze the user order: row i of user_embeddings belongs to user_ids[i], which is how
# similarity columns are mapped back to user ids when recommending
user_ids = list(user_vectors)
# Shape: (num_users, title_dim + tag_dim), since each profile is title mean ++ tag mean
user_embeddings = np.vstack([user_vectors[user_id] for user_id in user_ids]).astype('float32')

In [33]:
print(f"Number of users: {user_embeddings.shape[0]} - Vector size: {user_embeddings.shape[1]}")

Number of users: 127581 - Vector size: 768


In [34]:
# Normalize user embeddings for cosine similarity
user_embeddings = user_embeddings / np.linalg.norm(user_embeddings, axis=1, keepdims=True)

### Computing Unanswered questions embeddings

In [35]:
# Inner join of the held-out questions with their answers: one row per (test question, answer),
# so a question that received several answers appears on several rows.
df_merged_test = test_data[["QuestionId","QuestionTitle","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )
# df_merged_test = df_merged_test.iloc[:1000] #REMOVE: Here for testing

In [36]:
unanswered_embedding_index = df_merged_test["QuestionId"].to_list()

In [37]:
unanswered_questions = df_merged_test["QuestionTitle"].to_list()


Getting Embedding from Unanswered Questions

In [53]:
# Compute embeddings for unanswered questions
unanswered_embeddings = model.encode(unanswered_questions, batch_size=64, show_progress_bar=True) 

Batches:   0%|          | 0/714 [00:00<?, ?it/s]

Getting Tag Embedding from Unanswered Questions

In [48]:
# Gather every distinct tag that occurs on the held-out (question, answer) rows
unique_tags_test = list({tag for tags in df_merged_test['QuestionTags'] for tag in tags})

# Embed each tag string once with the sentence-transformer model
tags_embedding_test = model.encode(unique_tags_test, batch_size=64, show_progress_bar=True)

# Lookup table: tag -> its embedding vector
dic_tags_embedding_test = dict(zip(unique_tags_test, tags_embedding_test))

Batches:   0%|          | 0/119 [00:00<?, ?it/s]

In [49]:
# One tag vector per test row: the mean of the embeddings of that question's tags
list_tags_new_questions = df_merged_test['QuestionTags'].to_list()
new_question_tag_embedding = [
    np.mean([dic_tags_embedding_test[tag] for tag in tags], axis=0)
    for tags in list_tags_new_questions
]
    

In [57]:
unanswered_embeddings.shape

(45645, 384)

Concatenate the arrays

In [60]:
unanswered_embeddings_concat = np.concatenate((unanswered_embeddings, new_question_tag_embedding), axis=1)

In [69]:
unanswered_embeddings_concat.shape

(45645, 768)

Normalizing

In [70]:
# Normalize unanswered embeddings
unanswered_embeddings = unanswered_embeddings_concat / np.linalg.norm(unanswered_embeddings_concat, axis=1, keepdims=True)

In [71]:
unanswered_embeddings = unanswered_embeddings.astype('float32')

### Providing Recommendation based on Embedding Similarity

In [72]:
# Calculate cosine similarity (Matrix multiplication)
# Each row is a question and column is a user
# similarities = np.dot(unanswered_embeddings, user_embeddings.T)  

Creating dictionary QuestionId and Top Suggestions

In [73]:
# def get_recommendation_top(top_n):
#     top_indices = np.argsort(-similarities, axis=1)[:, :top_n]  # Negative for descending sort
#     # top_scores = np.sort(-similarities, axis=1)[:, :top_n]
#     # top_scores = - top_scores
    
#     # Map indices to the ones in the dataset
#     top_system_id = []
#     for indices_list in top_indices: 
#         mapping = []
#         for i in indices_list:
#             mapping.append(user_ids[i])
#         top_system_id.append(mapping)
#         dic_recommendation = {question_id: top_system for question_id, top_system in zip(unanswered_embedding_index,top_system_id)}
    
#     return dic_recommendation

Alternative method to avoid memory overload

In [76]:
import gc  # Garbage collector

def get_recommendation_top_batched(top_n, batch_size=1000):
    """Yield ``(question_id, top_user_ids)`` for every test question, one batch at a time.

    Works on the notebook-level variables ``unanswered_embeddings`` (one row per test
    question), ``user_embeddings`` (one row per user), ``user_ids`` (user id of each
    ``user_embeddings`` row) and ``unanswered_embedding_index`` (question id of each
    ``unanswered_embeddings`` row). Both matrices are L2-normalised earlier in the
    notebook, so the dot product computed here is the cosine similarity.

    Only the ``top_n`` best users per question are selected and ordered
    (``np.argpartition`` followed by a sort of those ``top_n`` candidates) instead of
    fully sorting the similarity row of every question against all users. Users with
    exactly equal scores may come out in a different relative order than a full sort
    would give; NaN scores still rank last.

    Args:
        top_n: Number of users to recommend per question. Values larger than the
            number of users are clamped to it; 0 yields empty recommendation lists.
        batch_size: Number of questions scored per similarity-matrix block; bounds
            the size of the temporary (batch_size, num_users) matrix.

    Yields:
        Tuples ``(question_id, [user_id, ...])`` with users ordered from most to
        least similar.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    num_test = unanswered_embeddings.shape[0]
    num_users = user_embeddings.shape[0]
    k = min(top_n, num_users)  # cannot recommend more users than exist

    for start in tqdm(range(0, num_test, batch_size), desc="Processing Batches"):
        end = min(start + batch_size, num_test)
        batch = unanswered_embeddings[start:end]

        # Compute similarities (rows = questions, columns = users)
        batch_similarities = np.dot(batch, user_embeddings.T)

        if k == 0:
            top_indices = np.empty((end - start, 0), dtype=np.intp)
        else:
            # Negate in place (no second full-size copy) so "smallest" means "most similar"
            # and NaN scores keep sorting to the end, exactly as with argsort(-similarities).
            np.negative(batch_similarities, out=batch_similarities)

            # Select the k best users per row without ordering the remaining columns
            candidates = np.argpartition(batch_similarities, k - 1, axis=1)[:, :k]

            # Order just those k candidates from most to least similar
            candidate_scores = np.take_along_axis(batch_similarities, candidates, axis=1)
            order = np.argsort(candidate_scores, axis=1)
            top_indices = np.take_along_axis(candidates, order, axis=1)

        del batch_similarities
        gc.collect()  # Force Python to release memory

        # Yield results one batch at a time
        for i, question_id in enumerate(unanswered_embedding_index[start:end]):
            yield question_id, [user_ids[j] for j in top_indices[i]]

# Drain the generator straight into a {question_id: recommended_user_ids} dict (top 50 each)
dic_recommendation = dict(get_recommendation_top_batched(top_n=50))


Processing Batches: 100%|██████████| 46/46 [07:45<00:00, 10.12s/it]


## Evaluation

Grouping our **ground truth**, i.e. the users who actually answered each question

In [83]:
# Lists are unhashable, so turn each tag list into a tuple before using it as a group key
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# One row per question: gather every AnswerOwnerId of that question into a "GroundTruth" list
question_keys = ["QuestionId", "QuestionTags", "QuestionTitle"]
answerers_per_question = df_merged_test.groupby(question_keys)["AnswerOwnerId"].apply(list)
grouped_df = answerers_per_question.reset_index(name="GroundTruth")

In [84]:
# orient='records' produces a list with one dict per question (keys: QuestionId, QuestionTags,
# QuestionTitle, GroundTruth), so despite the "_dic" suffix this is a list of dicts.
ground_truth_dic = grouped_df.to_dict(orient='records')

Compare Ground Truth with Recommendation

In [85]:
def top_n_accuracy(ground_truth_dic, top_users_list, top_n):
    """Return the share of questions with at least one real answerer in their top-N list.

    A question counts as a hit when any user in its "GroundTruth" collection appears
    among the first ``top_n`` recommended users for that question.

    Args:
        ground_truth_dic: Sized iterable of records; each record provides
            "QuestionId" and "GroundTruth" (the users who really answered).
        top_users_list: Mapping from QuestionId to the ranked list of recommended
            user ids.
        top_n: Number of leading recommendations to consider per question.

    Returns:
        hits / number of questions as a float; 0.0 when there are no questions
        (previously this case raised ZeroDivisionError).

    Raises:
        KeyError: If a question has no entry in ``top_users_list``.

    Side effects:
        Prints the raw number of hits.
    """
    total = len(ground_truth_dic)
    tp = 0

    for question in ground_truth_dic:
        # Only the first top_n recommendations are allowed to score a hit
        users_recommendation = top_users_list[question["QuestionId"]][:top_n]
        users_ground_truth = question["GroundTruth"]
        if set(users_recommendation) & set(users_ground_truth):
            tp += 1

    print(tp)
    # Guard the empty evaluation set instead of dividing by zero
    return tp / total if total else 0.0

In [None]:
# Evaluate the hit rate at several recommendation-list cut-offs
top_users = [10, 20, 50]
results = [
    {
        "acc": top_n_accuracy(ground_truth_dic, dic_recommendation, top_n),
        "top": top_n,
        "type": "nlp_sentence_transformer_tag",
    }
    for top_n in top_users
]
    

1694
2420
3809


In [88]:
results

[{'acc': 0.059713067080263665, 'top': 10, 'type': 'nlp_sentence_transformer'},
 {'acc': 0.08530438154323382, 'top': 20, 'type': 'nlp_sentence_transformer'},
 {'acc': 0.13426627656949486, 'top': 50, 'type': 'nlp_sentence_transformer'}]

In [89]:
pd.DataFrame(results).to_csv("prototype_embedding_tags.csv", index=False)