# Rec Sys of Answerers for StackOverflow
## Version 2.0 - NLP Approach

Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import torch

Loading Data

In [3]:
df_question = pd.read_csv("dataset/questions_2019.csv")
df_answer = pd.read_csv("dataset/answers_2019.csv")

In [4]:
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [5]:
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transfoming list of tags

In [6]:
df_question['QuestionTags'] = df_question['QuestionTags'].apply(ast.literal_eval)
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


In [7]:
type(df_question['QuestionTags'][0])

list

### Filtering Dataset to include only top 5 tags

In [8]:
# tags_to_filter = {'java', 'python', 'javascript', 'c#', 'android'}  # Faster lookup structure

# # Filter rows where any tag matches one of the target tags
# df_question = df_question[df_question['QuestionTags'].apply(lambda tags: any(tag in tags_to_filter for tag in tags))]


### Dividing the question dataset into Train and Test

In [9]:
# Ordering dataframe before split
# Convert 'QuestionCreationDate' to datetime format using .loc
df_question.loc[:, 'QuestionCreationDate'] = pd.to_datetime(df_question['QuestionCreationDate'])

# Sort the DataFrame by 'QuestionCreationDate' without using inplace
df_question = df_question.sort_values(by='QuestionCreationDate')

In [10]:
# Determine the split index
split_index = int(len(df_question) * 0.9) #10% for test 

# Split the DataFrame into training and testing sets
train_data = df_question.iloc[:split_index]
test_data = df_question.iloc[split_index:]

Merging QuestionTags to UserAnswerId

In [11]:
df_merged = train_data[["QuestionId","QuestionTitle","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )

In [12]:
# Extract list of question of the same user
user_question_list = df_merged.groupby("AnswerOwnerId")["QuestionTitle"].apply(list).to_dict()

### Extrating User Profile Vector

In [13]:
#Load Model
# Check device
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# Join all question together to speed up embedding process
all_questions = []
user_question_indices = {}

index_counter = 0

for user, questions in user_question_list.items():
    user_question_indices[user] = (index_counter, index_counter + len(questions)) #Get start and end of user indices
    all_questions.extend(questions)
    index_counter += len(questions)

# Encode all questions in a single batch
all_embeddings = model.encode(all_questions, batch_size=64, show_progress_bar=True)

# Compute user profile vectors from embeddings
user_vectors = {}
for user, (start, end) in user_question_indices.items():
    user_vectors[user] = np.mean(all_embeddings[start:end], axis=0)


Using device: mps


Batches:   0%|          | 0/15566 [00:00<?, ?it/s]

In [14]:
user_ids = list(user_vectors.keys())
user_embeddings = np.vstack(list(user_vectors.values()))  # Shape: (num_users, 384)
user_embeddings = user_embeddings.astype('float32')

In [15]:
print(f"Number of users: {user_embeddings.shape[0]} - Vector size: {user_embeddings.shape[1]}")

Number of users: 261369 - Vector size: 384


In [16]:
# Normalize user embeddings for cosine similarity
user_embeddings = user_embeddings / np.linalg.norm(user_embeddings, axis=1, keepdims=True)

In [17]:
# List of user IDs corresponding to embeddings position
user_ids = [i for i in user_vectors.keys()]

### Computing Unanswered questions embeddings

In [18]:
df_merged_test = test_data[["QuestionId","QuestionTitle","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId" )

In [19]:
unanswered_embedding_index = df_merged_test["QuestionId"].to_list()
unanswered_questions = df_merged_test["QuestionTitle"].to_list()

Getting Embedding from Unanswered Questions

In [20]:
# Compute embeddings for unanswered questions
unanswered_embeddings = model.encode(unanswered_questions, batch_size=64, show_progress_bar=True) 

Batches:   0%|          | 0/1711 [00:00<?, ?it/s]

In [21]:
# Normalize unanswered embeddings
unanswered_embeddings = unanswered_embeddings / np.linalg.norm(unanswered_embeddings, axis=1, keepdims=True)

In [22]:
unanswered_embeddings = unanswered_embeddings.astype('float32')

### Providing Recommendation based on Embedding Similarity

In [23]:
# Calculate cosine similarity (Matrix multiplication)
# Each row is a question and column is a user
# similarities = np.dot(unanswered_embeddings, user_embeddings.T)  

Alternative method to avoid memory overload

In [24]:
import gc  # Garbage collector

def get_recommendation_top_batched(top_n, batch_size=1000):
    num_test = unanswered_embeddings.shape[0]

    for start in tqdm(range(0, num_test, batch_size), desc="Processing Batches"):
        end = min(start + batch_size, num_test)
        batch = unanswered_embeddings[start:end]

        # Compute similarities (rows = questions, columns = users)
        batch_similarities = np.dot(batch, user_embeddings.T) 

        # Get top-N indices 
        top_indices = np.argsort(-batch_similarities, axis=1)[:, :top_n] 
        
        del batch_similarities  
        gc.collect()  # Force Python to release memory

        # Yield results one batch at a time
        for i, question_id in enumerate(unanswered_embedding_index[start:end]):
            yield question_id, [user_ids[j] for j in top_indices[i]]

# Use the generator directly instead of storing everything
dic_recommendation = {qid: recommendations for qid, recommendations in get_recommendation_top_batched(top_n=100)}


Processing Batches: 100%|██████████| 110/110 [39:21<00:00, 21.47s/it]


## Evaluation

Grouping our **ground truth** that is the users that answered the question in real life

In [25]:
# Convert QuestionTags from lists to tuples
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# Group AnswerOwnerId by QuestionId and QuestionTags
grouped_df = (
    df_merged_test.groupby(["QuestionId", "QuestionTags", "QuestionTitle"])["AnswerOwnerId"]
    .apply(list)  # Combine AnswerOwnerId values into a list
    .reset_index()
    .rename(columns={"AnswerOwnerId": "GroundTruth"})
)

In [26]:
ground_truth_dic = grouped_df.to_dict(orient='records')

Compare Ground Truth with Recommendation

In [27]:
def top_n_accuracy(ground_truth_dic, top_users_list, top_n):
    tp = 0
    
    for question in ground_truth_dic:    
        # Checking if it is in the answer
        users_recommendation = top_users_list[question["QuestionId"]][:top_n]
        users_ground_truth = question["GroundTruth"]
        if set(users_recommendation) & set(users_ground_truth):
            tp += 1
            
    print(tp)
    return tp/len(ground_truth_dic)

In [28]:
def evaluate_recommendations(users_ground_truth_list, users_recommendation_list, top_n):

    total_recall = 0
    successful_recommendations = 0
    num_questions = len(users_ground_truth_list)

    for users_ground_truth, users_rec in zip(users_ground_truth_list, users_recommendation_list):
        # Get the top-N recommended users
        users_recommendation = users_rec[:top_n]
        # Convert to sets for intersection
        recommended_set = set(users_recommendation)
        ground_truth_set = set(users_ground_truth)
        # Calculate the number of relevant users in the top-N recommendations
        true_positives = len(recommended_set & ground_truth_set)
        # Recall@N: Proportion of relevant users that are recommended in top-N
        recall = true_positives / len(ground_truth_set) if ground_truth_set else 0
        # Accumulate recall
        total_recall += recall
        # Accuracy@N: Check if there's at least one relevant user in the top-N recommendations
        if true_positives > 0:
            successful_recommendations += 1

    # Calculate average recall over all questions
    avg_recall = total_recall / num_questions if num_questions > 0 else 0
    # Calculate accuracy over all questions
    accuracy = successful_recommendations / num_questions if num_questions > 0 else 0

    return avg_recall, accuracy

In [29]:
users_ground_truth = [ground_truth_item["GroundTruth"] for ground_truth_item in ground_truth_dic]
users_recommendation_list = [dic_recommendation[item["QuestionId"]] for item in ground_truth_dic]


In [30]:
top_users = [5, 10, 20, 50, 100]
results = []

for top in top_users:
    recall, acc = evaluate_recommendations(users_ground_truth, users_recommendation_list, top)
    result = {
        "acc": acc,
        "recall": recall,
        "top": top,
        "type": "nlp_sentence_transformer_user_profile"
    }
    
    print(result)
    results.append(result)
    

{'acc': 0.0528874873233958, 'recall': 0.043657847593740716, 'top': 5, 'type': 'nlp_sentence_transformer_user_profile'}
{'acc': 0.07728213605990303, 'recall': 0.06353413723807252, 'top': 10, 'type': 'nlp_sentence_transformer_user_profile'}
{'acc': 0.10903963435811233, 'recall': 0.08873387465402718, 'top': 20, 'type': 'nlp_sentence_transformer_user_profile'}
{'acc': 0.16245502410291318, 'recall': 0.13149090241888634, 'top': 50, 'type': 'nlp_sentence_transformer_user_profile'}
{'acc': 0.20903546670741702, 'recall': 0.16957004716970364, 'top': 100, 'type': 'nlp_sentence_transformer_user_profile'}


In [31]:
results

[{'acc': 0.0528874873233958,
  'recall': 0.043657847593740716,
  'top': 5,
  'type': 'nlp_sentence_transformer_user_profile'},
 {'acc': 0.07728213605990303,
  'recall': 0.06353413723807252,
  'top': 10,
  'type': 'nlp_sentence_transformer_user_profile'},
 {'acc': 0.10903963435811233,
  'recall': 0.08873387465402718,
  'top': 20,
  'type': 'nlp_sentence_transformer_user_profile'},
 {'acc': 0.16245502410291318,
  'recall': 0.13149090241888634,
  'top': 50,
  'type': 'nlp_sentence_transformer_user_profile'},
 {'acc': 0.20903546670741702,
  'recall': 0.16957004716970364,
  'top': 100,
  'type': 'nlp_sentence_transformer_user_profile'}]

In [32]:
pd.DataFrame(results).to_csv("setup2.csv", index=False)