# Rec Sys of Answerers for StackOverflow
## Version 2.1 - NLP Approach

Importing Libraries

In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import torch

Loading Data

In [43]:
# Read the 2019 question and answer tables; both paths are relative to the working directory.
df_question, df_answer = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/questions_2019.csv", "dataset/answers_2019.csv")
)

In [44]:
# Preview the first rows of the questions table.
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [45]:
# Preview the first rows of the answers table (one row per answer, keyed by QuestionId).
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transforming the tag column from stringified lists into Python lists

In [46]:
# QuestionTags is read from the CSV as the string repr of a Python list
# (e.g. "['c#', 'winforms']"); parse it back into a real list.
# Only string cells are parsed, so re-running this cell after the column has
# already been converted is a no-op instead of raising inside ast.literal_eval.
df_question['QuestionTags'] = df_question['QuestionTags'].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)
# df_question = df_question.head(1000) # For testing

In [47]:
# Sanity check: the first row's tags should now be a Python list, not a string.
type(df_question['QuestionTags'][0])

list

### Filtering Dataset to include only top 5 tags (optional — currently disabled)

In [48]:
# tags_to_filter = {'java', 'python', 'javascript', 'c#', 'android'}  # Faster lookup structure

# # Filter rows where any tag matches one of the target tags
# df_question = df_question[df_question['QuestionTags'].apply(lambda tags: any(tag in tags_to_filter for tag in tags))]


### Dividing the question dataset into Train and Test

In [49]:
# Ordering dataframe before split
# Parse 'QuestionCreationDate' into datetimes. Plain column assignment is used
# on purpose: `df.loc[:, col] = values` writes into the existing object-dtype
# column in place on pandas >= 2.0, which leaves the dtype as `object`
# instead of datetime64.
df_question['QuestionCreationDate'] = pd.to_datetime(df_question['QuestionCreationDate'])

# Sort chronologically (returns a new frame; nothing is modified in place)
df_question = df_question.sort_values(by='QuestionCreationDate')

In [50]:
# Positional 90/10 split of the date-sorted questions:
# the first 90% of rows become the training set, the remaining 10% the test set.
split_index = int(len(df_question) * 0.9)

train_data, test_data = df_question.iloc[:split_index], df_question.iloc[split_index:]

In [51]:
# Inspect the test split (rows from split_index onwards of the date-sorted questions).
test_data

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
568215,56546965,11631967,Percentage for each subgroup,[powerbi],2,2019-06-11 15:22:52,1
568216,56546966,10744078,Managing floating point precision - Best pract...,"[python, pandas]",0,2019-06-11 15:23:01,0
568217,56546968,4855106,Is the json import failing because of the JSON...,"[json, python-3.x, visual-studio-code]",0,2019-06-11 15:23:12,0
568218,56546969,9877445,ggplot add text to the center of a donut chart...,"[r, ggplot2, text, data-visualization, donut-c...",2,2019-06-11 15:23:14,1
568219,56546970,11379139,"In Angular application, I keep getting this er...","[node.js, angular]",2,2019-06-11 15:23:16,1
...,...,...,...,...,...,...,...
156669,56828823,8684836,Embedding localized satellite dll into exe app...,"[c++, winforms, c++-cli, clr]",0,2019-06-30 23:55:26,2
156670,56828825,11721401,"PS/2 keyboard won't send keypress interrupts, ...","[c, assembly, x86, interrupt, osdev]",9,2019-06-30 23:55:45,1
156671,56828831,11721524,How to convert a string to an integer in Power...,"[string, powershell, integer, int]",0,2019-06-30 23:58:22,3
156672,56828833,1889720,How do I return the results from a query in Fi...,"[node.js, typescript, firebase, google-cloud-f...",1,2019-06-30 23:58:37,2


In [52]:
# Inspect the training split (rows before split_index of the date-sorted questions).
train_data

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
767371,53992215,1758023,Using a ScheduledExecutorService to run a task...,"[java, executorservice, java-threads]",0,2019-01-01 00:00:04,1
767372,53992219,4298538,How to programmatically change style sheet of ...,"[c++, qt, qt5, qtstylesheets, qpushbutton]",1,2019-01-01 00:01:55,2
767373,53992220,6266364,Trying to put website into Maintenance Mode (3...,[],2,2019-01-01 00:02:08,2
767374,53992221,9706003,Node.js Lambda Async return Undefined,"[node.js, lambda]",2,2019-01-01 00:02:27,3
767375,53992223,7035033,Unable to print a class list attribute using i...,"[python, python-3.x, list, class, printing]",0,2019-01-01 00:02:37,2
...,...,...,...,...,...,...,...
568210,56546947,114029,Resource 'GUID value here' does not exist or o...,"[authentication, microsoft-graph-api, access-t...",1,2019-06-11 15:21:47,1
568211,56546948,11055403,How to add rows to a table based on widget inp...,"[r, shiny]",0,2019-06-11 15:21:48,1
568212,56546956,10152003,Is there any code for recording python seleniu...,"[python, selenium, testing, selenium-webdriver...",1,2019-06-11 15:22:18,0
568213,56546958,9687794,Laravel 5.8.* using GuzzleHttp on production,"[php, laravel, laravel-5, guzzle, laravel-5.8]",0,2019-06-11 15:22:32,2


Merging QuestionTags to UserAnswerId

In [53]:
# One row per (training question, answer) pair: attach each answerer to the question answered.
# The inner join keeps only training questions that have at least one row in df_answer.
df_merged = pd.merge(
    train_data[["QuestionId", "QuestionTitle", "QuestionTags"]],
    df_answer[["QuestionId", "AnswerOwnerId"]],
    on="QuestionId",
    how="inner",
)

Getting values to be embedded and indexes

In [54]:
# Parallel lists taken row by row from df_merged: position i holds a question
# title and the id of a user who answered that question.
list_questions, list_users = (
    df_merged[column].to_list() for column in ("QuestionTitle", "AnswerOwnerId")
)

### Extracting User Profile Vectors

In [55]:
# Pick the best available accelerator: CUDA GPU first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Initialize model. The device is passed to the constructor so that the model
# is created on it and `encode` runs there (rather than moving it afterwards).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# Encode every title in list_questions (one entry per question/answer pair),
# processed by the model in batches of 64
all_embeddings = model.encode(list_questions, batch_size=64, show_progress_bar=True)

Using device: mps


Batches:   0%|          | 0/15566 [00:00<?, ?it/s]

In [56]:
# Scale every training-question embedding to unit L2 norm, so that a plain
# dot product between two rows equals their cosine similarity.
all_embeddings = np.divide(
    all_embeddings,
    np.linalg.norm(all_embeddings, axis=1, keepdims=True),
)

### Computing embeddings for the test questions (treated as unanswered)

In [57]:
# One row per (test question, answer) pair; the inner join keeps only test
# questions that have at least one row in df_answer.
df_merged_test = pd.merge(
    test_data[["QuestionId", "QuestionTitle", "QuestionTags"]],
    df_answer[["QuestionId", "AnswerOwnerId"]],
    on="QuestionId",
    how="inner",
)

In [58]:
# Question id for each row of df_merged_test (a question with several answers appears once per answer).
unanswered_embedding_index = df_merged_test["QuestionId"].tolist()

In [59]:
# Question title for each row of df_merged_test, in the same row order as the ids.
unanswered_questions = df_merged_test["QuestionTitle"].tolist()


Getting Embedding from Unanswered Questions

In [60]:
# Embed the test-question titles with the same model, again in batches of 64
unanswered_embeddings = model.encode(
    unanswered_questions,
    show_progress_bar=True,
    batch_size=64,
)

Batches:   0%|          | 0/1711 [00:00<?, ?it/s]

In [61]:
# Scale every test-question embedding to unit L2 norm (row-wise)
unanswered_embeddings = np.divide(
    unanswered_embeddings,
    np.linalg.norm(unanswered_embeddings, axis=1, keepdims=True),
)

In [62]:
# Make sure the test embeddings are stored as 32-bit floats.
unanswered_embeddings = unanswered_embeddings.astype('float32')

### Providing Recommendation based on Embedding Similarity

In [63]:
# Calculate cosine similarity (Matrix multiplication)
# Each row is a question and column is a user
# similarities = np.dot(unanswered_embeddings, user_embeddings.T)  

Alternative method to avoid memory overload

In [64]:
import gc  # Garbage collector

def _top_n_indices(similarities, top_n):
    """Return, for every row, the column indices of its ``top_n`` largest values.

    Indices are ordered from the highest to the lowest similarity. Only the
    ``top_n`` survivors are fully sorted: ``np.argpartition`` first selects them
    without ordering the rest of the row.

    Args:
        similarities: 2-D array (rows = queries, columns = candidates).
        top_n: Number of columns to keep per row; it is capped at the number of
            columns, and a non-positive value yields zero columns.

    Returns:
        Integer array of shape ``(rows, min(top_n, columns))``.
    """
    num_rows, num_cols = similarities.shape
    k = max(0, min(top_n, num_cols))
    if k == 0:
        return np.empty((num_rows, 0), dtype=np.intp)

    if k < num_cols:
        # After partitioning at position num_cols - k, the last k positions of
        # each row hold the indices of its k largest values (in no set order).
        cut = num_cols - k
        candidates = np.argpartition(similarities, cut, axis=1)[:, cut:]
    else:
        # Every column is requested; nothing to select.
        candidates = np.broadcast_to(np.arange(num_cols), (num_rows, num_cols))

    # Order just the k survivors by descending similarity.
    candidate_scores = np.take_along_axis(similarities, candidates, axis=1)
    order = np.argsort(-candidate_scores, axis=1)
    return np.take_along_axis(candidates, order, axis=1)


def get_recommendation_top_batched(top_n, batch_size=1000):
    """Yield ``(question_id, users)`` for every row of ``unanswered_embeddings``.

    Reads the notebook globals ``unanswered_embeddings``, ``all_embeddings``,
    ``unanswered_embedding_index`` and ``list_users``. The test embeddings are
    processed ``batch_size`` rows at a time so that only one
    ``batch_size x len(all_embeddings)`` similarity matrix exists at once.

    Args:
        top_n: How many of the most similar ``all_embeddings`` rows to keep per
            test question (capped at the number of rows available).
        batch_size: Number of test questions scored per matrix product.

    Yields:
        Tuples ``(question_id, users)`` where ``users`` is the list of
        ``list_users`` entries for the selected rows, most similar first.
        The same user can appear more than once, and rows with exactly equal
        similarity are returned in no guaranteed relative order.
    """
    num_test = unanswered_embeddings.shape[0]

    for start in tqdm(range(0, num_test, batch_size), desc="Processing Batches"):
        end = min(start + batch_size, num_test)
        batch = unanswered_embeddings[start:end]

        # Dot product of the two embedding matrices
        # (rows = test questions, columns = rows of all_embeddings)
        batch_similarities = np.dot(batch, all_embeddings.T)

        # Partial selection instead of a full per-row sort of every column
        top_indices = _top_n_indices(batch_similarities, top_n)

        # Drop the large similarity matrix before the next batch is allocated
        del batch_similarities
        gc.collect()

        # Yield results one question at a time
        for i, question_id in enumerate(unanswered_embedding_index[start:end]):
            yield question_id, [list_users[j] for j in top_indices[i]]

# Consume the generator straight into {question_id: ranked user list};
# a question id yielded more than once keeps its last list.
dic_recommendation = dict(get_recommendation_top_batched(top_n=1000))


Processing Batches: 100%|██████████| 110/110 [2:54:51<00:00, 95.38s/it]  


In [65]:
def remove_repeats_numpy(arr):
    """Drop repeated values from ``arr`` while keeping first-occurrence order.

    Args:
        arr: Array-like of values (it is flattened before processing).

    Returns:
        1-D ``int`` NumPy array holding each distinct value once, in the order
        in which it first appears in ``arr``.
    """
    flat = np.asarray(arr).ravel()
    # Position of the first occurrence of every distinct value
    _, first_positions = np.unique(flat, return_index=True)
    # Walking those positions in ascending order restores the original order
    return np.array(flat[np.sort(first_positions)], dtype=int)

In [66]:
# Keep each recommended user once per question, at the position of their first (best) appearance
dic_recommendation = dict(
    (qid, remove_repeats_numpy(ranked_users))
    for qid, ranked_users in dic_recommendation.items()
)

## Evaluation

Grouping our **ground truth**, i.e. the users who actually answered each question

In [67]:
# Tag lists are unhashable; turn them into tuples so they can serve as a group key
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# For every (QuestionId, QuestionTags, QuestionTitle) combination, gather all
# AnswerOwnerId values into one list stored in a "GroundTruth" column
grouped_df = (
    df_merged_test
    .groupby(["QuestionId", "QuestionTags", "QuestionTitle"])["AnswerOwnerId"]
    .apply(list)
    .reset_index(name="GroundTruth")
)

In [68]:
# One dict per grouped row, with keys QuestionId, QuestionTags, QuestionTitle and GroundTruth
ground_truth_dic = grouped_df.to_dict("records")

Compare Ground Truth with Recommendation

In [69]:
def top_n_accuracy(ground_truth_dic, top_users_list, top_n):
    """Fraction of questions with at least one real answerer in their top-N list.

    Args:
        ground_truth_dic: Sequence of records, each with a ``"QuestionId"`` key
            and a ``"GroundTruth"`` key holding the users who answered it.
        top_users_list: Mapping from question id to that question's ranked
            recommendations (indexed by ``QuestionId``, then sliced to ``top_n``).
        top_n: Number of leading recommendations considered per question.

    Returns:
        Hits divided by the number of questions, or ``0`` when
        ``ground_truth_dic`` is empty (matching ``evaluate_recommendations``).

    Raises:
        KeyError: If a question id is missing from ``top_users_list``.
    """
    num_questions = len(ground_truth_dic)
    if num_questions == 0:
        # Nothing to evaluate; avoid dividing by zero
        return 0

    hits = 0
    for question in ground_truth_dic:
        recommended = top_users_list[question["QuestionId"]][:top_n]
        # A hit is any overlap between the top-N list and the real answerers
        if set(recommended) & set(question["GroundTruth"]):
            hits += 1

    return hits / num_questions

In [70]:
def evaluate_recommendations(users_ground_truth_list, users_recommendation_list, top_n):
    """Compute mean Recall@N and Accuracy@N over paired questions.

    The two inputs are walked in parallel: element ``i`` of each refers to the
    same question.

    Args:
        users_ground_truth_list: Per question, the users who really answered it.
        users_recommendation_list: Per question, the ranked recommended users.
        top_n: Number of leading recommendations considered per question.

    Returns:
        Tuple ``(avg_recall, accuracy)``. Recall for one question is the share
        of its real answerers found in the top-N list (0 when it has none);
        accuracy is the share of questions with at least one such match. Both
        are divided by ``len(users_ground_truth_list)`` and are 0 when that
        list is empty.
    """
    num_questions = len(users_ground_truth_list)
    recall_sum = 0
    hit_count = 0

    for actual_users, ranked_users in zip(users_ground_truth_list, users_recommendation_list):
        relevant = set(actual_users)
        # How many real answerers show up among the first top_n recommendations
        matched = len(relevant.intersection(ranked_users[:top_n]))

        if matched > 0:
            hit_count += 1
        recall_sum += matched / len(relevant) if relevant else 0

    if num_questions > 0:
        return recall_sum / num_questions, hit_count / num_questions
    return 0, 0

In [71]:
# Two lists in the order of ground_truth_dic: the real answerers of each
# question, and the recommendations looked up by that question's id.
users_ground_truth = [record["GroundTruth"] for record in ground_truth_dic]
users_recommendation_list = [
    dic_recommendation[record["QuestionId"]] for record in ground_truth_dic
]

In [72]:
# Cut-offs N at which recall@N and accuracy@N are reported
top_users = [5, 10, 20, 50, 100]
results = []

for top in top_users:
    recall, acc = evaluate_recommendations(users_ground_truth, users_recommendation_list, top)
    # One record per cut-off, tagged with the name of this experiment
    result = dict(acc=acc, recall=recall, top=top, type="nlp_sentence_transformer_question")

    print(result)
    results.append(result)
    

{'acc': 0.07857410777544697, 'recall': 0.062397348573387204, 'top': 5, 'type': 'nlp_sentence_transformer_question'}
{'acc': 0.114040815192476, 'recall': 0.09042604538429572, 'top': 10, 'type': 'nlp_sentence_transformer_question'}
{'acc': 0.1554672631037884, 'recall': 0.12352941057529682, 'top': 20, 'type': 'nlp_sentence_transformer_question'}
{'acc': 0.2182876512509898, 'recall': 0.17434166728841202, 'top': 50, 'type': 'nlp_sentence_transformer_question'}
{'acc': 0.2705083144631371, 'recall': 0.21748667604127372, 'top': 100, 'type': 'nlp_sentence_transformer_question'}


In [73]:
# Display the collected metric records (one dict per cut-off).
results

[{'acc': 0.07857410777544697,
  'recall': 0.062397348573387204,
  'top': 5,
  'type': 'nlp_sentence_transformer_question'},
 {'acc': 0.114040815192476,
  'recall': 0.09042604538429572,
  'top': 10,
  'type': 'nlp_sentence_transformer_question'},
 {'acc': 0.1554672631037884,
  'recall': 0.12352941057529682,
  'top': 20,
  'type': 'nlp_sentence_transformer_question'},
 {'acc': 0.2182876512509898,
  'recall': 0.17434166728841202,
  'top': 50,
  'type': 'nlp_sentence_transformer_question'},
 {'acc': 0.2705083144631371,
  'recall': 0.21748667604127372,
  'top': 100,
  'type': 'nlp_sentence_transformer_question'}]

In [74]:
# Write the metric records to setup3.csv, one row per cut-off, without the index column
pd.DataFrame(results).to_csv(path_or_buf="setup3.csv", index=False)