# Rec Sys of Answerers for StackOverflow
## Version 1.0 - NLP Approach

Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import torch

Loading Data

In [2]:
# Load the 2019 question and answer tables from the local dataset folder.
question_csv = "dataset/questions_2019.csv"
answer_csv = "dataset/answers_2019.csv"
df_question, df_answer = (pd.read_csv(path) for path in (question_csv, answer_csv))

In [3]:
# Preview the first rows of the questions table to check its columns.
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [4]:
# Preview the first rows of the answers table (one row per answer, keyed by QuestionId).
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transforming the stringified tag lists into Python lists

In [5]:
def _parse_tags(raw_tags):
    """Turn a stringified tag list from the CSV into a real list; pass through parsed values."""
    # ast.literal_eval only accepts strings (or AST nodes). Without this guard,
    # re-running the cell on already-parsed lists raises ValueError.
    return ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags


df_question['QuestionTags'] = df_question['QuestionTags'].apply(_parse_tags)
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


In [6]:
# Sanity check: after parsing, the first row's tags should be a Python list, not a string.
type(df_question['QuestionTags'][0])

list

### Filtering Dataset to include only top 5 tags

In [7]:
# Target tags, kept in a set for constant-time membership checks.
tags_to_filter = {'java', 'python', 'javascript', 'c#', 'android'}


def _has_target_tag(tags):
    """Return True when at least one of the question's tags is a target tag."""
    return not tags_to_filter.isdisjoint(tags)


# Keep every question that carries one or more of the target tags.
filtered_df = df_question.loc[df_question['QuestionTags'].apply(_has_target_tag)]


In [8]:
# Display the questions that survived the tag filter.
filtered_df

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
6,54936118,11072710,Getting location of tweets in Twitter API and ...,"[python, twitter, tweepy]",0,2019-03-01 00:03:33,1
8,54936122,9695341,Android version no longer working after upgrad...,"[android, gradle, dart, flutter]",1,2019-03-01 00:04:10,1
9,54936125,8821525,Plotly gives an empty field as output in jupyt...,"[python, jupyter-notebook, plotly, jupyter, ju...",27,2019-03-01 00:04:50,4
...,...,...,...,...,...,...,...
878707,54272387,6865540,Google API + Service Account for impersonate user,"[python, python-3.x, google-api, google-worksp...",2,2019-01-19 23:54:02,0
878711,54272397,10764633,Grouping not working as expected returning mor...,[c#],0,2019-01-19 23:55:52,1
878712,54272404,5615842,How to remove style element from page via Java...,"[javascript, python, selenium]",0,2019-01-19 23:56:36,0
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1


### Dividing the question dataset into Train and Test

In [9]:
# Hold out 10% of the filtered questions for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    filtered_df,
    test_size=0.1,
    random_state=42,
)

Merging each training question's title and tags with the `AnswerOwnerId` of every answer it received

In [10]:
# One row per (training question, answer): attach the answerer to the question text and tags.
question_cols = ["QuestionId", "QuestionTitle", "QuestionTags"]
answer_cols = ["QuestionId", "AnswerOwnerId"]
df_merged = pd.merge(
    train_data[question_cols],
    df_answer[answer_cols],
    on="QuestionId",
    how="inner",
)

In [11]:
# dic_user_questions = df_merged.groupby("AnswerOwnerId").apply(lambda x: x[['QuestionTitle', 'QuestionTags']].to_dict(orient='records')).to_dict()

# Extract list of question of the same user
# user_question_list = df_merged.groupby("AnswerOwnerId")["QuestionTitle"].apply(list).to_dict()
# user_question_list = df_merged.iloc[:1000].groupby("AnswerOwnerId")["QuestionTitle"].apply(list).to_dict() #REMOVE: here for testing

In [12]:
# Parallel lists, one entry per merged row: title, space-joined tags, and answering user.
list_questions = df_merged["QuestionTitle"].to_list()
list_tags = df_merged["QuestionTags"].map(' '.join).to_list()
list_users = df_merged["AnswerOwnerId"].to_list()

### Extracting Question Vectors

In [13]:
# Pick the fastest available backend: CUDA GPU, then Apple-silicon MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Initialize model. The device goes to the constructor so that encode() uses it;
# relying on a later .to(device) is not honoured by every sentence-transformers release.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# Encode all training question titles (processed internally in mini-batches of 64)
all_embeddings = model.encode(list_questions, batch_size=64, show_progress_bar=True)

Using device: mps


Batches:   0%|          | 0/6392 [00:00<?, ?it/s]

In [14]:
# Embed the space-joined tag strings with the same sentence-transformer model.
all_embeddings_tags = model.encode(
    list_tags,
    batch_size=64,
    show_progress_bar=True,
)

Batches:   0%|          | 0/6392 [00:00<?, ?it/s]

In [15]:
# Place title and tag embeddings side by side so each row is one combined question vector.
concat_embedding = np.hstack([all_embeddings, all_embeddings_tags])

In [16]:
# Scale every combined training-question vector to unit L2 length so that a
# plain dot product between rows equals their cosine similarity.
train_row_norms = np.linalg.norm(concat_embedding, axis=1, keepdims=True)
concat_embedding = concat_embedding / train_row_norms

### Computing Unanswered questions embeddings

In [17]:
# One row per (held-out question, answer): the answerers serve as ground truth later on.
test_question_cols = ["QuestionId", "QuestionTitle", "QuestionTags"]
test_answer_cols = ["QuestionId", "AnswerOwnerId"]
df_merged_test = pd.merge(
    test_data[test_question_cols],
    df_answer[test_answer_cols],
    on="QuestionId",
    how="inner",
)

In [18]:
# QuestionId for each row of the merged test frame, in row order. Because the frame
# has one row per answer, an id repeats when a question received several answers.
unanswered_embedding_index = df_merged_test["QuestionId"].to_list()

In [30]:
# Titles and space-joined tag strings of the held-out questions, aligned row by row.
unanswered_questions = df_merged_test["QuestionTitle"].tolist()
unanswered_questions_tags = df_merged_test["QuestionTags"].map(' '.join).tolist()


Getting Embedding from Unanswered Questions

In [20]:
# Embed the held-out question titles with the same model used for the training set.
unanswered_embeddings = model.encode(
    unanswered_questions,
    batch_size=64,
    show_progress_bar=True,
)

Batches:   0%|          | 0/714 [00:00<?, ?it/s]

In [31]:
# Embed the held-out questions' tag strings.
unanswered_tags_embeddings = model.encode(
    unanswered_questions_tags,
    batch_size=64,
    show_progress_bar=True,
)

Batches:   0%|          | 0/714 [00:00<?, ?it/s]

In [32]:
# Combine title and tag embeddings column-wise, mirroring the training vectors' layout.
concat_unanswered_embedding = np.hstack([unanswered_embeddings, unanswered_tags_embeddings])

In [33]:
# Rescale each held-out question vector to unit L2 length (needed for dot-product cosine similarity).
test_row_norms = np.linalg.norm(concat_unanswered_embedding, axis=1, keepdims=True)
concat_unanswered_embedding = concat_unanswered_embedding / test_row_norms

In [34]:
# Store the held-out vectors as 32-bit floats.
concat_unanswered_embedding = concat_unanswered_embedding.astype(np.float32)

### Providing Recommendation based on Embedding Similarity

In [None]:
# Calculate cosine similarity (Matrix multiplication)
# Each row is a question and column is a user
# similarities = np.dot(unanswered_embeddings, user_embeddings.T)  

Creating dictionary QuestionId and Top Suggestions

In [None]:
# def get_recommendation_top(top_n):
#     top_indices = np.argsort(-similarities, axis=1)[:, :top_n]  # Negative for descending sort
#     # top_scores = np.sort(-similarities, axis=1)[:, :top_n]
#     # top_scores = - top_scores
    
#     # Map indices to the ones in the dataset
#     top_system_id = []
#     for indices_list in top_indices: 
#         mapping = []
#         for i in indices_list:
#             mapping.append(user_ids[i])
#         top_system_id.append(mapping)
#         dic_recommendation = {question_id: top_system for question_id, top_system in zip(unanswered_embedding_index,top_system_id)}
    
#     return dic_recommendation

Alternative method to avoid memory overload

In [35]:
import gc  # Garbage collector

def get_recommendation_top_batched(top_n, batch_size=1000):
    """Yield ``(question_id, users)`` for every row of the held-out embeddings.

    Rows of ``concat_unanswered_embedding`` are scored against all rows of
    ``concat_embedding`` in chunks of ``batch_size``, so only one chunk of the
    similarity matrix exists at a time. Both matrices are unit-normalised
    earlier in the notebook, so the dot product is the cosine similarity.

    Args:
        top_n: Number of most similar training rows to keep per held-out row.
        batch_size: Number of held-out rows scored per matrix product.

    Yields:
        ``(question_id, users)`` where ``question_id`` comes from
        ``unanswered_embedding_index`` and ``users`` holds the ``list_users``
        entries of the ``top_n`` most similar training rows, most similar first.
    """
    # NOTE(review): list_users has one entry per (question, answer) row, so the same
    # user can appear more than once in a recommendation list - confirm this is intended.
    num_test = concat_unanswered_embedding.shape[0]
    num_train = concat_embedding.shape[0]
    k = min(top_n, num_train)

    for start in tqdm(range(0, num_test, batch_size), desc="Processing Batches"):
        end = min(start + batch_size, num_test)
        batch = concat_unanswered_embedding[start:end]

        # Compute similarities (rows = held-out questions, columns = training rows)
        batch_similarities = np.dot(batch, concat_embedding.T)

        if 0 < k < num_train:
            # Select the k best columns in linear time, then sort only those k.
            # This avoids a full sort of every training column and a negated copy
            # of the whole matrix. The order among exactly tied scores may differ
            # from a full argsort.
            candidates = np.argpartition(batch_similarities, -k, axis=1)[:, -k:]
            candidate_scores = np.take_along_axis(batch_similarities, candidates, axis=1)
            order = np.argsort(-candidate_scores, axis=1)
            top_indices = np.take_along_axis(candidates, order, axis=1)
        else:
            # Degenerate sizes (k <= 0 or k covering every column): plain full sort.
            top_indices = np.argsort(-batch_similarities, axis=1)[:, :k]

        del batch_similarities
        gc.collect()  # Release the large similarity buffer before the next batch

        # Yield results one question at a time
        for i, question_id in enumerate(unanswered_embedding_index[start:end]):
            yield question_id, [list_users[j] for j in top_indices[i]]

# Drain the batched generator into a QuestionId -> recommended users mapping
# (if a QuestionId is yielded more than once, the last result wins).
dic_recommendation = dict(get_recommendation_top_batched(top_n=50))


Processing Batches: 100%|██████████| 46/46 [25:21<00:00, 33.07s/it]


## Evaluation

Grouping our **ground truth**, i.e. the users who actually answered each question

In [36]:
# Convert QuestionTags from lists to tuples
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# Group AnswerOwnerId by QuestionId and QuestionTags
grouped_df = (
    df_merged_test.groupby(["QuestionId", "QuestionTags", "QuestionTitle"])["AnswerOwnerId"]
    .apply(list)  # Combine AnswerOwnerId values into a list
    .reset_index()
    .rename(columns={"AnswerOwnerId": "GroundTruth"})
)

In [37]:
# Despite the name, orient='records' produces a list with one dict per question
# (keys: QuestionId, QuestionTags, QuestionTitle, GroundTruth).
ground_truth_dic = grouped_df.to_dict(orient='records')

Compare Ground Truth with Recommendation

In [38]:
def top_n_accuracy(ground_truth_dic, top_users_list, top_n):
    """Return the share of questions with at least one real answerer in the top-N list.

    Args:
        ground_truth_dic: List of records, each with a "QuestionId" and a
            "GroundTruth" collection of the users who answered that question.
        top_users_list: Mapping from QuestionId to the ranked recommended users.
        top_n: How many leading recommendations to consider per question.

    Returns:
        Hit rate between 0 and 1; 0.0 when there are no ground-truth records.
    """
    hits = 0

    for question in ground_truth_dic:
        try:
            users_recommendation = top_users_list[question["QuestionId"]][:top_n]
        except KeyError:
            # No recommendation was produced for this question: count it as a miss.
            users_recommendation = []

        if not set(users_recommendation).isdisjoint(question["GroundTruth"]):
            hits += 1

    print(hits)
    # An empty evaluation set would otherwise divide by zero.
    if not ground_truth_dic:
        return 0.0
    return hits / len(ground_truth_dic)

In [39]:
# Evaluate the hit rate at several recommendation-list cutoffs.
top_users = [10, 20, 50]
results = [
    {
        "acc": top_n_accuracy(ground_truth_dic, dic_recommendation, cutoff),
        "top": cutoff,
        "type": "nlp_sentence_transformer",
    }
    for cutoff in top_users
]
    

3935
5356
7517


In [40]:
# Show the accuracy record computed for each cutoff.
results

[{'acc': 0.13870774436885333, 'top': 10, 'type': 'nlp_sentence_transformer'},
 {'acc': 0.1887976312171737, 'top': 20, 'type': 'nlp_sentence_transformer'},
 {'acc': 0.2649723289506151, 'top': 50, 'type': 'nlp_sentence_transformer'}]

In [None]:
# Persist the evaluation summary as a CSV file, without the row index.
results_df = pd.DataFrame(results)
results_df.to_csv("prototype_embedding.csv", index=False)