# Rec Sys of Answerers for StackOverflow
## Version 1.0 - NLP Approach

Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import torch

Loading Data

In [2]:
df_question = pd.read_csv("dataset/questions_2019.csv")
df_answer = pd.read_csv("dataset/answers_2019.csv")

In [3]:
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [4]:
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transfoming list of tags

In [5]:
df_question['QuestionTags'] = df_question['QuestionTags'].apply(ast.literal_eval)
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


### Dividing the question dataset into Train and Test

In [6]:
train_data, test_data = train_test_split(df_question, test_size=0.2, random_state=42)

Merging QuestionTags to UserAnswerId

In [7]:
df_merged = train_data[["QuestionId","QuestionTitle","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )
df_merged

Unnamed: 0,QuestionId,QuestionTitle,QuestionTags,AnswerOwnerId
0,55832406,How to get height of an element at each page u...,"[reactjs, react-redux, react-lifecycle]",249871
1,55832406,How to get height of an element at each page u...,"[reactjs, react-redux, react-lifecycle]",166168
2,56213955,Python worker failed to connect back in Pyspar...,"[apache-spark, pyspark]",12787236
3,56213955,Python worker failed to connect back in Pyspar...,"[apache-spark, pyspark]",6014418
4,56213955,Python worker failed to connect back in Pyspar...,"[apache-spark, pyspark]",5029185
...,...,...,...,...
884878,56453223,Positioning of selected p values using ggboxplot,"[r, ggplot2, boxplot, p-value, ggpubr]",1222578
884879,54630747,Sequelize - How to get entries from one table ...,"[node.js, orm, sequelize.js]",5044890
884880,56278363,Parse list type data into different format to ...,"[javascript, json, reactjs, react-data-grid, d...",117030
884881,56108342,Not getting the right text after stemming in t...,"[r, tm, stemming, snowball]",5028841


In [20]:
# dic_user_questions = df_merged.head().groupby("AnswerOwnerId").apply(lambda x: x[['QuestionTitle', 'QuestionTags']].to_dict(orient='records')).to_dict()

# Extract list of question of the same user
user_question_list = df_merged.iloc[:1000].groupby("AnswerOwnerId")["QuestionTitle"].apply(list).to_dict()

### Extrating User Profile Vector

In [21]:
# Load transformer model 

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# Computing user vector 
user_vectors = {}
for user, questions in tqdm(user_question_list.items(), desc="Computing user profile vector"):
    question_embeddings = model.encode(questions)  # Process in batch the questions of the same user
    user_profile_vector = np.mean(question_embeddings, axis=0)  
    user_vectors[user] = user_profile_vector 
    


Computing user profile vectors: 100%|██████████| 958/958 [01:35<00:00, 10.05it/s]


In [None]:
user_ids = list(user_vectors.keys())
user_embeddings = np.vstack(list(user_vectors.values()))  # Shape: (num_users, 384)

In [27]:
print(f"Number of users: {user_embeddings.shape[0]} - Vector size: {user_embeddings.shape[1]}")

Number of users: 958 - Vector size: 384


### Computing Unanswered questions embeddings