# Rec Sys of Answerers for StackOverflow
## Version 1.0

Loading Libraries

In [34]:
import ast
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


Loading Data

In [35]:
# Read the question and answer tables from CSV files under dataset/.
# NOTE(review): the previews below show an "Unnamed: 0" column, presumably an index
# that was written when the CSVs were exported — confirm before relying on it.
df_question = pd.read_csv("dataset/questions_2019.csv")
df_answer = pd.read_csv("dataset/answers_2019.csv")

In [36]:
# Preview the first rows of the questions table.
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [37]:
# Preview the first rows of the answers table.
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transforming the list of tags

In [38]:
# QuestionTags is read from the CSV as the text form of a Python list
# (e.g. "['c#', 'winforms']"); parse it into a real list.
# Values that are not strings (i.e. already parsed) are passed through unchanged so
# the cell can be re-run: ast.literal_eval raises ValueError when handed a list.
df_question['QuestionTags'] = df_question['QuestionTags'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [39]:
# Display the questions table after parsing (QuestionTags now holds lists, not strings).
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


Transforming upvotes (df_answer)

In [40]:
# Offset added to non-negative vote counts so that an answer with 0 votes still
# receives a positive weight after the log transform (log1p(0 + 1) > 0).
epsilon = 1

# Adjust upvotes: shift counts >= 0 by epsilon, leave negative counts untouched.
# Vectorized replacement for a per-row lambda; same values, one pass in numpy.
df_answer['adjusted_upvotes'] = np.where(
    df_answer['AnswerVotes'] >= 0,
    df_answer['AnswerVotes'] + epsilon,
    df_answer['AnswerVotes'],
)

# Signed log transform (so negative values are handled too):
#   x > 0  ->  log1p(x)
#   x < 0  -> -log1p(|x|)
# sign(x) * log1p(|x|) expresses both branches at once; for x == 0 it yields 0.
df_answer['transformed_upvotes'] = (
    np.sign(df_answer['adjusted_upvotes']) * np.log1p(np.abs(df_answer['adjusted_upvotes']))
)

## Prototype 1 - Approach Collaborative Filtering


- **Goal:** Find answerers for a newly posted question.
  
- **Approach:** Collaborative Filtering (Item-Based) --> Use the question tags of previously answered questions to identify the most appropriate users.

- **Steps:**

1. Create Matrix of User-Tags
2. Calculate cosine similarity between tags
3. Calculate each user's score from the number of times they answered questions with a given tag and the similarity of that tag to the tags of the new question.

### Dividing the question dataset into Train and Test

In [41]:
# Hold out 5% of the questions as the test set; random_state is fixed so the split is repeatable.
train_data, test_data = train_test_split(df_question, test_size=0.05, random_state=42)

Merging QuestionTags with AnswerOwnerId

In [42]:
# Attach every answer (answerer id + transformed vote score) to its training question.
# Inner join on QuestionId: only questions that have a matching answer row are kept.
df_merged = pd.merge(
    train_data[["QuestionId", "QuestionTags"]],
    df_answer[["QuestionId", "AnswerOwnerId", "transformed_upvotes"]],
    on="QuestionId",
    how="inner",
)

### Indexing tags and users

In [43]:
# Turn the merged frame into a list of dicts, one per row (i.e. one per question/answer pair).
dic_merged = df_merged.to_dict(orient="records")

In [44]:
# Distinct tags and distinct answerers that occur in the merged training records.
unique_tags = list({label for record in dic_merged for label in record['QuestionTags']})
unique_users = list({record['AnswerOwnerId'] for record in dic_merged})

# Lookup tables: tag -> position in unique_tags, user -> position in unique_users.
tag_to_index = dict(zip(unique_tags, range(len(unique_tags))))
user_to_index = dict(zip(unique_users, range(len(unique_users))))

In [45]:
# Show only a few records; displaying the whole list floods the notebook output.
dic_merged[:5]

[{'QuestionId': 56131754,
  'QuestionTags': ['c'],
  'AnswerOwnerId': 45685,
  'transformed_upvotes': 1.0986122886681098},
 {'QuestionId': 56131754,
  'QuestionTags': ['c'],
  'AnswerOwnerId': 7310570,
  'transformed_upvotes': 1.0986122886681098},
 {'QuestionId': 55161024,
  'QuestionTags': ['java', 'spring', 'spring-boot', 'aspectj'],
  'AnswerOwnerId': 12029935,
  'transformed_upvotes': 0.6931471805599453},
 {'QuestionId': 55161024,
  'QuestionTags': ['java', 'spring', 'spring-boot', 'aspectj'],
  'AnswerOwnerId': 14349319,
  'transformed_upvotes': 0.6931471805599453},
 {'QuestionId': 54288605,
  'QuestionTags': ['php', 'mysql'],
  'AnswerOwnerId': 4903440,
  'transformed_upvotes': 0.6931471805599453},
 {'QuestionId': 54566354,
  'QuestionTags': ['java', 'knapsack-problem'],
  'AnswerOwnerId': 2952011,
  'transformed_upvotes': 1.0986122886681098},
 {'QuestionId': 54566354,
  'QuestionTags': ['java', 'knapsack-problem'],
  'AnswerOwnerId': 10232790,
  'transformed_upvotes': 0.69314718

### Creating User-Tag Matrix

In [51]:
# Step 1: Collect (row, col, value) triplets for two user x tag matrices
rows_freq, cols_freq, data_freq = [], [], []        # a 1 for every (answer, tag) pair
rows_upvote, cols_upvote, data_upvote = [], [], []  # the answer's transformed votes per tag

for record in dic_merged:
    user_row = user_to_index[record['AnswerOwnerId']]
    tag_cols = [tag_to_index[name] for name in record['QuestionTags']]
    n_pairs = len(tag_cols)

    # Frequency: count interactions
    rows_freq.extend([user_row] * n_pairs)
    cols_freq.extend(tag_cols)
    data_freq.extend([1] * n_pairs)

    # Upvote: sum upvotes (records whose score is not numeric are left out)
    score = record['transformed_upvotes']
    if isinstance(score, (int, float)):
        rows_upvote.extend([user_row] * n_pairs)
        cols_upvote.extend(tag_cols)
        data_upvote.extend([score] * n_pairs)

# Step 2: Construct the matrices (users as rows, tags as columns)
matrix_shape = (len(unique_users), len(unique_tags))
frequency_matrix = csr_matrix((data_freq, (rows_freq, cols_freq)), shape=matrix_shape)
upvote_matrix = csr_matrix((data_upvote, (rows_upvote, cols_upvote)), shape=matrix_shape)

# Step 3: Weighted matrix = element-wise product of frequency and upvote matrices
user_tag_matrix_sparse = frequency_matrix.multiply(upvote_matrix)

# Optional: convert to a dense matrix to inspect the result (only if it's small)
# print(user_tag_matrix_sparse.todense())


### Calculating Tags Similarity Matrix

In [52]:
# Compute Tag Similarity
# The matrix is transposed so that each row is a tag; the result holds the pairwise
# cosine similarity between tags, based on their per-user weights.
# NOTE(review): the result is presumably a dense (n_tags x n_tags) array — confirm
# the memory footprint is acceptable for the full tag vocabulary.
cos_sim_matrix = cosine_similarity(user_tag_matrix_sparse.T)

### Recommending User

In [53]:
## Example for a single question
# To try a held-out question instead of the hand-picked tags below:
# tags = test_data["QuestionTags"].iloc[0]
# tags

tags = ["r", "radar-chart", "spider-chart"]

In [54]:
def recommend_users_collaborative(tags, top_n=10):
    """Rank all known answerers for a question described by ``tags`` (item-based CF).

    Every tag found in ``tag_to_index`` contributes its column of ``cos_sim_matrix``.
    The summed similarity vector is multiplied by ``user_tag_matrix_sparse`` to get
    one score per user.

    Parameters
    ----------
    tags : iterable of str
        Tags of the new question; tags missing from ``tag_to_index`` are ignored.
    top_n : int, default 10
        Currently unused: the complete ranking is returned and callers slice it.

    Returns
    -------
    list of (user_id, score)
        Sorted by descending score; ``[]`` when none of the tags is known.
    """
    known_columns = [tag_to_index[name] for name in tags if name in tag_to_index]
    if not known_columns:
        return []  # No tag found to make the suggestion

    # Add up the similarity columns of the requested tags
    tag_affinity = sum(cos_sim_matrix[:, col] for col in known_columns)

    # One score per user (row of the user-tag matrix)
    scores = user_tag_matrix_sparse.dot(tag_affinity)

    # Pair each user with its score and order from best to worst
    return sorted(zip(unique_users, scores), key=lambda pair: pair[1], reverse=True)

In [55]:
# Ranking of answerers for the example tags using the collaborative approach.
return_1 = recommend_users_collaborative(tags)

## Prototype 2 - Approach Content Based-Filtering


- **Goal:** Find answerers for a newly posted question.
  
- **Approach:** Content-Based Filtering --> Use the question tags of previously answered questions to identify the most appropriate users.

- **Steps:**

1. Create Matrix of User-Tags
2. Create a vector for the new question's tags in the tag space of the User-Tag matrix
3. Calculate the cosine similarity between each user and the vector created in step 2
4. Sort the users by similarity to show the most recommended ones first

In [56]:
# This approach reuses the User-Tag matrix built for the previous approach

def recommend_users_content(tags, top_n=10):
    """Rank all known answerers by cosine similarity to the question's tag vector.

    The question is encoded as a binary vector over the tag columns of
    ``user_tag_matrix_sparse`` and compared with every user row.

    Parameters
    ----------
    tags : iterable of str
        Tags of the new question; tags missing from ``tag_to_index`` are ignored.
    top_n : int, default 10
        Currently unused: the complete ranking is returned and callers slice it.

    Returns
    -------
    list of (user_id, score)
        Sorted by descending score; ``[]`` when none of the tags is known.
    """
    known_columns = [tag_to_index[name] for name in tags if name in tag_to_index]
    if not known_columns:
        return []  # No tag found to make the suggestion

    # 1 x n_tags row vector with a 1 at every tag of the question
    query = np.zeros((1, user_tag_matrix_sparse.shape[1]))
    query[0, known_columns] = 1

    # Similarity of each user row to the question, flattened to one score per user
    scores = cosine_similarity(user_tag_matrix_sparse, query).flatten()

    # Pair each user with its score and order from best to worst
    return sorted(zip(unique_users, scores), key=lambda pair: pair[1], reverse=True)


In [57]:
# Ranking of answerers for the example tags using the content-based approach.
return_2 = recommend_users_content(tags)

## Evaluation

Top-N Accuracy

In [58]:
# Pair each held-out question with the users who actually answered it.
# Inner join on QuestionId: only test questions with a matching answer row are kept.
df_merged_test = pd.merge(
    test_data[["QuestionId", "QuestionTags"]],
    df_answer[["QuestionId", "AnswerOwnerId"]],
    on="QuestionId",
    how="inner",
)

Joining the Users to form a single ground truth

In [59]:
# Tag lists become tuples so they can serve as a group key
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# One row per (QuestionId, QuestionTags): every answerer of that question gathered in a list
answerers_per_question = (
    df_merged_test
    .groupby(["QuestionId", "QuestionTags"])["AnswerOwnerId"]
    .apply(list)
)
grouped_df = answerers_per_question.reset_index(name="GroundTruth")

In [60]:
# List of dicts with keys QuestionId, QuestionTags and GroundTruth (one dict per grouped row).
ground_truth_dic = grouped_df.to_dict(orient='records')

Calculate the results for all questions

In [61]:
import threading
from functools import lru_cache

# Memoised wrapper around a recommendation function
@lru_cache(maxsize=len(ground_truth_dic))
def cached_recommendation(tags, recommendation_function, top_n):
    """Return the first ``top_n`` entries of ``recommendation_function``'s ranking for ``tags``.

    All three arguments form the cache key, so ``tags`` must be hashable
    (callers pass a tuple).
    """
    ranking = recommendation_function(tuple(tags))
    return ranking[:top_n]

# Threaded version of the evaluation process with caching
def threaded_top_n_accuracy_with_cache(ground_truth_dic, recommendation_function, top_n=5, max_workers=None):
    """Top-N accuracy: share of questions with at least one true answerer in the top ``top_n``.

    Questions are evaluated on a bounded thread pool instead of one thread per
    question, so large evaluation sets do not spawn thousands of OS threads.

    Parameters
    ----------
    ground_truth_dic : sequence of dict
        Each item provides ``"QuestionTags"`` (iterable of tags) and
        ``"GroundTruth"`` (iterable of user ids who answered the question).
    recommendation_function : callable
        Passed through to ``cached_recommendation`` together with the tags.
    top_n : int, default 5
        Number of top-ranked users to compare against the ground truth.
    max_workers : int or None, default None
        Size of the thread pool; ``None`` uses ``ThreadPoolExecutor``'s default.

    Returns
    -------
    float
        Hits divided by number of questions; ``0.0`` for an empty input.

    Raises
    ------
    Exception
        Any error raised while evaluating a question is re-raised here rather
        than being silently counted as a miss.
    """
    if len(ground_truth_dic) == 0:
        return 0.0  # nothing to evaluate; avoids dividing by zero

    def is_hit(question):
        # Get recommendations using cached function
        top_users = cached_recommendation(tuple(question["QuestionTags"]), recommendation_function, top_n)
        recommended = {entry[0] for entry in top_users}

        # 1 when any recommended user is among the real answerers, else 0
        return int(not recommended.isdisjoint(question["GroundTruth"]))

    # pool.map keeps input order and propagates worker exceptions when consumed
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        hits = list(pool.map(is_hit, ground_truth_dic))

    # Calculate accuracy
    return sum(hits) / len(ground_truth_dic)


Collaborative

In [65]:
# Cut-offs at which accuracy is reported; `results` collects one row per run
top_n = [10, 20, 50, 100]
results = []

for top in top_n:
    # Evaluated on the first 100 ground-truth questions only
    accuracy = threaded_top_n_accuracy_with_cache(
        ground_truth_dic[:100], recommend_users_collaborative, top
    )
    result = {"acc": accuracy, "top": top, "type": "collaborative"}

    print(result)
    results.append(result)

{'acc': 0.1, 'top': 10, 'type': 'collaborative'}
{'acc': 0.14, 'top': 20, 'type': 'collaborative'}
{'acc': 0.19, 'top': 50, 'type': 'collaborative'}
{'acc': 0.3, 'top': 100, 'type': 'collaborative'}


Content

In [64]:
# Drop the cached collaborative rankings before evaluating the content-based approach
cached_recommendation.cache_clear()

for top in top_n:
    # Evaluated on the first 100 ground-truth questions only
    accuracy = threaded_top_n_accuracy_with_cache(
        ground_truth_dic[:100], recommend_users_content, top
    )
    result = {"acc": accuracy, "top": top, "type": "content"}

    print(result)
    results.append(result)

{'acc': 0.05, 'top': 10, 'type': 'content'}
{'acc': 0.06, 'top': 20, 'type': 'content'}
{'acc': 0.13, 'top': 50, 'type': 'content'}


In [None]:
# Write the collected accuracy rows (one per approach and cut-off) to a CSV file, without the index.
pd.DataFrame(results).to_csv("prototype_v1.csv", index=False)