# Rec Sys of Answerers for StackOverflow
## Version 1.0

Loading Libraries

In [82]:
import ast
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


Loading Data

In [83]:
# Read the question and answer CSV exports into DataFrames (questions first, then answers).
df_question, df_answer = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/questions_2019.csv", "dataset/answers_2019.csv")
)

In [84]:
# Preview the first rows of the questions frame exactly as loaded from the CSV.
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [85]:
# Parse the creation timestamps and order the questions chronologically (oldest first);
# the positional train/test split further down slices this ordering.
df_question = df_question.assign(
    QuestionCreationDate=pd.to_datetime(df_question['QuestionCreationDate'])
).sort_values(by='QuestionCreationDate')

In [86]:
# Preview the first rows of the answers frame (one row per answer, keyed by QuestionId).
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transforming list of tags

In [87]:
# QuestionTags is read from the CSV as the string form of a list (e.g. "['c#', 'winforms']").
# Only string cells are parsed, so re-running this cell on already-parsed lists is a
# no-op instead of raising inside ast.literal_eval.
df_question['QuestionTags'] = df_question['QuestionTags'].apply(
    lambda raw_tags: ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags
)

In [88]:
# Display the questions frame after sorting and tag parsing (pandas truncates the rendering).
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
767371,53992215,1758023,Using a ScheduledExecutorService to run a task...,"[java, executorservice, java-threads]",0,2019-01-01 00:00:04,1
767372,53992219,4298538,How to programmatically change style sheet of ...,"[c++, qt, qt5, qtstylesheets, qpushbutton]",1,2019-01-01 00:01:55,2
767373,53992220,6266364,Trying to put website into Maintenance Mode (3...,[],2,2019-01-01 00:02:08,2
767374,53992221,9706003,Node.js Lambda Async return Undefined,"[node.js, lambda]",2,2019-01-01 00:02:27,3
767375,53992223,7035033,Unable to print a class list attribute using i...,"[python, python-3.x, list, class, printing]",0,2019-01-01 00:02:37,2
...,...,...,...,...,...,...,...
156669,56828823,8684836,Embedding localized satellite dll into exe app...,"[c++, winforms, c++-cli, clr]",0,2019-06-30 23:55:26,2
156670,56828825,11721401,"PS/2 keyboard won't send keypress interrupts, ...","[c, assembly, x86, interrupt, osdev]",9,2019-06-30 23:55:45,1
156671,56828831,11721524,How to convert a string to an integer in Power...,"[string, powershell, integer, int]",0,2019-06-30 23:58:22,3
156672,56828833,1889720,How do I return the results from a query in Fi...,"[node.js, typescript, firebase, google-cloud-f...",1,2019-06-30 23:58:37,2


### Filtering Dataset to include only top 5 tags

In [89]:
# The five tags of interest, held in a set for constant-time membership checks
tags_to_filter = {'java', 'python', 'javascript', 'c#', 'android'}

# Keep the questions that carry at least one of the target tags.
# NOTE(review): no later cell visible in this notebook reads `filtered_df`; the
# train/test split slices `df_question` directly — confirm whether that is intended.
has_target_tag = df_question['QuestionTags'].apply(
    lambda question_tags: not tags_to_filter.isdisjoint(question_tags)
)
filtered_df = df_question[has_target_tag]


## Prototype 1 - Approach Collaborative Filtering


- **Goal:** Find answerers to new posted question.
  
- **Approach:** Collaborative Filtering (Item Based) --> Use the tags of previously answered questions to identify the most appropriate users.

- **Steps:**

1. Create Matrix of User-Tags
2. Calculate cosine similarity between tags
3. Calculate each user's score from the number of times they answered questions with a given tag and the similarity of that tag to the tags of the new question.

### Dividing the question dataset into Train and Test

In [90]:
# Positional hold-out instead of a random train_test_split: df_question is sorted by
# creation date, so the earliest share of questions is used for training and the
# remainder for testing.
TRAIN_FRACTION = 0.2
split_index = int(len(df_question) * TRAIN_FRACTION)

train_data, test_data = df_question.iloc[:split_index], df_question.iloc[split_index:]

Merging QuestionTags with AnswerOwnerId

In [91]:
# Pair every answer with the tags of the training question it belongs to.
# Inner join: training questions without any answer row produce no output rows.
df_merged = pd.merge(
    train_data[["QuestionId", "QuestionTags"]],
    df_answer[["QuestionId", "AnswerOwnerId"]],
    on="QuestionId",
    how="inner",
)

In [92]:
# NOTE(review): bare integer literal with no computation behind it; it looks like a
# pasted row count — confirm which frame it refers to, or replace it with the
# expression that produced it.
1943329

1943329

### Indexing tags and users

In [93]:
# One dict per (question, answerer) row, with keys QuestionId, QuestionTags, AnswerOwnerId
dic_merged = df_merged.to_dict("records")

In [94]:
# Collect the distinct tags and answerers seen in the training merge
tag_pool, user_pool = set(), set()
for record in dic_merged:
    tag_pool.update(record['QuestionTags'])
    user_pool.add(record['AnswerOwnerId'])
unique_tags = list(tag_pool)
unique_users = list(user_pool)

# Position of each tag / user in the lists above; used as matrix column / row numbers
tag_to_index = dict(zip(unique_tags, range(len(unique_tags))))
user_to_index = dict(zip(unique_users, range(len(unique_users))))

### Creating User-Tag Matrix

In [95]:
# Step 2: Create a Sparse User-Tag Matrix
# One (user row, tag column) coordinate for every tag of every answered question
coordinates = [
    (user_to_index[answer['AnswerOwnerId']], tag_to_index[tag])
    for answer in dic_merged
    for tag in answer['QuestionTags']
]
rows = [user_idx for user_idx, _ in coordinates]
cols = [tag_idx for _, tag_idx in coordinates]
data = [1] * len(coordinates)

# Repeated coordinates are summed when the CSR matrix is built, so each cell holds
# the number of (answer, tag) pairs recorded for that user and tag
user_tag_matrix_sparse = csr_matrix((data, (rows, cols)), shape=(len(unique_users), len(unique_tags)))


### Calculating Tags Similarity Matrix

In [96]:
# Tag-to-tag cosine similarity: transpose so that each row describes one tag by the
# users who answered questions carrying it
tag_user_matrix = user_tag_matrix_sparse.T
cos_sim_matrix = cosine_similarity(tag_user_matrix)

### Recommending User

In [97]:
# Display the held-out questions (pandas truncates the rendering).
test_data

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
724783,54567464,3292629,Session not getting created after upgrading to...,"[hibernate, hibernate-5.x]",0,2019-02-07 06:25:08,0
724784,54567469,5933293,How to stop printing log4j FileNotFound Error ...,"[java, log4j, executable-jar]",0,2019-02-07 06:25:38,1
724785,54567476,10939832,How to fix Checkbox bug?,"[javascript, css, reactjs]",1,2019-02-07 06:26:09,4
724786,54567484,6438795,Timeout expired. The timeout period elapsed pr...,"[sql-server, vb.net]",0,2019-02-07 06:26:54,1
724787,54567485,11026609,Is there a way to increment after a for-loop u...,"[python, python-3.x]",-1,2019-02-07 06:27:12,2
...,...,...,...,...,...,...,...
156669,56828823,8684836,Embedding localized satellite dll into exe app...,"[c++, winforms, c++-cli, clr]",0,2019-06-30 23:55:26,2
156670,56828825,11721401,"PS/2 keyboard won't send keypress interrupts, ...","[c, assembly, x86, interrupt, osdev]",9,2019-06-30 23:55:45,1
156671,56828831,11721524,How to convert a string to an integer in Power...,"[string, powershell, integer, int]",0,2019-06-30 23:58:22,3
156672,56828833,1889720,How do I return the results from a query in Fi...,"[node.js, typescript, firebase, google-cloud-f...",1,2019-06-30 23:58:37,2


In [98]:
## Single-question example
# To try a real held-out question instead: tags = test_data["QuestionTags"].iloc[0]

tags = [
    "r",
    "radar-chart",
    "spider-chart",
]

In [99]:
def recommend_users_collaborative(tags, top_n=10):
    """Rank every known answerer for a question described by `tags`.

    A user's score is the dot product of their row in `user_tag_matrix_sparse`
    with the sum of the `cos_sim_matrix` columns of the requested tags.

    Parameters
    ----------
    tags : iterable of str
        Tags of the new question. Tags missing from `tag_to_index` are skipped.
    top_n : int, default 10
        Accepted but not applied: the complete ranking is returned and callers
        slice it themselves.

    Returns
    -------
    list of (user_id, score)
        All users ordered by descending score (ties keep their `unique_users`
        order), or an empty list when none of the tags is known.
    """
    known_columns = [tag_to_index[name] for name in tags if name in tag_to_index]
    if not known_columns:
        return []  # No tag found to make the suggestion

    # Add up one similarity column per requested tag
    tag_profile = 0
    for column in known_columns:
        tag_profile = tag_profile + cos_sim_matrix[:, column]

    # One score per user row
    scores = user_tag_matrix_sparse.dot(tag_profile)

    # Pair each score with its user id, then order best-first
    ranking = [(unique_users[position], value) for position, value in enumerate(scores)]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

In [100]:
# Full collaborative ranking for the example tags (top_n is not applied inside the function).
return_1 = recommend_users_collaborative(tags)

## Prototype 2 - Approach Content Based-Filtering


- **Goal:** Find answerers to new posted question.
  
- **Approach:** Content-Based Filtering --> Use the tags of previously answered questions to identify the most appropriate users.

- **Steps:**

1. Create Matrix of User-Tags
2. Create a vector for the new question's tags in the tag space of the User-Tag matrix
3. Calculate cosine similarity for each user with vector created in step 2
4. Order list to show the most recommended users

In [101]:
# This approach reuses the User-Tag matrix built for the previous prototype

def recommend_users_content(tags, top_n=10):
    """Rank every known answerer by tag-profile similarity to a question.

    The question is encoded as a 0/1 vector over the tag columns of
    `user_tag_matrix_sparse`; each user's score is the cosine similarity
    between their row of that matrix and the question vector.

    Parameters
    ----------
    tags : iterable of str
        Tags of the new question. Tags missing from `tag_to_index` are skipped.
    top_n : int, default 10
        Accepted but not applied: the complete ranking is returned and callers
        slice it themselves.

    Returns
    -------
    list of (user_id, score)
        All users ordered by descending similarity (ties keep their
        `unique_users` order), or an empty list when none of the tags is known.
    """
    known_columns = [tag_to_index[name] for name in tags if name in tag_to_index]
    if not known_columns:
        return []  # No tag found to make the suggestion

    # Single-row indicator vector: 1 at the columns of the question's tags
    tag_count = user_tag_matrix_sparse.shape[1]
    question_vector = np.zeros((1, tag_count))
    question_vector[0, known_columns] = 1

    # Column of per-user similarities, flattened to one score per user row
    similarities = cosine_similarity(user_tag_matrix_sparse, question_vector).flatten()

    # Pair each score with its user id, then order best-first
    ranking = [(unique_users[position], value) for position, value in enumerate(similarities)]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking


In [102]:
# Full content-based ranking for the example tags (top_n is not applied inside the function).
return_2 = recommend_users_content(tags)

## Evaluation

Top-N Accuracy

In [103]:
# Pair every answer with the tags of the held-out question it belongs to.
# Inner join: test questions without any answer row produce no output rows.
df_merged_test = pd.merge(
    test_data[["QuestionId", "QuestionTags"]],
    df_answer[["QuestionId", "AnswerOwnerId"]],
    on="QuestionId",
    how="inner",
)

Joining the Users to form a single ground truth

In [104]:
# Lists are unhashable and cannot serve as a groupby key, so store the tags as tuples
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# Gather, per (question, tags) pair, the ids of everyone who answered it
answerers_per_question = df_merged_test.groupby(["QuestionId", "QuestionTags"])["AnswerOwnerId"].apply(list)

# Flatten the group keys back into columns; the answerer list becomes "GroundTruth"
grouped_df = answerers_per_question.reset_index().rename(columns={"AnswerOwnerId": "GroundTruth"})

In [105]:
# One dict per test question, with keys QuestionId, QuestionTags, GroundTruth
ground_truth_dic = grouped_df.to_dict("records")

Calculate the results for all questions

In [106]:
import threading
from functools import lru_cache

# Cached recommendation function wrapper
# The cache is unbounded rather than sized from `ground_truth_dic`: the definition no
# longer depends on a notebook global existing first, and an empty ground truth can no
# longer yield maxsize=0 (which would disable caching). Entries are top_n-length
# slices; call `cached_recommendation.cache_clear()` between runs to release them.
@lru_cache(maxsize=None)
def cached_recommendation(tags, recommendation_function, top_n):
    """Return the first `top_n` recommendations for `tags`, memoised per argument triple.

    Parameters
    ----------
    tags : hashable iterable of str
        Question tags; must be hashable (e.g. a tuple) because it is part of the cache key.
    recommendation_function : callable
        Called as `recommendation_function(tuple(tags))`; must return a sliceable ranking.
    top_n : int
        Number of leading entries of the ranking to keep.

    Returns
    -------
    The ranking returned by `recommendation_function`, sliced to `[:top_n]`.
    The same object is returned on a cache hit, so callers should not mutate it.
    """
    return recommendation_function(tuple(tags))[:top_n]

# Threaded version of the evaluation process with caching
def threaded_top_n_accuracy_with_cache(ground_truth_dic, recommendation_function, top_n=5, max_workers=None):
    """Compute top-N accuracy: the share of questions with at least one true answerer recommended.

    Questions are scored on a bounded thread pool instead of one thread per
    question, so large ground-truth lists do not create thousands of threads.

    Parameters
    ----------
    ground_truth_dic : list of dict
        One record per question with keys "QuestionTags" (iterable of tags) and
        "GroundTruth" (iterable of user ids who answered it).
    recommendation_function : callable
        Recommender passed through to `cached_recommendation`.
    top_n : int, default 5
        Number of recommended users considered per question.
    max_workers : int or None, default None
        Size of the thread pool; None uses the ThreadPoolExecutor default.

    Returns
    -------
    float
        Hits divided by the number of questions; 0.0 when there are no questions.

    Raises
    ------
    Exception
        Any error raised while scoring a question is propagated to the caller
        rather than being counted as a miss.
    """
    if not ground_truth_dic:
        return 0.0  # Nothing to score; avoids dividing by zero

    def evaluate_question(question):
        # Get recommendations using cached function
        top_users = cached_recommendation(tuple(question["QuestionTags"]), recommendation_function, top_n)
        recommended_ids = {entry[0] for entry in top_users}

        # 1 when any recommended user actually answered the question, else 0
        return int(not recommended_ids.isdisjoint(question["GroundTruth"]))

    # pool.map re-raises a worker's exception when its result is consumed
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        hits = sum(pool.map(evaluate_question, ground_truth_dic))

    # Calculate accuracy
    return hits / len(ground_truth_dic)


Collaborative

In [107]:
# Cut-offs at which top-N accuracy is reported; `results` collects one row per run
top_n = [10, 20, 50]
results = []

# NOTE(review): only the first 200 ground-truth questions are scored here, whereas the
# content-based and random cells pass the whole `ground_truth_dic` — confirm the
# accuracies are meant to be compared across different sample sizes.
collaborative_sample = ground_truth_dic[:200]
for top in top_n:
    accuracy = threaded_top_n_accuracy_with_cache(collaborative_sample, recommend_users_collaborative, top)
    result = {"acc": accuracy, "top": top, "type": "collaborative"}

    print(result)
    results.append(result)

{'acc': 0.195, 'top': 10, 'type': 'collaborative'}
{'acc': 0.235, 'top': 20, 'type': 'collaborative'}
{'acc': 0.305, 'top': 50, 'type': 'collaborative'}


Content

In [108]:
# Empty the memoisation cache before scoring the content-based recommender
cached_recommendation.cache_clear()
for top in top_n:
    accuracy = threaded_top_n_accuracy_with_cache(ground_truth_dic, recommend_users_content, top)
    result = {"acc": accuracy, "top": top, "type": "content"}
    print(result)
    results.append(result)

KeyboardInterrupt: 

In [None]:
# Persist the accuracy rows collected so far.
# NOTE(review): in notebook order the random-baseline rows are appended to `results`
# after this cell, so they reach the CSV only if this cell is re-run afterwards.
results_frame = pd.DataFrame(results)
results_frame.to_csv("prototype_simple_5tags.csv", index=False)

Random Pick

In [None]:
import random

In [None]:
# Candidate pool for the random baseline: every distinct answerer in the training merge, once
answerer_ids = df_merged["AnswerOwnerId"].to_list()
users = list(set(answerer_ids))

In [None]:
def top_n_accuracy(ground_truth_dic, top_n, seed=None):
    """Random baseline: top-N accuracy when `top_n` users are drawn at random per question.

    Parameters
    ----------
    ground_truth_dic : list of dict
        One record per question; only the "GroundTruth" key (iterable of user
        ids who answered it) is read.
    top_n : int
        Number of users sampled without replacement from the module-level
        `users` list for each question.
    seed : int or None, default None
        When given, sampling uses a dedicated `random.Random(seed)` so the
        result is repeatable; when None, the global `random` state is used.

    Returns
    -------
    float
        Share of questions for which at least one sampled user is in the
        ground truth; 0.0 when there are no questions.

    Raises
    ------
    ValueError
        From `random.sample` when `top_n` exceeds the number of users.
    """
    # A private generator keeps seeded runs repeatable without touching global random state
    rng = random if seed is None else random.Random(seed)

    tp = 0
    for question in ground_truth_dic:
        # Checking if any randomly drawn user is among the true answerers
        users_recommendation = rng.sample(users, top_n)
        if not set(question["GroundTruth"]).isdisjoint(users_recommendation):
            tp += 1

    print(tp)  # number of questions with at least one hit
    if not ground_truth_dic:
        return 0.0  # avoid dividing by zero on an empty evaluation set
    return tp / len(ground_truth_dic)

In [None]:
# Clean cache
cached_recommendation.cache_clear()
for top in top_n:
    result = {
        "acc": top_n_accuracy(ground_truth_dic, top),
        "top": top,
        "type": "random"
    }
    print(result)
    results.append(result)

2
{'acc': 5.553550106905839e-05, 'top': 10, 'type': 'random'}
9
{'acc': 0.00024990975481076276, 'top': 20, 'type': 'random'}
6
{'acc': 0.0001666065032071752, 'top': 50, 'type': 'random'}
