# Rec Sys of Answerers for StackOverflow
## 1 - Collaborative and Content Based Approach

Loading Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast
from tqdm import tqdm
from functools import lru_cache

Loading Data

In [2]:
df_question = pd.read_csv("dataset/questions_2019.csv")
df_answer = pd.read_csv("dataset/answers_2019.csv")

In [3]:
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [4]:
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transfoming list of tags

In [5]:
df_question['QuestionTags'] = df_question['QuestionTags'].apply(ast.literal_eval)

In [None]:
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


: 

: 

: 

: 

: 

### Filtering Dataset to include only top 5 tags

In [None]:
# tags_to_filter = {'java', 'python', 'javascript', 'c#', 'android'}  # Faster lookup structure


# # Filter rows where any tag matches one of the target tags
# df_question = df_question[df_question['QuestionTags'].apply(lambda tags: any(tag in tags_to_filter for tag in tags))]


## Prototype 1 - Approach Collaborative Filtering


- **Goal:** Find answerers to new posted question.
  
- **Approach:** Collaborative Filtering (Item Based) --> Utilize question tags of previous answered questions, to indicate the most appropriate user.

- **Steps:**

1. Create Matrix of User-Tags
2. Calculate cosine similarity between tags
3. Calculate the the score of the user given the number of times they answerered certain question and the similarity of that tag with the the tags of the new question.

### Dividing the question dataset into Train and Test

In [7]:
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
767371,53992215,1758023,Using a ScheduledExecutorService to run a task...,"[java, executorservice, java-threads]",0,2019-01-01 00:00:04,1
767372,53992219,4298538,How to programmatically change style sheet of ...,"[c++, qt, qt5, qtstylesheets, qpushbutton]",1,2019-01-01 00:01:55,2
767373,53992220,6266364,Trying to put website into Maintenance Mode (3...,[],2,2019-01-01 00:02:08,2
767374,53992221,9706003,Node.js Lambda Async return Undefined,"[node.js, lambda]",2,2019-01-01 00:02:27,3
767375,53992223,7035033,Unable to print a class list attribute using i...,"[python, python-3.x, list, class, printing]",0,2019-01-01 00:02:37,2
...,...,...,...,...,...,...,...
156669,56828823,8684836,Embedding localized satellite dll into exe app...,"[c++, winforms, c++-cli, clr]",0,2019-06-30 23:55:26,2
156670,56828825,11721401,"PS/2 keyboard won't send keypress interrupts, ...","[c, assembly, x86, interrupt, osdev]",9,2019-06-30 23:55:45,1
156671,56828831,11721524,How to convert a string to an integer in Power...,"[string, powershell, integer, int]",0,2019-06-30 23:58:22,3
156672,56828833,1889720,How do I return the results from a query in Fi...,"[node.js, typescript, firebase, google-cloud-f...",1,2019-06-30 23:58:37,2


In [6]:
# Ordering dataframe before split
# Convert 'QuestionCreationDate' to datetime format using .loc
df_question.loc[:, 'QuestionCreationDate'] = pd.to_datetime(df_question['QuestionCreationDate'])

# Sort the DataFrame by 'QuestionCreationDate' without using inplace
df_question = df_question.sort_values(by='QuestionCreationDate')

In [8]:
# Determine the split index
split_index = int(len(df_question) * 0.9) #10% for test 

# Split the DataFrame into training and testing sets
train_data = df_question.iloc[:split_index]
test_data = df_question.iloc[split_index:]

In [9]:
test_data

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
568215,56546965,11631967,Percentage for each subgroup,[powerbi],2,2019-06-11 15:22:52,1
568216,56546966,10744078,Managing floating point precision - Best pract...,"[python, pandas]",0,2019-06-11 15:23:01,0
568217,56546968,4855106,Is the json import failing because of the JSON...,"[json, python-3.x, visual-studio-code]",0,2019-06-11 15:23:12,0
568218,56546969,9877445,ggplot add text to the center of a donut chart...,"[r, ggplot2, text, data-visualization, donut-c...",2,2019-06-11 15:23:14,1
568219,56546970,11379139,"In Angular application, I keep getting this er...","[node.js, angular]",2,2019-06-11 15:23:16,1
...,...,...,...,...,...,...,...
156669,56828823,8684836,Embedding localized satellite dll into exe app...,"[c++, winforms, c++-cli, clr]",0,2019-06-30 23:55:26,2
156670,56828825,11721401,"PS/2 keyboard won't send keypress interrupts, ...","[c, assembly, x86, interrupt, osdev]",9,2019-06-30 23:55:45,1
156671,56828831,11721524,How to convert a string to an integer in Power...,"[string, powershell, integer, int]",0,2019-06-30 23:58:22,3
156672,56828833,1889720,How do I return the results from a query in Fi...,"[node.js, typescript, firebase, google-cloud-f...",1,2019-06-30 23:58:37,2


Merging QuestionTags to UserAnswerId

In [10]:
df_merged = train_data[["QuestionId","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )

In [18]:
df_merged["QuestionId"].nunique()

650900

Getting all tags of a user -> for checking purposes

In [144]:
# df_grouped_tags = (
#     df_merged.groupby("AnswerOwnerId")["QuestionTags"]
#     .apply(lambda tags: list(set(tag for sublist in tags for tag in sublist)))  # Flatten and deduplicate tags
#     .reset_index()
# )

# df_grouped_tags.columns = ["AnswerOwnerId", "CombinedTags"]

### Indexing tags and users

In [145]:
dic_merged = df_merged.to_dict(orient="records")

In [146]:
unique_tags = list({tag for question in dic_merged for tag in question['QuestionTags']})
unique_users = list({question['AnswerOwnerId'] for question in dic_merged})

tag_to_index = {tag: idx for idx, tag in enumerate(unique_tags)}
user_to_index = {user: idx for idx, user in enumerate(unique_users)}

### Creating User-Tag Matrix

In [147]:
# Step 2: Create a Sparse User-Tag Matrix
rows, cols, data = [], [], []
for question in dic_merged:
    user_idx = user_to_index[question['AnswerOwnerId']]
    for tag in question['QuestionTags']:
        tag_idx = tag_to_index[tag]
        rows.append(user_idx)
        cols.append(tag_idx)
        data.append(1)

# Create a sparse matrix 
user_tag_matrix_sparse = csr_matrix((data, (rows, cols)), shape=(len(unique_users), len(unique_tags)))


### Calculating Tags Similarity Matrix

In [148]:
# Compute Tag Similarity
cos_sim_matrix = cosine_similarity(user_tag_matrix_sparse.T)

### Recommending User

In [149]:
## Example to 1 question
# tags = test_data["QuestionTags"].iloc[0]
# tags

tags = ["r", "radar-chart", "spider-chart"]

In [150]:
# Divide by the user activity (Normalization to avoid bias)
user_activity = user_tag_matrix_sparse.sum(axis=1).A1  # Total user interactions


In [151]:
user_activity

array([  3,   8,   2, ..., 859,  25,   3])

In [152]:
user_average_activity = np.array(user_tag_matrix_sparse.mean(axis=1)).flatten()

In [153]:
def recommend_users_collaborative(tags):
    # Get indices of the tags
    tag_indices = [tag_to_index[tag] for tag in tags if tag in tag_to_index]
    if len(tag_indices)==0:
        return [] ## No tag found to make the suggestion
    
    # Combine similarities for the requested tags
    similarity_tags = cos_sim_matrix[:, tag_indices] #selected tags
    # combined_similarity = np.sum(similarity_tags, axis=1)
    # combined_similarity = np.mean(similarity_tags, axis=1)
    combined_similarity = np.prod(similarity_tags, axis=1) # Best values

    # Compute weighted user scores using item-based collaborative filtering
    numerator = user_tag_matrix_sparse.dot(combined_similarity)  # Weighted sum of ratings
    denominator = np.sum(combined_similarity)  # Correct sum of similarity scores

    # Avoid division by zero
    user_scores_vector = np.where(denominator != 0, numerator / denominator, 0)  

    # Adjust scores using user average ratings
    user_scores_vector += user_average_activity  # Incorporating user mean rating


    # Map scores to users
    user_scores = {unique_users[idx]: score for idx, score in enumerate(user_scores_vector)}

    # Rank users by scores
    ranked_users = sorted(user_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_users

In [154]:
return_1 = recommend_users_collaborative(tags)

## Prototype 2 - Approach Content Based-Filtering


- **Goal:** Find answerers to new posted question.
  
- **Approach:** Content Based-Filtering --> Utilize question tags of previous answered questions, to indicate the most appropriate user.

- **Steps:**

1. Create Matrix of User-Tags
2. Create vector of to the new tag based on the Matrix of User-tags 
3. Calculate cosine similarity for each user with vector created in step 2
4. Order list to show the most recommended users

In [155]:
# We are going start using some User-Tag matrix of the previous approach

def recommend_users_content(tags):
    # Get index of the tags
    tag_indices = [tag_to_index[tag] for tag in tags if tag in tag_to_index]
    if len(tag_indices)==0:
        return [] ## No tag found to make the suggestion
            
    # Create vector of the tags in question
    question_vector = np.zeros(user_tag_matrix_sparse.shape[1])
    question_vector[tag_indices] = 1
    question_vector= question_vector.reshape(1, -1) #Reshaping array to calculate similarity

    user_scores_vector = cosine_similarity(user_tag_matrix_sparse, question_vector)
    user_scores_vector = user_scores_vector.flatten()

    # Map scores to users
    user_scores = {unique_users[idx]: score for idx, score in enumerate(user_scores_vector)}

    # Rank users by scores
    ranked_users = sorted(user_scores.items(), key=lambda x: x[1], reverse=True)
    # return ranked_users[:10]
    return ranked_users


In [156]:
return_2 = recommend_users_content(tags)

## Evaluation

Top-N Accuracy

In [19]:
df_merged_test = test_data[["QuestionId","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )

Joining the Users to form a single ground truth

In [20]:
# Convert QuestionTags from lists to tuples
df_merged_test["QuestionTags"] = df_merged_test["QuestionTags"].apply(tuple)

# Group AnswerOwnerId by QuestionId and QuestionTags
grouped_df = (
    df_merged_test.groupby(["QuestionId", "QuestionTags"])["AnswerOwnerId"]
    .apply(list)  # Combine AnswerOwnerId values into a list
    .reset_index()
    .rename(columns={"AnswerOwnerId": "GroundTruth"})
)

In [21]:
df_merged_test["QuestionId"].nunique()

71983

In [159]:
ground_truth_dic = grouped_df.to_dict(orient='records')

Calculate the results for all questions

In [160]:
# def top_n_accuracy(users_ground_truth_list, users_recommendation_list, top_n):
#     tp = 0
    
#     for users_ground_truth, users_rec in zip(users_ground_truth_list, users_recommendation_list):    
#         # Checking if it is in the answer
#         users_recommendation = users_rec[:top_n]
#         if set(users_recommendation) & set(users_ground_truth):
#             tp += 1
            
#     print(tp)
#     return tp/len(users_ground_truth_list)

In [161]:
def evaluate_recommendations(users_ground_truth_list, users_recommendation_list, top_n):

    total_recall = 0
    successful_recommendations = 0
    num_questions = len(users_ground_truth_list)

    for users_ground_truth, users_rec in zip(users_ground_truth_list, users_recommendation_list):
        # Get the top-N recommended users
        users_recommendation = users_rec[:top_n]
        # Convert to sets for intersection
        recommended_set = set(users_recommendation)
        ground_truth_set = set(users_ground_truth)
        # Calculate the number of relevant users in the top-N recommendations
        true_positives = len(recommended_set & ground_truth_set)
        # Recall@N: Proportion of relevant users that are recommended in top-N
        recall = true_positives / len(ground_truth_set) if ground_truth_set else 0
        # Accumulate recall
        total_recall += recall
        # Accuracy@N: Check if there's at least one relevant user in the top-N recommendations
        if true_positives > 0:
            successful_recommendations += 1

    # Calculate average recall over all questions
    avg_recall = total_recall / num_questions if num_questions > 0 else 0
    # Calculate accuracy over all questions
    accuracy = successful_recommendations / num_questions if num_questions > 0 else 0

    return avg_recall, accuracy

In [162]:
users_ground_truth = [ground_truth_item["GroundTruth"] for ground_truth_item in ground_truth_dic]
tags_ground_truth = [tuple(sorted(ground_truth_item["QuestionTags"])) for ground_truth_item in ground_truth_dic]

Collaborative

In [163]:
top_n = [5, 10, 20, 50, 100]

users_recommendation_list = []
for tags_tuple in tqdm(tags_ground_truth, desc="Processing Recommendations"):
    recommendation = recommend_users_collaborative(tags_tuple)[:max(top_n)]
    users_rec = [rec[0] for rec in recommendation]
    users_recommendation_list.append(users_rec)

  user_scores_vector = np.where(denominator != 0, numerator / denominator, 0)
Processing Recommendations: 100%|██████████| 71983/71983 [7:04:33<00:00,  2.83it/s]   


In [164]:
# Evaluate Recommendation
results = []

for top in top_n:
    recall, acc = evaluate_recommendations(users_ground_truth, users_recommendation_list, top)
    result = {
        "acc": acc,
        "recall": recall,
        "top": top,
        "type": "collaborative"
    }
    
    print(result)
    results.append(result)

{'acc': 0.14276982065209842, 'recall': 0.11186542374776025, 'top': 5, 'type': 'collaborative'}
{'acc': 0.18357112095911535, 'recall': 0.14531995052289914, 'top': 10, 'type': 'collaborative'}
{'acc': 0.23026270091549395, 'recall': 0.1831991424912531, 'top': 20, 'type': 'collaborative'}
{'acc': 0.3035994609838434, 'recall': 0.24429204196146714, 'top': 50, 'type': 'collaborative'}
{'acc': 0.3599322062153564, 'recall': 0.2928825330744972, 'top': 100, 'type': 'collaborative'}


Content

In [31]:
top_n = [5, 10, 20, 50, 100]

users_recommendation_list_content = []
for tags_tuple in tqdm(tags_ground_truth, desc="Processing Recommendations"):
    recommendation = recommend_users_content(tags_tuple)[:max(top_n)]
    users_rec = [rec[0] for rec in recommendation]
    users_recommendation_list_content.append(users_rec)

Processing Recommendations: 100%|██████████| 71983/71983 [2:08:41<00:00,  9.32it/s]  


In [35]:
# Evaluate Recommendation
for top in top_n:
    recall, acc = evaluate_recommendations(users_ground_truth, users_recommendation_list_content, top)
    result = {
        "acc": acc,
        "recall": recall,
        "top": top,
        "type": "content"
    }
    
    print(result)
    results.append(result)

{'acc': 0.0323965380714891, 'recall': 0.027661734064634715, 'top': 5, 'type': 'content'}
{'acc': 0.05451287109456399, 'recall': 0.04647207266089512, 'top': 10, 'type': 'content'}
{'acc': 0.08392259283441923, 'recall': 0.07104401744768207, 'top': 20, 'type': 'content'}
{'acc': 0.13562924579414584, 'recall': 0.11485829406675252, 'top': 50, 'type': 'content'}
{'acc': 0.18982259700206994, 'recall': 0.15924446583636914, 'top': 100, 'type': 'content'}


Random Pick

In [64]:
df_merged

Unnamed: 0,QuestionId,QuestionTags,AnswerOwnerId
0,53992215,"[java, executorservice, java-threads]",5004157
1,53992219,"[c++, qt, qt5, qtstylesheets, qpushbutton]",10605013
2,53992219,"[c++, qt, qt5, qtstylesheets, qpushbutton]",6622587
3,53992220,[],369434
4,53992220,[],6266364
...,...,...,...
996212,56546947,"[authentication, microsoft-graph-api, access-t...",114029
996213,56546948,"[r, shiny]",11427002
996214,56546958,"[php, laravel, laravel-5, guzzle, laravel-5.8]",11824426
996215,56546958,"[php, laravel, laravel-5, guzzle, laravel-5.8]",9687794


In [68]:
answer_distribution = df_merged.groupby("AnswerOwnerId").size().reset_index(name="AnswerCount")
answer_distribution = answer_distribution.sort_values(by="AnswerCount", ascending=False)
top_user_100 = answer_distribution["AnswerOwnerId"].to_list()


In [None]:
def recommend_users_collaborative(tags):
    # Get indices of the tags
    tag_indices = [tag_to_index[tag] for tag in tags if tag in tag_to_index]
    if len(tag_indices)==0:
        return [] ## No tag found to make the suggestion
    
    # Combine similarities for the requested tags
    similarity_tags = cos_sim_matrix[:, tag_indices] #selected tags
    # combined_similarity = np.sum(similarity_tags, axis=1)
    # combined_similarity = np.mean(similarity_tags, axis=1)
    combined_similarity = np.prod(similarity_tags, axis=1) # Best values

    # Compute weighted user scores using item-based collaborative filtering
    numerator = user_tag_matrix_sparse.dot(combined_similarity)  # Weighted sum of ratings
    denominator = np.sum(combined_similarity)  # Correct sum of similarity scores

    # Avoid division by zero
    user_scores_vector = np.where(denominator != 0, numerator / denominator, 0)  

    # Adjust scores using user average ratings
    user_scores_vector += user_average_activity  # Incorporating user mean rating


    # Map scores to users
    user_scores = {unique_users[idx]: score for idx, score in enumerate(user_scores_vector)}

    # Rank users by scores
    ranked_users = sorted(user_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_users

In [None]:
import random
users = list(set(df_merged["AnswerOwnerId"].to_list()))
sample = [top_user_100 for i in range(len(users_ground_truth))]

for top in top_n:
    result = {
        "acc": evaluate_recommendations(users_ground_truth, sample, top),
        "top": top,
        "type": "top_users"
    }
    print(result)
    results.append(result)

{'acc': (0.014166666666666666, 0.02), 'top': 5, 'type': 'random'}
{'acc': (0.019566666666666663, 0.028), 'top': 10, 'type': 'random'}
{'acc': (0.021566666666666668, 0.03), 'top': 20, 'type': 'random'}
{'acc': (0.041133333333333334, 0.054), 'top': 50, 'type': 'random'}
{'acc': (0.06988333333333335, 0.09), 'top': 100, 'type': 'random'}


Saving Results

In [165]:
pd.DataFrame(results).to_csv("setup1_1.csv", index=False)