# Rec Sys of Answerers for StackOverflow
## Version 1.0

Loading Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
from sklearn.model_selection import train_test_split
import ast


Loading Data

In [2]:
df_question = pd.read_csv("dataset/questions_2019.csv")
df_answer = pd.read_csv("dataset/answers_2019.csv")

In [3]:
df_question.head()

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,['r'],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"['functional-programming', 'lodash']",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"['webpack', 'webpack-dev-server']",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,['c#'],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"['c#', 'winforms', 'charts', 'column-chart']",0,2019-03-01 00:02:47,0


In [4]:
df_answer.head()

Unnamed: 0,QuestionId,AnswerOwnerId,AnswerVotes,AnswerCreationDate
0,56140111,10245958,3,2019-05-15 00:07:23
1,56140157,1226963,0,2019-05-15 00:12:34
2,56140125,6841773,0,2019-05-15 00:12:44
3,56140150,1440565,5,2019-05-15 00:12:52
4,56140150,11015427,15,2019-05-15 00:16:46


Transfoming list of tags

In [5]:
df_question['QuestionTags'] = df_question['QuestionTags'].apply(ast.literal_eval)

In [6]:
df_question

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
0,54936100,3419772,R 3.5.2: Error in loading stock data from zoo,[r],0,2019-03-01 00:00:10,1
1,54936106,3997132,Different behaviour of range with lodash/fp,"[functional-programming, lodash]",1,2019-03-01 00:01:48,1
2,54936108,4992551,webPack dev server proxy rewrite URLs in response,"[webpack, webpack-dev-server]",2,2019-03-01 00:01:55,1
3,54936109,2239552,EF Core 2.0 Global Filter,[c#],0,2019-03-01 00:02:36,0
4,54936112,5505171,Clustered column chart in C# using Chart in Wi...,"[c#, winforms, charts, column-chart]",0,2019-03-01 00:02:47,0
...,...,...,...,...,...,...,...
878713,54272409,10245420,Selenium iframe Data issue - python,"[python, selenium, selenium-webdriver, iframe]",1,2019-01-19 23:57:14,1
878714,54272411,8906835,Problem on adding Bootstrap carousel component,"[bootstrap-4, bootstrap-carousel]",0,2019-01-19 23:57:26,1
878715,54272412,10893334,NativeScript Vue nativescript-sqlite cannot as...,"[sqlite, nativescript, nativescript-vue]",0,2019-01-19 23:57:36,1
878716,54272414,10564619,How to fix 'TypeError' that comes up for some ...,"[python, typeerror]",1,2019-01-19 23:57:54,1


## Prototype 1

- **Goal:** Find answerers to new posted question.
  
- **Approach:** Collaborative Filtering (Item Based) --> Utilize question tags of previous answered questions, to indicate the most appropriate user.

- **Steps:**

1. Create Matrix of User-Tags

### Dividing the question dataset into Train and Test

In [7]:
train_data, test_data = train_test_split(df_question, test_size=0.1, random_state=42)

Merging QuestionTags to UserAnswerId

In [8]:
df_merged = train_data[["QuestionId","QuestionTags"]].merge(df_answer[["QuestionId", "AnswerOwnerId"]], how='inner', on = "QuestionId", )

Getting all tags of a user -> for checking purposes

In [56]:
df_grouped_tags = (
    df_merged.groupby("AnswerOwnerId")["QuestionTags"]
    .apply(lambda tags: list(set(tag for sublist in tags for tag in sublist)))  # Flatten and deduplicate tags
    .reset_index()
)

df_grouped_tags.columns = ["AnswerOwnerId", "CombinedTags"]

In [64]:
df_grouped_tags[df_grouped_tags["AnswerOwnerId"] == 10221765]["CombinedTags"].iloc[0]


['big-o',
 'date',
 'polymer-3.x',
 'json',
 'firebase-authentication',
 'submit',
 'syntax',
 'logical-operators',
 'api',
 'angularjs',
 'node-kafka',
 'nested',
 'dry',
 'electron',
 'digits',
 'menu',
 'conditional-statements',
 'return',
 'es6-modules',
 'fizzbuzz',
 'decorator',
 'eslint',
 'hoisting',
 'screen-rotation',
 'indexof',
 'autocomplete',
 'ecmascript-2018',
 'iframe',
 'indexing',
 'for-loop',
 'key',
 'frontend',
 'styles',
 'this',
 'href',
 'primefaces',
 'express',
 'dom-events',
 'compiler-optimization',
 'semantic-ui',
 'p5.js',
 'document-ready',
 'parseint',
 'console',
 'replace',
 'prompt',
 'es6-class',
 'variable-declaration',
 'html',
 'nodemailer',
 'html-table',
 'switch-statement',
 'logic',
 'drop-down-menu',
 'finally',
 'csv',
 'grouping',
 'arrays',
 'array.prototype.map',
 'sql',
 'output',
 'google-cloud-spanner',
 'node.js',
 'jsx',
 'babel-jest',
 'template-literals',
 'mustache',
 'google-cloud-firestore',
 'destructuring',
 'react-redux',
 '

In [None]:
1943329

### Indexing tags and users

In [10]:
dic_merged = df_merged.to_dict(orient="records")

In [11]:
unique_tags = list({tag for question in dic_merged for tag in question['QuestionTags']})
unique_users = list({question['AnswerOwnerId'] for question in dic_merged})

tag_to_index = {tag: idx for idx, tag in enumerate(unique_tags)}
user_to_index = {user: idx for idx, user in enumerate(unique_users)}

### Creating User-Tag Matrix

In [12]:
# Step 2: Create a Sparse User-Tag Matrix
rows, cols, data = [], [], []
for question in dic_merged:
    user_idx = user_to_index[question['AnswerOwnerId']]
    for tag in question['QuestionTags']:
        tag_idx = tag_to_index[tag]
        rows.append(user_idx)
        cols.append(tag_idx)
        data.append(1)

# Create a sparse matrix 
user_tag_matrix_sparse = csr_matrix((data, (rows, cols)), shape=(len(unique_users), len(unique_tags)))


### Calculating Tags Similarity Matrix

In [13]:
# Compute Tag Similarity
cos_sim_matrix = cosine_similarity(user_tag_matrix_sparse.T)

### Recommending User

In [15]:
test_data

Unnamed: 0,QuestionId,QuestionOwnerId,QuestionTitle,QuestionTags,QuestionVotes,QuestionCreationDate,AnswerCount
275130,56705914,11659955,How to style my react select when it is disabl...,"[javascript, html, css, reactjs]",0,2019-06-21 15:00:47,1
623441,55720363,2579263,Serializing into dictionary Stored Procedure r...,"[python, stored-procedures, flask, sqlalchemy,...",0,2019-04-17 05:17:42,1
603387,55655622,11199969,Refresh page from another (page),"[javascript, php, jquery, html, ajax]",3,2019-04-12 16:05:10,2
351087,56204717,11519243,Slow loading web page served by Node.js HTTP s...,"[node.js, video.js, httpserver]",1,2019-05-19 04:18:20,1
44361,55080751,11176740,Castle Windsor 5.0 PerWebRequest Lifestyle,[asp.net],2,2019-03-09 18:39:45,1
...,...,...,...,...,...,...,...
655321,54666432,1090791,What does interface annotated with jsr305 Thre...,"[java, jsr305]",-1,2019-02-13 09:16:44,2
673240,54725077,9972840,How to add a color gradient to a radar plot ba...,"[r, radar-chart, spider-chart]",1,2019-02-16 16:16:22,0
212070,55915405,11183751,Convert to nginx rules,"[nginx, url-rewriting]",1,2019-04-30 06:52:46,0
185854,56393739,11548820,write function not working properly in python,"[python, python-3.x]",0,2019-05-31 10:52:55,2


In [17]:
## Example to 1 question
tags = test_data["QuestionTags"].iloc[0]
tags

['javascript', 'html', 'css', 'reactjs']

In [71]:
def recommend_users(tags, top_n=10):
    # Get indices of the tags
    tag_indices = [tag_to_index[tag] for tag in tags if tag in tag_to_index]
    if len(tag_indices)==0:
        return [] ## No tag found to make the suggestion
    
    # Combine similarities for the requested tags
    similarity_tags = [cos_sim_matrix[:, tag_index] for tag_index in tag_indices]
    combined_similarity = sum(similarity_tags)

    # Compute user scores 
    user_scores_vector = user_tag_matrix_sparse.dot(combined_similarity)

    # Map scores to users
    user_scores = {unique_users[idx]: score for idx, score in enumerate(user_scores_vector)}

    # Rank users by scores
    ranked_users = sorted(user_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_users[:top_n]

In [69]:
recommend_users(tags)

[(10221765, 6961.702194773498),
 (1447675, 4605.880020042108),
 (8620333, 4087.286736965725),
 (9819146, 3710.3940115563155),
 (9515207, 3539.796342985606),
 (9624435, 2442.6682320166447),
 (19068, 2440.0811180705978),
 (6766919, 2413.3835273305526),
 (5260024, 2327.524737705065),
 (157247, 2152.4099257192697)]