In [481]:
import os
import ast
import pandas as pd
import psycopg2
from collections import defaultdict
from sqlalchemy import create_engine
from urllib.parse import quote_plus

nr_of_recommendations = 50

In [482]:
# connection details
db_user = 'postgres'
db_password = 'admin'
db_host = 'localhost'
db_port = '5432'
db_name = 'AOI'

# format special characters
password = quote_plus(db_password)

conn = psycopg2.connect(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port
)

# SQLAlchemy engine
engine = create_engine(f'postgresql://{db_user}:{password}@{db_host}:{db_port}/{db_name}')

# PostgreSQL tables -> pandas DataFrames
researchers_query = 'SELECT "Researcher ID", "Full Name", "Expertise", "Appreciated", "Random Recommendation" FROM researchers_table'
articles_query = 'SELECT id, title, authors, url, subject_split FROM articles_table'

researchers_df = pd.read_sql(researchers_query, engine)
articles_df = pd.read_sql(articles_query, engine)

# researchers_df.head(33)
articles_df.head(10)

Unnamed: 0,id,title,authors,url,subject_split
0,arXiv:2310.03720,HeaP: Hierarchical Policies for Web Actions us...,"Paloma Sodhi, S.R.K. Branavan, Ryan McDonald",https://arxiv.org/pdf/2310.03720.pdf,['Machine Learning']
1,arXiv:2310.02794,Stability Improvements for Fast Matrix Multipl...,"Charlotte Vermeylen, Marc Van Barel",https://arxiv.org/pdf/2310.02794.pdf,['Numerical Analysis']
2,arXiv:2310.02656,Blend: A Unified Data Discovery System,"Mahdi Esmailoghli, Christoph Schnell, Renée J....",https://arxiv.org/pdf/2310.02656.pdf,['Databases']
3,arXiv:2310.00977,Position Sensing Errors in Synchronous Motor D...,Prerit Pramod,https://arxiv.org/pdf/2310.00977.pdf,"['Systems', 'Control']"
4,arXiv:2310.00605,The Generalized Matrix Norm Problem,Adrian Kulmburg,https://arxiv.org/pdf/2310.00605.pdf,['Numerical Analysis']
5,arXiv:2310.00105,Latent Space Symmetry Discovery,"Jianke Yang, Nima Dehmamy, Robin Walters, Rose Yu",https://arxiv.org/pdf/2310.00105.pdf,['Machine Learning']
6,arXiv:2310.00073,Multi-Objective Sparse Sensing with Ergodic Op...,"Ananya Rao, Howie Choset",https://arxiv.org/pdf/2310.00073.pdf,"['Robotics', 'Optimization']"
7,arXiv:2310.00817,Learning to Make Adherence-Aware Advice,"Guanting Chen, Xiaocheng Li, Chunlin Sun, Hanz...",https://arxiv.org/pdf/2310.00817.pdf,['Machine Learning']
8,arXiv:2309.16809,GraB-sampler: Optimal Permutation-based SGD Da...,Guanghao Wei,https://arxiv.org/pdf/2309.16809.pdf,['Machine Learning']
9,arXiv:2310.08450,Monotone discretizations of levelset convex ge...,"Jeff Calder, Wonjun Lee",https://arxiv.org/pdf/2310.08450.pdf,['Numerical Analysis']


In [483]:
def find_most_matches(target_researcher_id, researchers_df):

    max_common_count = 0
    most_common_person = 'Not Found'

    target_appreciated_articles = researchers_df.loc[researchers_df['Researcher ID'] == target_researcher_id, 'Appreciated'].values[0].split(', ')
    # target_appreciated_articles.extend(researchers_df.loc[researchers_df['Researcher ID'] == target_researcher_id, 'Random Recommendation'].values[0].split(', '))

    for _, researcher in researchers_df.iterrows():
        if researcher['Researcher ID'] != target_researcher_id:

            other_appreciated_articles = researcher['Appreciated'].split(', ')
            # other_appreciated_articles.extend(researcher['Random Recommendation'].split(', '))
            # print("target ", target_appreciated_articles)
            # print("other ", other_appreciated_articles)

            common_articles = set(other_appreciated_articles) & set(target_appreciated_articles)
            common_count = len(common_articles)

            if common_count > max_common_count and common_count < nr_of_recommendations:
                max_common_count = common_count
                most_common_person = researcher['Researcher ID']

                
    return most_common_person, max_common_count

In [484]:
best_matches = defaultdict()
nr_of_matches = defaultdict()

for index, researcher in researchers_df.iterrows():

    researcher_id = researcher['Researcher ID']
    most_common_person, max_common_count = find_most_matches(target_researcher_id=researcher['Researcher ID'], researchers_df=researchers_df)

    if max_common_count != 0:
        found_person_name = researchers_df.loc[researchers_df['Researcher ID'] == most_common_person, 'Full Name'].values[0]
        found_person_id = researchers_df.loc[researchers_df['Researcher ID'] == most_common_person, 'Researcher ID'].values[0]
        print(f"For researcher {researcher['Full Name']}, the best match is {found_person_name} with a match of {max_common_count} common articles")
        
        best_matches[researcher_id] = found_person_id
        nr_of_matches[researcher_id] = max_common_count

    else:
        print(f"No match found for {researcher['Full Name']}")
        best_matches[researcher_id] = 'None'



For researcher ROSNER, Daniel, the best match is MOISESCU, Mihnea Alexandru with a match of 9 common articles
For researcher RUSETI, Stefan, the best match is MOCANU, Mariana Ionela with a match of 41 common articles
For researcher TAPUS, Nicolae, the best match is NEGRU, Catalin with a match of 40 common articles
For researcher CIRTOAJE, Cristina, the best match is FLOREA, Adina Magda with a match of 1 common articles
For researcher REBEDEA, Traian, the best match is CHIRU, Costin Gabriel with a match of 11 common articles
For researcher MOCANU, Irina, the best match is CHIRU, Costin Gabriel with a match of 4 common articles
For researcher DEACONESCU, Razvan, the best match is TIGANOAIA, Bogdan with a match of 1 common articles
For researcher DASCALU, Mihai, the best match is CHIRU, Costin Gabriel with a match of 16 common articles
For researcher MOCANU, Bogdan Costel, the best match is TAPUS, Nicolae with a match of 39 common articles
For researcher DOBRE, Ciprian Mihai, the best mat

### Save best match to database

In [485]:
conn = psycopg2.connect(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port
)
# cursor object
cursor = conn.cursor()

for index, researcher in researchers_df.iterrows():
    researcher_id = researcher['Researcher ID']
    match_id = best_matches[researcher_id]
    # print(f"For researcher with id {researcher_id} the best match is {match_id}")
    update_query = f"UPDATE researchers_table SET \"Best Match ID\" = '{match_id}' WHERE \"Researcher ID\" = '{researcher_id}'"
    cursor.execute(update_query)

conn.commit()

cursor.close()
conn.close()

## Hybrid Recommendation

### Fetch new changes

In [486]:
researchers_query = 'SELECT * FROM researchers_table ORDER BY \"Full Name\"'
researchers_df = pd.read_sql(researchers_query, engine)
researchers_df.head(5)

Unnamed: 0,Researcher ID,Full Name,Expertise,Appreciated,Random Recommendation,Best Match ID
0,AAG-9392-2021,"APOSTOL, Elena Simona","['Distributed systems', 'IT security', 'Parall...","[""arXiv:2310.02113"", ""arXiv:2310.05269"", ""arXi...",[],D-7296-2012
1,JCE-1061-2023,"CARABAS, Costin","['Computer Science', 'Software Engineering', '...","[""arXiv:2310.00562"", ""arXiv:2310.03736"", ""arXi...",[],E-4073-2016
2,AAY-5210-2020,"CHIRU, Costin Gabriel","['NLP', 'Machine learning', 'Artificial intell...","[""arXiv:2310.02357"", ""arXiv:2310.14261"", ""arXi...",[],O-4984-2014
3,C-5751-2009,"CIRTOAJE, Cristina","['Liquid crystal', 'Liquid crystals, polarised...","[""arXiv:2310.04022"", ""arXiv:2310.10524"", ""arXi...",[],G-5326-2016
4,O-4984-2014,"DASCALU, Mihai","['NLP', 'Discourse analysis', 'Learning analyt...","[""arXiv:2310.02357"", ""arXiv:2310.00603"", ""arXi...",[],AAY-5210-2020


In [487]:
def assign_weights(interests):

    num_interests = len(interests)
    weights = []
    weight = 0.9

    for i in range(num_interests):
        weights.append((interests[i], round(weight, 3)))
        weight *= 0.8
    
    return weights

In [488]:
def compute_relevance(article_subjects, researcher_interests):
    
    researcher_interests_dict = dict(researcher_interests)
    matched_subjects = []    
    relevance_score = 0
    
    for subject in article_subjects:
        for interest, weight in researcher_interests_dict.items():
            if (interest.lower() == subject.lower()) or (subject.lower() == "security" and interest.lower() == "it security"):
                relevance_score += weight
                matched_subjects.append(interest)
    
    return relevance_score, matched_subjects


In [489]:
content_recom_filepath = os.path.join(os.getcwd(), "Hybrid_Recommendation_Results.md")
top_articles = defaultdict()

with open(content_recom_filepath, 'w', encoding='utf-8') as file:

    for index, researcher in researchers_df.iterrows():

        current_id = researcher['Researcher ID']
        current_name = researcher['Full Name']
        current_expertise = eval(researcher['Expertise'])
        current_appreciated = set(eval(researcher['Appreciated']))
        
        match_id = researcher['Best Match ID']
        if match_id != 'None':
            match_name = researchers_df.loc[researchers_df['Researcher ID'] == match_id, 'Full Name'].values[0]
            match_expertise = eval(researchers_df.loc[researchers_df['Researcher ID'] == match_id, 'Expertise'].values[0])
            match_appreciated = set(eval(researchers_df.loc[researchers_df['Researcher ID'] == match_id, 'Appreciated'].values[0]))

            possible_recommendations = match_appreciated - current_appreciated
            # print(len(possible_recommendations))
            filtered_articles = articles_df[articles_df['id'].isin(possible_recommendations)]
            # print(filtered_articles['title'])

            num_interests = len(current_expertise)
            weighted_interests = assign_weights(current_expertise)
            articles_relevance = []


            for index_a, article in filtered_articles.iterrows():
                article_id = article['id']
                article_subjects_as_string = article['subject_split']
                article_subjects = ast.literal_eval(article_subjects_as_string)
                # compute relevance score for current article
                relevance_score, matched_subjects = compute_relevance(article_subjects, weighted_interests)
                articles_relevance.append((article, relevance_score, matched_subjects))


            # only keep articles with relevance_score > 0
            non_zero_articles = [art for art in articles_relevance if art[1] > 0]
            # sort articles based on relevance score
            non_zero_articles.sort(key=lambda x: x[1], reverse=True)

            top_articles[current_id] = non_zero_articles

            
            file.write("\n")
            file.write(f"# Researcher {current_name}\n")
            file.write(f"### **Expertise List:** {' | '.join(current_expertise)}\n\n")
            file.write(f"## **Best match: Researcher {match_name}** with **{nr_of_matches[current_id]} matched appreciated articles**\n")
            file.write(f"### Their expertise list: {' | '.join(match_expertise)}\n")
            file.write(f"## **Hybrid Recommendations** for **{current_name}**\n")
            file.write("| Nr | ID | URL | Title | Relevance Score | Matched Subjects | Article Subjects |\n")
            file.write("| --- | --- | --- | --- | --- | --- | --- | \n")
            index = 1
            for article, relevance_score, matched_subjects in top_articles[current_id]:
                file.write(f"| {index} | {article['id']} | {article['url']} | {article['title']} | {relevance_score} | {matched_subjects} | {article['subject_split']} |\n")
                index += 1
            file.write("\n\n\n")
                
        

    # print(f"Researcher {current_name} with id {current_id} has an expertise in {current_expertise} and is best matched with {match_id}")
