In [22]:
import os
import ast
import json
import random
import psycopg2
import pandas as pd
from collections import defaultdict
from sqlalchemy import create_engine
from urllib.parse import quote_plus

nr_of_recommendations = 50

In [23]:
# connection details -- taken from the environment when available so real
# credentials do not have to be committed with the notebook; the fallbacks are
# the local development values this notebook was written against
db_user = os.environ.get('AOI_DB_USER', 'postgres')
db_password = os.environ.get('AOI_DB_PASSWORD', 'admin')
db_host = os.environ.get('AOI_DB_HOST', 'localhost')
db_port = os.environ.get('AOI_DB_PORT', '5432')
db_name = os.environ.get('AOI_DB_NAME', 'AOI')

# format special characters (both user and password end up inside the URL)
password = quote_plus(db_password)

# NOTE: no psycopg2 connection is opened here. The reads below go through the
# SQLAlchemy engine, and the cell that writes back to the database opens (and
# closes) its own connection, so a connection created here would only sit idle
# and leak a server session for the lifetime of the kernel.

# SQLAlchemy engine
engine = create_engine(f'postgresql://{quote_plus(db_user)}:{password}@{db_host}:{db_port}/{db_name}')

# PostgreSQL tables -> pandas DataFrames
researchers_query = 'SELECT * FROM researchers_table ORDER BY "Full Name"'
articles_query = 'SELECT * FROM articles_table'

researchers_df = pd.read_sql(researchers_query, engine)
articles_df = pd.read_sql(articles_query, engine)

researchers_df.head()


Unnamed: 0,Researcher ID,Full Name,Expertise,Appreciated,Random Recommendation,Best Match ID
0,AAG-9392-2021,"APOSTOL, Elena Simona","['Distributed systems', 'IT security', 'Parall...","[""arXiv:2310.02113"", ""arXiv:2310.05269"", ""arXi...",[],D-7296-2012
1,JCE-1061-2023,"CARABAS, Costin","['Computer Science', 'Software Engineering', '...","[""arXiv:2310.00562"", ""arXiv:2310.03736"", ""arXi...",[],E-4073-2016
2,AAY-5210-2020,"CHIRU, Costin Gabriel","['NLP', 'Machine learning', 'Artificial intell...","[""arXiv:2310.02357"", ""arXiv:2310.14261"", ""arXi...",[],O-4984-2014
3,C-5751-2009,"CIRTOAJE, Cristina","['Liquid crystal', 'Liquid crystals, polarised...","[""arXiv:2310.04022"", ""arXiv:2310.10524"", ""arXi...",[],G-5326-2016
4,O-4984-2014,"DASCALU, Mihai","['NLP', 'Discourse analysis', 'Learning analyt...","[""arXiv:2310.02357"", ""arXiv:2310.00603"", ""arXi...",[],AAY-5210-2020


In [24]:
def assign_weights(interests, initial_weight=0.9, decay=0.8):
    """Pair each interest with a geometrically decreasing weight.

    The first interest receives ``initial_weight``; every following interest
    receives the previous (unrounded) weight multiplied by ``decay``. Each
    stored weight is rounded to three decimals.

    Parameters
    ----------
    interests : iterable
        Interests in priority order (most important first).
    initial_weight : float, optional
        Weight given to the first interest. Defaults to 0.9.
    decay : float, optional
        Factor applied to the weight after each interest. Defaults to 0.8.

    Returns
    -------
    list of tuple
        ``(interest, weight)`` pairs in the same order as ``interests``;
        an empty list when ``interests`` is empty.
    """
    weights = []
    weight = initial_weight

    for interest in interests:
        weights.append((interest, round(weight, 3)))
        # decay the running (unrounded) value so rounding errors do not compound
        weight *= decay

    return weights

In [25]:
def compute_relevance(article_subjects, researcher_interests):
    """Score an article against a researcher's weighted interests.

    ``researcher_interests`` is a sequence of ``(interest, weight)`` pairs; it
    is turned into a dict, so a repeated interest keeps its last weight.
    Every (subject, interest) pair that matches case-insensitively adds the
    interest's weight to the score. The subject "security" is additionally
    treated as a match for the interest "it security".

    Returns ``(relevance_score, matched_subjects)`` where ``matched_subjects``
    lists the matched interests (original spelling) in match order; the score
    is the integer 0 when nothing matched.
    """

    def _is_match(subject, interest):
        # lower-case both sides once, interest first as in the comparison below
        interest_lc = interest.lower()
        subject_lc = subject.lower()
        if interest_lc == subject_lc:
            return True
        return subject_lc == "security" and interest_lc == "it security"

    weight_by_interest = dict(researcher_interests)

    # subjects drive the outer iteration, interests the inner one
    hits = [
        (interest, weight)
        for subject in article_subjects
        for interest, weight in weight_by_interest.items()
        if _is_match(subject, interest)
    ]

    # accumulate with plain += in match order (not sum()) to keep the exact
    # floating-point result
    total = 0
    found = []
    for interest, weight in hits:
        total += weight
        found.append(interest)

    return total, found


In [26]:
# article id -> set of full names of the researchers it was recommended to
articles_recommended_to_researchers = defaultdict(set)
# researcher id -> list of (article_row, relevance_score, matched_subjects)
top_articles = defaultdict()
# researcher id -> zero-score filler entries, same tuple layout as top_articles
random_selection = defaultdict()
content_recom_filepath = os.path.join(os.getcwd(), "Content-Based_Recommendation_Results.md")

# Parse every article's subject list once up front; it does not depend on the
# researcher, so re-parsing it inside the researcher loop is wasted work.
parsed_articles = [
    (article, ast.literal_eval(article['subject_split']))
    for _, article in articles_df.iterrows()
]

with open(content_recom_filepath, 'w', encoding='utf-8') as file:

    for _, researcher in researchers_df.iterrows():

        researcher_id = researcher['Researcher ID']
        interests = ast.literal_eval(researcher['Expertise'])
        weighted_interests = assign_weights(interests)

        # compute relevance score of every article for the current researcher
        articles_relevance = []
        for article, article_subjects in parsed_articles:
            relevance_score, matched_subjects = compute_relevance(article_subjects, weighted_interests)
            articles_relevance.append((article, relevance_score, matched_subjects))

        # only keep articles with relevance_score > 0, best score first
        # (list.sort is stable, so ties keep their articles_df order)
        non_zero_articles = [art for art in articles_relevance if art[1] > 0]
        non_zero_articles.sort(key=lambda x: x[1], reverse=True)
        # select top fits
        top_articles[researcher_id] = non_zero_articles[:nr_of_recommendations]

        # select zero-score articles that aren't already among the top fits
        selected_articles = {art[0]['id'] for art in top_articles[researcher_id]}
        available_zero_score = [
            art for art in articles_relevance
            if art[1] == 0 and art[0]['id'] not in selected_articles
        ]

        # Randomly pick only as many fillers as are needed to reach
        # nr_of_recommendations, and never more than exist: random.sample
        # raises ValueError when asked for more items than the population holds.
        missing_articles_count = nr_of_recommendations - len(top_articles[researcher_id])
        sample_size = min(missing_articles_count, len(available_zero_score))
        random_selection[researcher_id] = random.sample(available_zero_score, sample_size)

        for article, _, _ in top_articles[researcher_id]:
            articles_recommended_to_researchers[article['id']].add(researcher['Full Name'])

        file.write("\n")
        file.write(f"# Researcher {researcher['Full Name']}\n")
        file.write(f"### **Expertise List:** {' | '.join(interests)}\n\n")
        file.write(f"### **Top recommendations** according to interests for **{researcher['Full Name']}**:\n")
        file.write("| Nr | ID | URL | Title | Relevance Score | Matched Subjects |\n")
        file.write("| --- | --- | --- | --- | --- | --- | \n")
        for row_nr, (article, relevance_score, matched_subjects) in enumerate(top_articles[researcher_id], start=1):
            file.write(f"| {row_nr} | {article['id']} | {article['url']} | {article['title']} | {relevance_score} | {matched_subjects} |\n")
        file.write("\n\n")

        if len(top_articles[researcher_id]) < nr_of_recommendations:
            file.write("## Unfortunately, in octomber and november of 2023 there have not been so many new papers in your domains of interest..  Here are a few randomly selected ones that you might enjoy:")

            file.write("\n")
            file.write("| Nr | ID | URL | Title | Subjects |\n")
            file.write("| --- | --- | --- | --- | --- | \n")

            # numbering continues after the top recommendations; the sample was
            # already sized so the total never exceeds nr_of_recommendations
            first_random_nr = len(top_articles[researcher_id]) + 1
            for row_nr, (article, _, _) in enumerate(random_selection[researcher_id], start=first_random_nr):
                file.write(f"| {row_nr} | {article['id']} | {article['url']} | {article['title']} | {article['subject_split']} | \n")

            file.write("\n\n")

# select only common recomm (articles recommended to at least two researchers);
# built once after the loop so it is defined even when there are no researchers
articles_recommended_to_multiple_researchers = {
    article_id: researchers
    for article_id, researchers in articles_recommended_to_researchers.items()
    if len(researchers) >= 2
}


### Save common interests

In [27]:
# common interests (helper for later debugging?)
# Written as utf-8 explicitly, like the markdown report: the platform default
# encoding may not be able to represent every researcher name.
with open('articles_recommended_to_multiple_researchers.txt', 'w', encoding='utf-8') as file:
    for article_id, researchers in articles_recommended_to_multiple_researchers.items():
        file.write(f"Article ID: {article_id}\n")
        file.write("Recommended to Researchers:\n")
        # names are held in a set; sort them so the file is identical across runs
        for researcher in sorted(researchers):
            file.write(f"- {researcher}\n")
        file.write("\n")

#### Update database columns

In [28]:
conn = psycopg2.connect(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port
)

# Values are passed as query parameters instead of being formatted into the SQL
# text, so quotes inside the JSON or the researcher id cannot break (or inject
# into) the statement. Both columns are written by a single UPDATE per row.
update_query = (
    'UPDATE researchers_table '
    'SET "Appreciated" = %s, "Random Recommendation" = %s '
    'WHERE "Researcher ID" = %s'
)

try:
    # cursor object; closed automatically when the with-block exits
    with conn.cursor() as cursor:
        # update the 'Appreciated' and 'Random Recommendation' columns in the 'researchers_table'
        for _, researcher in researchers_df.iterrows():
            researcher_id = researcher['Researcher ID']
            researcher_name = researcher['Full Name']

            # ids of the top recommendations recorded for this researcher
            recommended_articles = [
                article['id']
                for article, _, _ in top_articles[researcher_id]
                if researcher_name in articles_recommended_to_researchers[article['id']]
            ]
            recommended_articles_str = json.dumps(recommended_articles)

            # ids of the randomly selected filler articles
            random_articles = [article['id'] for article, _, _ in random_selection[researcher_id]]
            random_articles_str = json.dumps(random_articles)

            cursor.execute(update_query, (recommended_articles_str, random_articles_str, researcher_id))

    # commit only after every row was updated successfully
    conn.commit()
finally:
    # always release the connection; closing without a commit discards the
    # pending transaction, so a failure part-way leaves the table unchanged
    conn.close()