# JOSS Reviewer Matching 


Algorithm to match JOSS reviewers to submitted papers

In [None]:
import glob
from tqdm import tqdm 
from pdfminer.high_level import extract_text
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk 
nltk.download('stopwords')
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

`JOSS_PAPERS_DIR` = the directory where the JOSS paper PDFs are located. For example, in my JupyterHub the papers are in the projects folder, inside the joss-papers folder. Change this to match wherever the PDFs live in your own directory structure.

`JOSS_REVIEWERS_DB_FNAME` = the path to the spreadsheet containing the reviewers' information. Make sure to include the file extension (.xlsx) so the spreadsheet can be opened. Change this to match wherever your copy of the reviewer spreadsheet is stored.

`PAPER_OF_INTEREST_FNAME` = where the paper that is being matched to the reviewer is located. This will change depending on your location of the paper of interest.

__** This cell is the only one that should need adjusting! The rest of the cells just need to be run. **__



In [None]:
# Directory that is searched (via glob, in the import cell below) for JOSS paper PDFs.
JOSS_PAPERS_DIR = "./"
# Path to the reviewer spreadsheet (.xlsx). A local version is already in this
# repository. Download a new version at http://bit.ly/joss-reviewers and upload
# it to the JupyterLab session if you want the latest reviewer list.
JOSS_REVIEWERS_DB_FNAME = "./joss_reviewers.xlsx"
# Path to the PDF of the paper to be matched with reviewers. Download the
# Whedon-generated paper from the pre-review issue thread and upload it to the
# JupyterLab session.
# xxxxx = paper number found in the filename (be sure to include leading zero/s)
PAPER_OF_INTEREST_FNAME = "./10.21105.joss.xxxxx.pdf"

## 1. Import & extract papers


The papers were imported into a folder in jupyterhub from github. This needs to be done before extracting the pdf of the paper for review.

`ALL_JOSS_FNAME` = Find all of the pdfs of the JOSS papers within `JOSS_PAPERS_DIR`

`JOSS_PDF` = A list of all of the pdfs of JOSS papers

The pdfs of the papers are put into the JOSS_PDF list so their contents can be later extracted.

In [None]:
# Gather the paths of the JOSS paper PDFs, then extract the text of each one
# (tqdm shows a progress bar while the PDFs are being parsed).
# NOTE(review): glob is called without recursive=True, so "**" behaves like a
# plain "*" and only PDFs exactly one directory below JOSS_PAPERS_DIR are
# matched -- confirm that this matches the layout of your papers folder.
ALL_JOSS_FNAME = glob.glob(JOSS_PAPERS_DIR + '/**/*.pdf')
JOSS_PDF = list(map(extract_text, tqdm(ALL_JOSS_FNAME)))

`POI_PDF` = The paper of interest to be matched with a reviewer

In [None]:
# Extract the text of the paper of interest and wrap it in a one-element list,
# because the vectorizer used later is given a list of documents.
poi_text = extract_text(PAPER_OF_INTEREST_FNAME)
POI_PDF = [poi_text]

## 2. Import & extract reviewer information

The reviewers' information needs to be imported from the shared Google Sheet.

`JOSS_EXCEL` = Opens and reads the excel file of reviewers.

`JOSS_EXCEL_FINAL` = Replaces all of the NaNs with an empty string.

In [None]:
# Load the reviewer spreadsheet. read_excel takes the first sheet row as the
# column labels, but the labels used below ('username', 'Active reviews', ...)
# come from the following row, so that row is promoted to the header and
# sliced off the data.
JOSS_EXCEL = pd.read_excel(JOSS_REVIEWERS_DB_FNAME)
new_header = JOSS_EXCEL.iloc[0]
JOSS_EXCEL = JOSS_EXCEL[1:]
JOSS_EXCEL.columns = new_header
# NOTE(review): this cell used to call `JOSS_EXCEL.drop(index = 1)` here and
# throw the result away. DataFrame.drop returns a new frame unless
# inplace=True, so that call never changed the data (it could only raise
# KeyError on a sheet with no data rows). It has been removed rather than
# assigned, because assigning it would silently drop the first reviewer.

# Empty cells become '' so later string handling never sees NaN.
JOSS_EXCEL_FINAL = JOSS_EXCEL.replace(np.nan, '')

# Per-reviewer columns, kept as plain lists in spreadsheet row order.
USERNAME = JOSS_EXCEL_FINAL['username'].to_list()
ACTIVE_REVIEWS = JOSS_EXCEL_FINAL['Active reviews'].to_list()
LAST_QUARTER_REVIEWS = JOSS_EXCEL_FINAL['Review count(last quarter)'].to_list()
LAST_YEAR_REVIEWS = JOSS_EXCEL_FINAL['Review count(last year)'].to_list()
TOTAL_REVIEWS = JOSS_EXCEL_FINAL['Review count(all time)'].to_list()

In [None]:
# Free-text topic descriptions, one entry per reviewer. Every entry is coerced
# to str up front so a cell holding a number cannot break the string handling
# below (previously .replace() ran before the str() conversion).
REVIEWER_TOPICS = (
    JOSS_EXCEL_FINAL['Domains/topic areas you are comfortable reviewing']
    .astype(str)
    .to_list()
)

# Newlines, slashes, double quotes and ampersands are all treated as
# alternative separators and normalised to commas in a single pass.
_TOPIC_SEPARATORS = str.maketrans({'\n': ',', '/': ',', '"': ',', '&': ','})

REVIEWER_TOPICS_FINAL = []
for topic in REVIEWER_TOPICS:
    pieces = topic.translate(_TOPIC_SEPARATORS).split(",")
    REVIEWER_TOPICS_FINAL.extend(item.strip().lower() for item in pieces)

# Vocabulary used by reviewers to describe the work they are comfortable
# reviewing: the sorted, de-duplicated, non-empty terms. Empty strings are
# filtered out explicitly; the old `[1:]` slice assumed '' was always present
# and dropped a real term whenever it was not.
reviewer_vocab = np.unique([term for term in REVIEWER_TOPICS_FINAL if term])

## 3. Vectorize the words

Now the words from the PDFs will be vectorized using TF-IDF, which weights each word according to how often it appears in a document relative to the other documents. This needs to be done in order to match the paper to a reviewer.

In [None]:
# Build one TF-IDF row per reviewer from their topic description, with the
# columns restricted to the reviewer vocabulary.
# NOTE(review): with the default ngram_range the vectorizer emits single-word
# tokens, so multi-word vocabulary entries presumably never receive a non-zero
# weight -- confirm whether ngram_range should be widened.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', vocabulary = reviewer_vocab)
reviewer_vectors = tfidf_vectorizer.fit_transform(REVIEWER_TOPICS)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    column_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    column_names = tfidf_vectorizer.get_feature_names()

# Densify the sparse matrix so it can be wrapped in a DataFrame
# (rows = reviewers, columns = vocabulary terms).
reviewer_vectors_matrix = reviewer_vectors.todense()
reviewer_matrix_list = reviewer_vectors_matrix.tolist()
df_tfidf_reviewers = pd.DataFrame(reviewer_matrix_list, columns=column_names)

In [None]:
# Count reviewer-vocabulary terms in the paper of interest and in the JOSS
# papers. The vocabulary is fixed, so transform() can be called on the paper
# of interest before the vectorizer has been fitted.
cv = CountVectorizer(stop_words='english', vocabulary = reviewer_vocab)
poi_vector=cv.transform(POI_PDF)
joss_pdf_vector = cv.fit_transform(JOSS_PDF)

# Learn the IDF weights from the JOSS paper counts only.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(joss_pdf_vector)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    idf_index = cv.get_feature_names_out().tolist()
else:
    idf_index = cv.get_feature_names()

# One IDF weight per vocabulary term, plus a transposed (single-row) view.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=idf_index,columns=["idf_weights"])
df_idf_transform = df_idf.T

In [None]:
# Apply the IDF weights learned from the JOSS papers to the term counts of
# the paper of interest.
tfidf_vector=tfidf_transformer.transform(poi_vector)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

# df_tfidf has one row per vocabulary term; df_poi is its transpose, i.e. a
# single row (the paper) with one column per term.
df_tfidf = pd.DataFrame(tfidf_vector.T.todense(), columns=["tfidf"], index=feature_names)
df_poi = df_tfidf.T

## 4. Dot Product for top 10 reviewer suggestions


Now we will calculate the dot product to see how well each reviewer matches a specific paper. `df_tfidf_reviewers.iloc[0:x]` is the whole range of reviewers, and `df_poi.iloc[0]` is the paper in question.

The output of the cell below, `top_10_reviewer`, will be the recommended top 10 reviewers for the paper of interest.

In [None]:
# Score every reviewer against the paper: the dot product of each reviewer's
# TF-IDF row with the paper's TF-IDF row.
x = df_tfidf_reviewers.shape[0]
dot_prod = np.dot(df_tfidf_reviewers.iloc[:x], df_poi.iloc[0])
paper_used = list(dot_prod)

# Put the scores next to the reviewer details taken from the spreadsheet.
paper_of_interest = {
    'Dot Product': paper_used,
    'Username': USERNAME,
    'Topics': REVIEWER_TOPICS,
    'Active Reviews': ACTIVE_REVIEWS,
    'Last Quarter Reviews': LAST_QUARTER_REVIEWS,
    'Last Year Reviews': LAST_YEAR_REVIEWS,
    'Total Reviews': TOTAL_REVIEWS,
}
df_paper_of_interest = pd.DataFrame(paper_of_interest)

# Keep the ten highest-scoring reviewers; the bare name on the last line
# makes the notebook display the table.
top_10_reviewer = df_paper_of_interest.nlargest(10, 'Dot Product')
top_10_reviewer