In [19]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pprint

In [2]:
# Read the TSV files into pandas DataFrames
df_citations = pd.read_csv("doi_metadata.tsv", sep="\t")
df_metadata = pd.read_csv("pdf_metadata.tsv", sep="\t")


In [3]:
# Combine title and author information into a single field for both datasets
df_citations['title_author'] = df_citations['Title'] + ' ' + df_citations['Authors']
df_metadata['title_author'] = df_metadata['Title'] + ' ' + df_metadata['Author']


In [4]:
# Preprocess the combined field
def preprocess_text(text):
    if pd.isnull(text):  # Handle NaN values
        return ''
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text.split()  # Tokenize


In [5]:
df_citations['title_author_tokens'] = df_citations['title_author'].apply(preprocess_text)
df_metadata['title_author_tokens'] = df_metadata['title_author'].apply(preprocess_text)


In [6]:
df_citations

Unnamed: 0,DOI,Updated DOI,Title,Journal,Year,Year Type,Publisher,Authors,Volume,Issue,Pages,DOI URL,title_author,title_author_tokens
0,10.1128/mSystems.00045-18,,Exploration of the Biosynthetic Potential of t...,mSystems,2018,published-print,American Society for Microbiology,Patricia M. Blair; Miriam L. Land; Marek J. Pi...,3.0,5,,https://doi.org/10.1128/msystems.00045-18,Exploration of the Biosynthetic Potential of t...,"[exploration, of, the, biosynthetic, potential..."
1,10.1101/2022.12.12.520098,10.3389/fsoil.2023.1120425,One thousand soils for molecular understanding...,Frontiers in Soil Science,2023,published,Frontiers Media SA,Maggie M. Bowman; Alexis E. Heath; Tamas Varga...,3.0,,,https://doi.org/10.3389/fsoil.2023.1120425,One thousand soils for molecular understanding...,"[one, thousand, soils, for, molecular, underst..."
2,10.1038/s41564-022-01266-x,,Standardized multi-omics of Earth’s microbiome...,Nature Microbiology,2022,published,Springer Science and Business Media LLC,Justin P. Shaffer; Louis-Félix Nothias; Luke R...,7.0,12,2128-2150,https://doi.org/10.1038/s41564-022-01266-x,Standardized multi-omics of Earth’s microbiome...,"[standardized, multiomics, of, earth’s, microb..."
3,10.1111/1462-2920.16314,,Environmental predictors of electroactive bact...,Environmental Microbiology,2023,published-print,Wiley,Charles N. Olmsted; Roger Ort; Patricia Q. Tra...,25.0,3,705-720,https://doi.org/10.1111/1462-2920.16314,Environmental predictors of electroactive bact...,"[environmental, predictors, of, electroactive,..."
4,10.1111/mec.16891,,"Diversity, distribution, and expression of ops...",Molecular Ecology,2023,published-print,Wiley,Shaomei He; Alexandra M. Linz; Sarah L. R. Ste...,32.0,11,2798-2817,https://doi.org/10.1111/mec.16891,"Diversity, distribution, and expression of ops...","[diversity, distribution, and, expression, of,..."
5,10.1021/acs.estlett.0c00748,,Long-Term Warming Decreases Redox Capacity of ...,Environmental Science &amp; Technology Letters,2021,published-print,American Chemical Society (ACS),Rachelle E. LaCroix; Nicolas Walpen; Michael S...,8.0,1,92-97,https://doi.org/10.1021/acs.estlett.0c00748,Long-Term Warming Decreases Redox Capacity of ...,"[longterm, warming, decreases, redox, capacity..."
6,10.1128/msystems.00768-19,,Complementary Metagenomic Approaches Improve R...,mSystems,2020,published-print,American Society for Microbiology,L. V. Alteio; F. Schulz; R. Seshadri; N. Vargh...,5.0,2,,https://doi.org/10.1128/msystems.00768-19,Complementary Metagenomic Approaches Improve R...,"[complementary, metagenomic, approaches, impro..."
7,10.1371/journal.pone.0228165,,Distinct temporal diversity profiles for nitro...,PLOS ONE,2020,published,Public Library of Science (PLoS),William C. Nelson; Emily B. Graham; Alex R. Cr...,15.0,1,e0228165,https://doi.org/10.1371/journal.pone.0228165,Distinct temporal diversity profiles for nitro...,"[distinct, temporal, diversity, profiles, for,..."
8,10.1016/j.geoderma.2021.115674,,Soil pore network response to freeze-thaw cycl...,Geoderma,2022,published-print,Elsevier BV,Erin C. Rooney; Vanessa L. Bailey; Kaizad F. P...,411.0,,115674,https://doi.org/10.1016/j.geoderma.2021.115674,Soil pore network response to freeze-thaw cycl...,"[soil, pore, network, response, to, freezethaw..."
9,10.1029/2022JG006889,,The Impact of Freeze‐Thaw History on Soil Carb...,Journal of Geophysical Research: Biogeosciences,2022,published-print,American Geophysical Union (AGU),Erin C. Rooney; Vanessa L. Bailey; Kaizad F. P...,127.0,5,,https://doi.org/10.1029/2022jg006889,The Impact of Freeze‐Thaw History on Soil Carb...,"[the, impact, of, freeze‐thaw, history, on, so..."


In [7]:
df_metadata

Unnamed: 0,Filename,DOI,Year,Volume,Pages,Title,Author,Subject,Keywords,Creation Date,Modification Date,Producer,Creator,title_author,title_author_tokens
0,acp-23-15783-2023.pdf,,,,,,,,,2023-12-22,2023-12-22,pdfTeX-1.40.19,copernicus.cls Version 2023/11/02 10.1.11 Cope...,,
1,bell-et-al-2020-metatranscriptomic-sequencing-...,10.1128/mra.01361-19,2020.0,9.0,10,Metatranscriptomic Sequencing of a Cyanobacter...,"Terrence H. Bell, Ryan V. Trexler, Xin Peng, M...",Microbiol Resour Announc 2020.9:10.1128/mra.01...,,2019-12-10,2025-02-10,Adobe LiveCycle PDF Generator; modified using ...,XPP,Metatranscriptomic Sequencing of a Cyanobacter...,"[metatranscriptomic, sequencing, of, a, cyanob..."
2,Soil_Bacterial_Diversity_Is_Positively_Correla...,,,,,Microsoft Word - microorganisms-1106083.docx,Admin,,,2021-02-11,2021-02-11,Acrobat Distiller 21.0 (Windows),PScript5.dll Version 5.2.2,Microsoft Word - microorganisms-1106083.docx A...,"[microsoft, word, microorganisms1106083docx, a..."
3,BISS_article_20637.pdf,,,,,,,description,keywords,,,WeasyPrint 0.31 (http://weasyprint.org/),,,
4,s40168-020-00889-8.pdf,10.1186/s40168-020-00889-8,,,,Influence of the polar light cycle on seasonal...,Pratibha Panwar,"Microbiome, 2020, doi:10.1186/s40168-020-00889-8","Antarctic microbiology,Polar light cycle,Metag...",2020-11-14,2020-11-15,Acrobat Distiller 10.0.0 (Windows); modified u...,Arbortext Advanced Print Publisher 9.1.440/W U...,Influence of the polar light cycle on seasonal...,"[influence, of, the, polar, light, cycle, on, ..."
5,sciadv.adg7888.pdf,,2024.0,10.0,eadg7888,Reproducible growth of Brachypodium in EcoFAB ...,,Sci. Adv. 2024.10:eadg7888,,2023-12-29,2024-08-22,Adobe PDF Library 15.0; modified using iText 4...,Adobe InDesign 16.2 (Windows),,
6,s41597-024-04013-5.pdf,10.1038/s41597-024-04013-5,,,,Microbial Metagenomes Across a Complete Phytop...,Brook L. Nunn,"Scientific Data, doi:10.1038/s41597-024-04013-5",,2025-01-15,2025-01-21,iText® 5.3.5 ©2000-2012 1T3XT BVBA (SPRINGER S...,Springer,Microbial Metagenomes Across a Complete Phytop...,"[microbial, metagenomes, across, a, complete, ..."
7,s41564-020-00861-0.pdf,10.1038/s41564-020-00861-0,,,,Genomic and functional analyses of fungal and ...,Xuefeng Peng,"Nature Microbiology, doi:10.1038/s41564-020-00...",,2021-03-24,2021-03-24,,Springer,Genomic and functional analyses of fungal and ...,"[genomic, and, functional, analyses, of, funga..."
8,41564_2019_Article_449.pdf,10.1038/s41564-019-0449-y,,,,Mediterranean grassland soil C–N compound turn...,Spencer Diamond,"Nature Microbiology, doi:10.1038/s41564-019-04...",,2019-07-17,2019-07-17,,Springer,Mediterranean grassland soil C–N compound turn...,"[mediterranean, grassland, soil, c–n, compound..."
9,gkab990.pdf,,,,,,,,,2021-12-27,2024-08-21,Acrobat Distiller 21.0 (Windows); modified usi...,XMLPublish v.3.0 Copyright 1991-2017 Aptara Inc.,,


In [8]:
# Create a TF-IDF vectorizer to convert the tokenized text into numerical vectors
vectorizer = TfidfVectorizer()


In [9]:
# Fit the vectorizer on the combined title_author_tokens from both DataFrames
vectorizer.fit(df_citations['title_author_tokens'].apply(lambda x: ' '.join(x)).tolist() +
              df_metadata['title_author_tokens'].apply(lambda x: ' '.join(x)).tolist())


In [10]:
# Transform the title_author_tokens into numerical vectors
citations_vectors = vectorizer.transform(df_citations['title_author_tokens'].apply(lambda x: ' '.join(x)))
metadata_vectors = vectorizer.transform(df_metadata['title_author_tokens'].apply(lambda x: ' '.join(x)))


In [11]:
# Calculate the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(citations_vectors, metadata_vectors)


In [12]:
# Find the maximum cosine similarity score for each entry in doi_metadata.tsv
max_sim_scores = cosine_sim_matrix.max(axis=1)


In [15]:
# Find the maximum cosine similarity score and its index for each entry in doi_metadata.tsv
max_sim_scores = cosine_sim_matrix.max(axis=1)
max_sim_indices = cosine_sim_matrix.argmax(axis=1)

In [20]:
# Create a list of dictionaries to store the results
results = []
for i, (score, idx) in enumerate(zip(max_sim_scores, max_sim_indices)):
    results.append({
        'Entry': i + 1,
        'DOI': df_citations.loc[i, 'DOI'],
        'Filename': df_metadata.loc[idx, 'Filename'],
        'Similarity Score': score
    })

In [22]:
df = pd.DataFrame(results)

In [23]:
df

Unnamed: 0,Entry,DOI,Filename,Similarity Score
0,1,10.1128/mSystems.00045-18,blair-et-al-2018-exploration-of-the-biosynthet...,0.948377
1,2,10.1101/2022.12.12.520098,2022.12.12.520098v2.full.pdf,0.161309
2,3,10.1038/s41564-022-01266-x,s41564-022-01266-x.pdf,0.187543
3,4,10.1111/1462-2920.16314,blair-et-al-2018-exploration-of-the-biosynthet...,0.08341
4,5,10.1111/mec.16891,file.pdf,0.098154
5,6,10.1021/acs.estlett.0c00748,alteio-et-al-2020-complementary-metagenomic-ap...,0.06763
6,7,10.1128/msystems.00768-19,alteio-et-al-2020-complementary-metagenomic-ap...,1.0
7,8,10.1371/journal.pone.0228165,file.pdf,1.0
8,9,10.1016/j.geoderma.2021.115674,1-s2.0-S0016706121007540-main.pdf,0.511187
9,10,10.1029/2022JG006889,1-s2.0-S0016706121007540-main.pdf,0.232726


In [24]:
df.to_csv("citations_to_metadata_mapping.tsv", sep="\t", index=False)