In [1]:
import bibtexparser
import requests
import pandas as pd
import json

with open('FL_publications.bib', encoding= "utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

entries = bib_database.entries

In [2]:
len(entries), entries

(129,
 [{'abstract': 'Glyphosate is the most widely used active ingredient (AI) in thousands of glyphosate-based herbicides (GBHs) worldwide. Short-term impacts of AIs or GBHs on earthworms are well known, but few studies have examined long-term legacy effects >3 months after application. In a pot experiment, we studied both short-term and long-term effects on deep burrowing earthworms (Lumbricus terrestris) and soil functions. Therefore, the cover crop Sinapis alba was grown in soils with either 3.0% or 4.1% soil organic matter content (SOM) and either sprayed with a GBH (Touchdown Quattro, Roundup PowerFlex, or Roundup LB Plus) or the respective glyphosate AI (diammonium-, potassium-, or isopropylamine-salt) or hand weeded (control). Long-term effects showed increased earthworm activity under GBHs even 4 months after application, but similar activity under AIs and control. Another application of the same treatments 5 months after the previous one also increased earthworm activity und

In [82]:
def fetch_abstract_crossref(doi):
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    data = response.json()
    info = {}
    if response.status_code == 200:  
        info["title"] = data["message"].get("title")[0]
        info["abstract"] = data["message"].get("abstract")
        info["authors"] = [a.get("given") + " " + a.get("family") for a in data["message"].get("author")]
        info["referencecount"] = data["message"].get("reference-count")
        info["journal"] = data["message"].get("container-title")[0]
        info["publisher"] = data["message"].get("publisher")
        info["doi"] = data["message"].get("DOI")
        info["url"] = data["message"].get("URL")
    return info

# Example usage
doi = "10.3390/soilsystems7030066"
infos = fetch_abstract_crossref(doi)
print(f"DOI: {doi} Info: {infos}")

DOI: 10.3390/soilsystems7030066 Info: {'title': 'Glyphosate-Based Herbicide Formulations with Greater Impact on Earthworms and Water Infiltration than Pure Glyphosate', 'abstract': '<jats:p>Glyphosate is the most widely used active ingredient (AI) in thousands of glyphosate-based herbicides (GBHs) worldwide. Short-term impacts of AIs or GBHs on earthworms are well known, but few studies have examined long-term legacy effects &gt;3 months after application. In a pot experiment, we studied both short-term and long-term effects on deep burrowing earthworms (Lumbricus terrestris) and soil functions. Therefore, the cover crop Sinapis alba was grown in soils with either 3.0% or 4.1% soil organic matter content (SOM) and either sprayed with a GBH (Touchdown Quattro, Roundup PowerFlex, or Roundup LB Plus) or the respective glyphosate AI (diammonium-, potassium-, or isopropylamine-salt) or hand weeded (control). Long-term effects showed increased earthworm activity under GBHs even 4 months afte

In [83]:
data = []
for i, entry in enumerate(entries):
    doi = entry.get("doi")
    print(f"Fetching info for {i} DOI: {doi}")
    if doi:
        infos = fetch_abstract_crossref(doi)
        data.append(infos)
    else:
        print(f"DOI not found in entry: {entry}")

data

entries[4]

Fetching info for 0 DOI: 10.3390/soilsystems7030066
Fetching info for 1 DOI: 10.1007/s11116-021-10221-4
Fetching info for 2 DOI: 10.1002/bimj.202200104
Fetching info for 3 DOI: 10.3390/agriculture12060879
Fetching info for 4 DOI: 10.1186/s12302-022-00622-2
Fetching info for 5 DOI: 10.7717/peerj.11309
Fetching info for 6 DOI: 10.1371/journal.pone.0249082
Fetching info for 7 DOI: 10.1016/j.ces.2021.116497
Fetching info for 8 DOI: 10.1186/s12302-021-00492-0
Fetching info for 9 DOI: 10.1007/s00357-019-09328-2
Fetching info for 10 DOI: 10.1038/s41598-020-68323-5
Fetching info for 11 DOI: 10.1093/femsyr/foaa001
Fetching info for 12 DOI: 10.1007/s11356-020-08213-5
Fetching info for 13 DOI: 10.3390/a12090177
Fetching info for 14 DOI: 10.1016/j.trb.2019.09.010
Fetching info for 15 DOI: 10.1002/biot.201800521
Fetching info for 16 DOI: 10.1002/bit.26984
Fetching info for 17 DOI: 10.1016/j.annals.2018.10.007
Fetching info for 18 DOI: 10.1007/978-981-10-8818-6
Fetching info for 19 DOI: 10.18637/jss

{'doi': '10.1186/s12302-022-00622-2',
 'pages': '1--20',
 'number': '44',
 'volume': '34',
 'year': '2022',
 'journal': 'Environmental Sciences Europe',
 'title': 'Reducing Overall Herbicide Use May Reduce Risks to\nHumans but Increase Toxic Loads to Honeybees,\nEarthworms and Birds',
 'author': 'Ramona M. Cech1 and Suzanne Jovanovic and Susan\nKegley and Koen Hertoge and Friedrich Leisch and\nJohann G. Zaller',
 'ENTRYTYPE': 'article',
 'ID': 'fl-papers:Cech1+Jovanovic+Kegley:2022'}

In [87]:
df = pd.DataFrame(data)

In [95]:
with open("mydata.json", "w") as final:
    json.dump(data, final)

In [74]:
doi = "10.1186/gb-2004-5-10-r80"
url = f"https://api.crossref.org/works/{doi}"
response = requests.get(url)
data = response.json()

In [75]:
data["message"]

{'indexed': {'date-parts': [[2024, 6, 19]],
  'date-time': '2024-06-19T14:00:50Z',
  'timestamp': 1718805650529},
 'reference-count': 0,
 'publisher': 'Springer Science and Business Media LLC',
 'issue': '10',
 'content-domain': {'domain': [], 'crossmark-restriction': False},
 'short-container-title': ['Genome Biol'],
 'published-print': {'date-parts': [[2004]]},
 'DOI': '10.1186/gb-2004-5-10-r80',
 'type': 'journal-article',
 'created': {'date-parts': [[2004, 9, 16]],
  'date-time': '2004-09-16T06:23:38Z',
  'timestamp': 1095315818000},
 'page': 'R80',
 'source': 'Crossref',
 'is-referenced-by-count': 9555,
 'title': [],
 'prefix': '10.1186',
 'volume': '5',
 'author': [{'given': 'Robert C',
   'family': 'Gentleman',
   'sequence': 'first',
   'affiliation': []},
  {'given': 'Vincent J',
   'family': 'Carey',
   'sequence': 'additional',
   'affiliation': []},
  {'given': 'Douglas M',
   'family': 'Bates',
   'sequence': 'additional',
   'affiliation': []},
  {'given': 'Ben',
   'fami

In [29]:
data["message"].get("abstract"), data["message"].get("title")[0], data["message"].get("author"), data["message"].get("container-title")[0], data["message"].get("DOI"), data["message"].get("reference-count"), data["message"].get("publisher"), data["message"].get("URL")

('<jats:p>Glyphosate is the most widely used active ingredient (AI) in thousands of glyphosate-based herbicides (GBHs) worldwide. Short-term impacts of AIs or GBHs on earthworms are well known, but few studies have examined long-term legacy effects &gt;3 months after application. In a pot experiment, we studied both short-term and long-term effects on deep burrowing earthworms (Lumbricus terrestris) and soil functions. Therefore, the cover crop Sinapis alba was grown in soils with either 3.0% or 4.1% soil organic matter content (SOM) and either sprayed with a GBH (Touchdown Quattro, Roundup PowerFlex, or Roundup LB Plus) or the respective glyphosate AI (diammonium-, potassium-, or isopropylamine-salt) or hand weeded (control). Long-term effects showed increased earthworm activity under GBHs even 4 months after application, but similar activity under AIs and control. Another application of the same treatments 5 months after the previous one also increased earthworm activity under GBHs, 

In [33]:
info = {}
info["title"] = data["message"].get("title")[0]
info["abstract"] = data["message"].get("abstract")
info["authors"] = [a.get("given") + " " + a.get("family") for a in data["message"].get("author")]
info["referencecount"] = data["message"].get("reference-count")
info["journal"] = data["message"].get("container-title")[0]
info["publisher"] = data["message"].get("publisher")
info["doi"] = data["message"].get("DOI")
info["url"] = data["message"].get("URL")

['Verena Brandmaier',
 'Anna Altmanninger',
 'Friedrich Leisch',
 'Edith Gruber',
 'Eszter Takács',
 'Mária Mörtl',
 'Szandra Klátyik',
 'András Székács',
 'Johann G. Zaller']

In [13]:
def fetch_metadata_by_author(author):
    url = f"https://api.crossref.org/works?query.author={author}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        items = data['message']
        if items:
            return items  # Return the first match
        else:
            return "No matching articles found"
    return None

# Example usage
author = "Friedrich Leisch"
url = f"https://api.crossref.org/works?query.author={author}&filter=type:journal-article"
print(url)
metadata = fetch_metadata_by_author(author)
print(metadata)

https://api.crossref.org/works?query.author=Friedrich Leisch&filter=type:journal-article
{'facets': {}, 'total-results': 105085, 'items': [{'indexed': {'date-parts': [[2024, 4, 27]], 'date-time': '2024-04-27T23:38:57Z', 'timestamp': 1714261137981}, 'publisher-location': 'Berlin, Heidelberg', 'reference-count': 27, 'publisher': 'Springer Berlin Heidelberg', 'isbn-type': [{'value': '9783540330363', 'type': 'print'}, {'value': '9783540330370', 'type': 'electronic'}], 'license': [{'start': {'date-parts': [[2008, 1, 1]], 'date-time': '2008-01-01T00:00:00Z', 'timestamp': 1199145600000}, 'content-version': 'tdm', 'delay-in-days': 0, 'URL': 'http://www.springer.com/tdm'}], 'content-domain': {'domain': [], 'crossmark-restriction': False}, 'published-print': {'date-parts': [[2008]]}, 'DOI': '10.1007/978-3-540-33037-0_22', 'type': 'book-chapter', 'created': {'date-parts': [[2007, 12, 17]], 'date-time': '2007-12-17T08:28:58Z', 'timestamp': 1197880138000}, 'page': '561-587', 'source': 'Crossref', '

In [203]:
' and '.join(df.iloc[114]["authors"])

'Angela Heiden and Petra Schüssler and Ulrike Itzlinger and Friedrich Leisch and Joachim Scharfetter and Christian Gebhardt and Karoline Fuchs and Matthäus Willeit and Linda Nilsson and Ellenore Miller-Reiter and Thomas Stompe and Kurt Meszaros and Werner Sieghart and Kurt Hornik and Siegfried Kasper and Harald N. Aschauer'

In [2]:
import bibtexparser
import requests
import pandas as pd
import json
import numpy as np

with open('FL_publications.bib', encoding= "utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

entries = bib_database.entries
len(entries), entries

(129,
 [{'abstract': 'Glyphosate is the most widely used active ingredient (AI) in thousands of glyphosate-based herbicides (GBHs) worldwide. Short-term impacts of AIs or GBHs on earthworms are well known, but few studies have examined long-term legacy effects >3 months after application. In a pot experiment, we studied both short-term and long-term effects on deep burrowing earthworms (Lumbricus terrestris) and soil functions. Therefore, the cover crop Sinapis alba was grown in soils with either 3.0% or 4.1% soil organic matter content (SOM) and either sprayed with a GBH (Touchdown Quattro, Roundup PowerFlex, or Roundup LB Plus) or the respective glyphosate AI (diammonium-, potassium-, or isopropylamine-salt) or hand weeded (control). Long-term effects showed increased earthworm activity under GBHs even 4 months after application, but similar activity under AIs and control. Another application of the same treatments 5 months after the previous one also increased earthworm activity und

In [3]:
df2 = pd.DataFrame(entries)
df2

Unnamed: 0,abstract,publisherurl,doi,pages,number,volume,year,journal,title,author,...,preprinturl,submitted,bibdate,coden,month,day,http,isbn,series,ps
0,Glyphosate is the most widely used active ingr...,https://www.mdpi.com/2571-8789/7/3/66,10.3390/soilsystems7030066,1--18,3,7,2023,Soil Systems,Glyphosate-Based Herbicide Formulations with G...,Verena Brandmaier and Anna Altmanninger and Fr...,...,,,,,,,,,,
1,The value of travel time savings (VTTS) repres...,,10.1007/s11116-021-10221-4,1599--1625,,49,2022,Transportation,The Role of Unpaid Domestic Work in Explaining...,Simona Jokubauskaitė and Reinhard Hössinger an...,...,,,,,,,,,,
2,Many methodological comparison studies aim at ...,,10.1002/bimj.202200104,1--16,,,2022,Biometrical Journal,Against the ``One Method Fits All Data Sets'' ...,Carolin Strobl and Friedrich Leisch,...,,,,,,,,,,
3,The production of synthetic pesticides is ene...,,10.3390/agriculture12060879,1--16,879,12,2022,Agriculture,Pesticide Use and Associated Greenhouse Gas Em...,Ramona M. Cech and Friedrich Leisch and Johann...,...,,,,,,,,,,
4,Background Pesticide use has been associated w...,,10.1186/s12302-022-00622-2,1--20,44,34,2022,Environmental Sciences Europe,Reducing Overall Herbicide Use May Reduce Risk...,Ramona M. Cech and Suzanne Jovanovic and Susan...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,"We introduce arc-Ih, a new algorithm for impro...",,,522--528,,9,1997,,{ARC-LH}: A New Adaptive Resampling Algorithm ...,Friedrich Leisch and Kurt Hornik,...,,,,,,,,,,
125,In this paper we investigate several ways of u...,,,1--14,,,1997,,Error-Dependent Resampling for Artificial Neur...,Friedrich Leisch and Kurt Hornik,...,,,,,,,,,,http://homepage.boku.ac.at/leisch/papers/Leisc...
126,We show that error correcting output codes (EC...,,,266--269,,,1997,,Combining Neural Network Voting Classifiers an...,Friedrich Leisch and Kurt Hornik,...,,,,,,,,,,http://homepage.boku.ac.at/leisch/papers/Leisc...
127,"This paper introduces Bayesian classification,...",,,1--1,,,1995,,Performance Measures for {NN} Classifiers,Friedrich Leisch and Lakhmi C. Jain and Kurt H...,...,,,,,,,,,,http://homepage.boku.ac.at/leisch/papers/Leisc...


In [5]:
authors = [d.split(" and ") for d in df2["author"]]
all_authors_set = set(sum(authors, []))
all_authors_set, len(all_authors_set)

({'A. Konstantinidis',
  'Achim Zeileis',
  'Adrian Trapletti',
  'Ahmet Yildiz',
  'Ajaykumar Pradhan',
  'Albert Kriegner',
  'Alexander Neumeister',
  'Alexander Yassouridis',
  'Alexandra B Graf',
  'Alexandra Schosser',
  'Alexandra Strnad',
  'Alexandros Karatzoglou',
  'Alois Geyer',
  'Alois Jungbauer',
  'Alyssa Schneebaum',
  'Amata Ring',
  'Andreas Beyerlein',
  'Andreas Tockner',
  'Andreas Weingessel',
  'Andreas Weinhaeusel',
  'András Székács',
  'Anette-Gabriele Ziegler',
  'Angela Heiden',
  'Angela Schneider',
  'Anja Puklowski',
  'Anna Altmanninger',
  'Anne-Luise Tscheliessnig',
  'Anthony J Rossini',
  'Armin Monecke',
  'Arne Nothdurft',
  'Arthur Filippidis',
  'Astrid Dürauer',
  'Astrid Hrdina',
  'Axel Steiger',
  'B Bondy',
  'B. Mitterauer',
  'Basil Schmid',
  'Ben Bolstad',
  'Bernhard Andreas Hrobath',
  'Bernhard Hrobath',
  'Bernhard Spangl',
  'Bettina Grün',
  'Bodo Hattendorf',
  'Brigitte Gasser',
  'Byron Ellis',
  'Carolin Strobl',
  'Cheng Li',

In [6]:
res = []
for article in authors:
    res.append([sorted(list(all_authors_set)).index(author) for author in article])
res    

a = np.zeros([129, 272])
# now we fill 1 on positions from res
for i, article in enumerate(res):
    a[i, article] = 1
dff = pd.DataFrame(a)
dff.columns = sorted(list(all_authors_set))
dff

Unnamed: 0,A. Konstantinidis,Achim Zeileis,Adrian Trapletti,Ahmet Yildiz,Ajaykumar Pradhan,Albert Kriegner,Alexander Neumeister,Alexander Yassouridis,Alexandra B Graf,Alexandra Schosser,...,Verena Dorn,Vincent J Carey,Weksi Budiaji,Werner Sieghart,Wiebke Unbehaun,Wolfgang Huber,Wolfgang Maier,Wolfgang Steiner,Xavier Boivin,Yongchao Ge
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_final = pd.concat([df2[["abstract", "pages", "year", "journal", "title", "doi"]], dff], axis=1)
pag = df_final["pages"].str.split("--", expand=True)
df_final["pages"] = pag[1].astype(int) - pag[0].astype(int)

In [8]:
df_final

Unnamed: 0,abstract,pages,year,journal,title,doi,A. Konstantinidis,Achim Zeileis,Adrian Trapletti,Ahmet Yildiz,...,Verena Dorn,Vincent J Carey,Weksi Budiaji,Werner Sieghart,Wiebke Unbehaun,Wolfgang Huber,Wolfgang Maier,Wolfgang Steiner,Xavier Boivin,Yongchao Ge
0,Glyphosate is the most widely used active ingr...,17,2023,Soil Systems,Glyphosate-Based Herbicide Formulations with G...,10.3390/soilsystems7030066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The value of travel time savings (VTTS) repres...,26,2022,Transportation,The Role of Unpaid Domestic Work in Explaining...,10.1007/s11116-021-10221-4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Many methodological comparison studies aim at ...,15,2022,Biometrical Journal,Against the ``One Method Fits All Data Sets'' ...,10.1002/bimj.202200104,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The production of synthetic pesticides is ene...,15,2022,Agriculture,Pesticide Use and Associated Greenhouse Gas Em...,10.3390/agriculture12060879,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Background Pesticide use has been associated w...,19,2022,Environmental Sciences Europe,Reducing Overall Herbicide Use May Reduce Risk...,10.1186/s12302-022-00622-2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,"We introduce arc-Ih, a new algorithm for impro...",6,1997,,{ARC-LH}: A New Adaptive Resampling Algorithm ...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,In this paper we investigate several ways of u...,13,1997,,Error-Dependent Resampling for Artificial Neur...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,We show that error correcting output codes (EC...,3,1997,,Combining Neural Network Voting Classifiers an...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,"This paper introduces Bayesian classification,...",0,1995,,Performance Measures for {NN} Classifiers,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

sentences = ['That is a happy person', 'That is a very happy person']

model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
embeddings = model.encode(sentences)
embeddings

array([[ 0.23888056, -0.1409193 , -0.22524223, ...,  0.15000606,
         0.06842397, -0.55839574],
       [ 0.05154533,  0.04116291,  0.14477819, ..., -0.11265965,
         0.2396578 , -0.80652076]], dtype=float32)

In [21]:
df_final["GTE_embeddings"] =  model.encode(df_final["abstract"]).tolist()

In [34]:
from openai import OpenAI
client = OpenAI(api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

def get_embedding(text, model="text-embedding-3-large"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [38]:
df_final['OpenAI_embeddings'] = df_final['abstract'].apply(get_embedding)

In [40]:
df_final

Unnamed: 0,abstract,pages,year,journal,title,doi,A. Konstantinidis,Achim Zeileis,Adrian Trapletti,Ahmet Yildiz,...,Weksi Budiaji,Werner Sieghart,Wiebke Unbehaun,Wolfgang Huber,Wolfgang Maier,Wolfgang Steiner,Xavier Boivin,Yongchao Ge,GTE_embeddings,OpenAI_embeddings
0,Glyphosate is the most widely used active ingr...,17,2023,Soil Systems,Glyphosate-Based Herbicide Formulations with G...,10.3390/soilsystems7030066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.29964905977249146, 0.03478763625025749, 0.0...","[-0.00028995011234655976, -0.01695988513529300..."
1,The value of travel time savings (VTTS) repres...,26,2022,Transportation,The Role of Unpaid Domestic Work in Explaining...,10.1007/s11116-021-10221-4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.44411033391952515, -0.06967713683843613, 0....","[-0.023544123396277428, -0.01776915043592453, ..."
2,Many methodological comparison studies aim at ...,15,2022,Biometrical Journal,Against the ``One Method Fits All Data Sets'' ...,10.1002/bimj.202200104,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.7856029868125916, -0.6793034672737122, -0.4...","[0.014711462892591953, 0.022615734487771988, -..."
3,The production of synthetic pesticides is ene...,15,2022,Agriculture,Pesticide Use and Associated Greenhouse Gas Em...,10.3390/agriculture12060879,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.18162254989147186, 0.22526495158672333, 0....","[-0.02674596570432186, -0.011602101847529411, ..."
4,Background Pesticide use has been associated w...,19,2022,Environmental Sciences Europe,Reducing Overall Herbicide Use May Reduce Risk...,10.1186/s12302-022-00622-2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.2197885811328888, 0.5345749855041504, 1.00...","[-0.023423321545124054, -0.028338994830846786,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,"We introduce arc-Ih, a new algorithm for impro...",6,1997,,{ARC-LH}: A New Adaptive Resampling Algorithm ...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0052680969238281, 0.04188179224729538, -0.1...","[-0.043554987758398056, -0.007337905932217836,..."
125,In this paper we investigate several ways of u...,13,1997,,Error-Dependent Resampling for Artificial Neur...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.5336999297142029, 0.664895236492157, 0.5823...","[-0.03780759871006012, 0.01104810368269682, -0..."
126,We show that error correcting output codes (EC...,3,1997,,Combining Neural Network Voting Classifiers an...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.8301132321357727, -0.5021154284477234, 0.73...","[-0.0037767274770885706, 0.018803590908646584,..."
127,"This paper introduces Bayesian classification,...",0,1995,,Performance Measures for {NN} Classifiers,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.8892611265182495, -0.21318231523036957, 0.0...","[-0.024031737819314003, 0.023857232183218002, ..."


In [41]:
df_final.to_pickle("df_final.pkl")

In [44]:
dd = df_final["abstract"]
dd.to_csv("abstracts.csv", index=False)

### Update

In [1]:
import pandas

df_final = pandas.read_pickle("df_final.pkl")

In [2]:
df_final

Unnamed: 0,abstract,pages,year,journal,title,doi,A. Konstantinidis,Achim Zeileis,Adrian Trapletti,Ahmet Yildiz,...,Weksi Budiaji,Werner Sieghart,Wiebke Unbehaun,Wolfgang Huber,Wolfgang Maier,Wolfgang Steiner,Xavier Boivin,Yongchao Ge,GTE_embeddings,OpenAI_embeddings
0,Glyphosate is the most widely used active ingr...,17,2023,Soil Systems,Glyphosate-Based Herbicide Formulations with G...,10.3390/soilsystems7030066,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.29964905977249146, 0.03478763625025749, 0.0...","[-0.00028995011234655976, -0.01695988513529300..."
1,The value of travel time savings (VTTS) repres...,26,2022,Transportation,The Role of Unpaid Domestic Work in Explaining...,10.1007/s11116-021-10221-4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.44411033391952515, -0.06967713683843613, 0....","[-0.023544123396277428, -0.01776915043592453, ..."
2,Many methodological comparison studies aim at ...,15,2022,Biometrical Journal,Against the ``One Method Fits All Data Sets'' ...,10.1002/bimj.202200104,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.7856029868125916, -0.6793034672737122, -0.4...","[0.014711462892591953, 0.022615734487771988, -..."
3,The production of synthetic pesticides is ene...,15,2022,Agriculture,Pesticide Use and Associated Greenhouse Gas Em...,10.3390/agriculture12060879,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.18162254989147186, 0.22526495158672333, 0....","[-0.02674596570432186, -0.011602101847529411, ..."
4,Background Pesticide use has been associated w...,19,2022,Environmental Sciences Europe,Reducing Overall Herbicide Use May Reduce Risk...,10.1186/s12302-022-00622-2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.2197885811328888, 0.5345749855041504, 1.00...","[-0.023423321545124054, -0.028338994830846786,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,"We introduce arc-Ih, a new algorithm for impro...",6,1997,,{ARC-LH}: A New Adaptive Resampling Algorithm ...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1.0052680969238281, 0.04188179224729538, -0.1...","[-0.043554987758398056, -0.007337905932217836,..."
125,In this paper we investigate several ways of u...,13,1997,,Error-Dependent Resampling for Artificial Neur...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.5336999297142029, 0.664895236492157, 0.5823...","[-0.03780759871006012, 0.01104810368269682, -0..."
126,We show that error correcting output codes (EC...,3,1997,,Combining Neural Network Voting Classifiers an...,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.8301132321357727, -0.5021154284477234, 0.73...","[-0.0037767274770885706, 0.018803590908646584,..."
127,"This paper introduces Bayesian classification,...",0,1995,,Performance Measures for {NN} Classifiers,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.8892611265182495, -0.21318231523036957, 0.0...","[-0.024031737819314003, 0.023857232183218002, ..."


In [17]:
from openai import OpenAI
client = OpenAI(api_key="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

def get_embedding(text, model="text-embedding-3-large"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model, dimensions = 256).data[0].embedding


In [6]:
df_final['OpenAI_embeddings512'] = df_final['abstract'].apply(get_embedding)

In [18]:
df_final['OpenAI_embeddings256'] = df_final['abstract'].apply(get_embedding)

In [20]:
len(df_final['OpenAI_embeddings256'][0])

256

In [21]:
df_final.to_pickle("df_final.pkl")