In [1]:
import requests
import json
import os
import asyncio
import httpx
import pandas as pd

In [2]:
# Read the seed work records from disk; the last expression displays how many
# records were loaded.
with open('improved_refrences.json', 'r', encoding='utf-8') as refs_file:
    first_data = json.loads(refs_file.read())
len(first_data)

122

In [3]:
# IDs of the works we already hold locally (None for a record without 'id').
existing_papers_id = list(map(lambda work: work.get('id'), first_data))
existing_papers_id[:5]

['https://openalex.org/W4396903381',
 'https://openalex.org/W4226546914',
 'https://openalex.org/W4362562906',
 'https://openalex.org/W4367182782',
 'https://openalex.org/W3176287975']

In [4]:
# Gather every work ID cited by the seed records. Duplicates are kept here and
# removed in a later cell.
# `or []` covers records whose 'referenced_works' is missing or null, which
# would otherwise make the collection step raise TypeError.
papers_id = [
    work_id
    for rec in first_data
    for work_id in (rec.get('referenced_works') or [])
]

len(papers_id)

10561

In [6]:
# Sanity check: how many collected IDs are not full https URLs?
len([item for item in papers_id if not item.startswith('https')])

0

In [7]:
# Collapse repeated citations so each referenced work appears once.
papers_id = {*papers_id}
len(papers_id)

6615

In [8]:
# Keep only the referenced works we do not already hold.
# set(...) is applied to papers_id itself so the cell can be re-run safely:
# after the first run papers_id is a list, and `list - set` raises TypeError.
# sorted() returns a list in a stable order, so later processing does not
# depend on per-process string-hash ordering of the set.
papers_id = sorted(set(papers_id).difference(existing_papers_id))
len(papers_id)

6570

In [4]:
# Probe the OpenAlex API with a single work ID to inspect the response shape.
# An explicit timeout is passed because requests waits indefinitely by default
# if the server stops responding.
temp_response = requests.get('https://api.openalex.org/W4396903381', timeout=30)
temp_response.status_code

200

In [5]:
# Raise on an HTTP error status before decoding, then parse the JSON body.
temp_response.raise_for_status()
temp_data = temp_response.json()

In [6]:
# List the top-level fields of the decoded response.
temp_data.keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'institution_assertions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'datasets', 'versions', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])

In [11]:
# Probe the OpenAlex API with a single 'A…' ID to inspect the response shape.
# An explicit timeout is passed because requests waits indefinitely by default
# if the server stops responding.
temp_response = requests.get('https://api.openalex.org/A5060187904', timeout=30)
temp_response.status_code

200

In [12]:
# Raise on an HTTP error status before decoding, then parse the JSON body.
temp_response.raise_for_status()
temp_data = temp_response.json()

In [13]:
# List the top-level fields of the decoded response.
temp_data.keys()

dict_keys(['id', 'orcid', 'display_name', 'display_name_alternatives', 'works_count', 'cited_by_count', 'summary_stats', 'ids', 'affiliations', 'last_known_institutions', 'topics', 'topic_share', 'x_concepts', 'counts_by_year', 'works_api_url', 'updated_date', 'created_date'])

In [100]:
# Pick a single seed record (index 56) for manual inspection in the next cells.
rec = first_data[56]

In [44]:
# List the top-level fields available on the sample record.
rec.keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'institution_assertions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'datasets', 'versions', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])

In [67]:
# Size, in characters, of the sample record's 'topics' field once serialised
# to JSON.
len(json.dumps(rec.get('topics')))

1227

In [101]:
# Pretty-print the sample record's 'grants' field to see its structure.
print(json.dumps(rec.get('grants'), indent=4))

[
    {
        "funder": "https://openalex.org/F4320319993",
        "funder_display_name": "Leverhulme Trust",
        "award_id": "ECF-2021-517"
    },
    {
        "funder": "https://openalex.org/F4320320006",
        "funder_display_name": "Royal Society",
        "award_id": "IEC\\NSFC\\223228"
    }
]


In [76]:
# Minimum 'score' a concept / topic must exceed to be kept in the clean record.
CONCEPT_SIMILARITY_THRESH = 0.5
TOPIC_SIMILARITY_THRESH = 0.8


def _ids_above(scored_items, threshold):
    """Return the 'id' of every item whose 'score' is strictly above `threshold`.

    Args:
        scored_items: list of dicts carrying 'id' and 'score', or None.
        threshold: exclusive lower bound for 'score'.

    Returns:
        List of 'id' values, in input order. Items whose score is missing or
        null are skipped, because comparing None with a number raises
        TypeError.
    """
    return [
        item.get('id')
        for item in (scored_items or [])
        if item.get('score') is not None and item.get('score') > threshold
    ]


def _build_paper_record(rec):
    """Reduce one raw work record to the trimmed structure stored in papers_data.

    Every nested lookup uses `x.get(key) or {}` / `or []` instead of
    `x.get(key, {})`: the `.get` default only applies when the key is absent,
    not when it is present with a null value, and a null nested object would
    otherwise raise AttributeError / TypeError.

    Args:
        rec: dict for a single work, as loaded from the seed JSON file.

    Returns:
        dict with the keys 'paper', 'authorship', 'source', 'referencedWorks',
        'relatedWorks', 'grants', 'concepts', 'keywords' and 'topics'.
    """
    primary_location = rec.get('primary_location') or {}
    source = primary_location.get('source') or {}
    biblio = rec.get('biblio') or {}

    return {
        'paper': {
            'id': rec.get('id'),
            'doi': rec.get('doi'),
            'title': rec.get('title'),
            'publicationYear': rec.get('publication_year'),
            'language': rec.get('language'),
            'type': rec.get('type'),
            'crossrefType': rec.get('type_crossref'),
            # NOTE(review): key is misspelt ("Citeation"); left unchanged so
            # files already written with this schema stay compatible.
            'numberOfCiteation': rec.get('cited_by_count'),
            'FieldWeightedCitationImpact': rec.get('fwci'),
            # Falls back to an empty list (not a dict) when absent or null.
            'countsByYear': rec.get('counts_by_year') or [],
            'pdfUrl': primary_location.get('pdf_url'),
        },

        'authorship': [
            {
                'authorPosition': authorship.get('author_position'),
                'author': {
                    'id': (authorship.get('author') or {}).get('id'),
                    'institutions': [
                        institution.get('id')
                        for institution in (authorship.get('institutions') or [])
                    ],
                    'isCorresponding': authorship.get('is_corresponding'),
                    'affiliations': authorship.get('raw_affiliation_strings') or [],
                },
            }
            for authorship in (rec.get('authorships') or [])
        ],

        'source': {
            # Both are None when the record has no primary location or source.
            'id': source.get('id'),
            'hostOrganizationId': source.get('host_organization'),
            'volume': biblio.get('volume'),
            'issue': biblio.get('issue'),
        },

        'referencedWorks': rec.get('referenced_works') or [],
        'relatedWorks': rec.get('related_works') or [],

        'grants': [
            {
                'funder': grant.get('funder'),
                'awardId': grant.get('award_id'),
            }
            for grant in (rec.get('grants') or [])
        ],

        'concepts': _ids_above(rec.get('concepts'), CONCEPT_SIMILARITY_THRESH),
        'keywords': [
            keyword.get('display_name')
            for keyword in (rec.get('keywords') or [])
        ],

        'topics': _ids_above(rec.get('topics'), TOPIC_SIMILARITY_THRESH),
    }


# One clean record per seed work, in the same order as first_data.
papers_data = [_build_paper_record(work) for work in first_data]

In [106]:
# Persist the cleaned records. The output directory is created first so the
# cell also works on a fresh checkout where 'clean_data/' does not exist yet.
os.makedirs('clean_data', exist_ok=True)
with open('clean_data/batch_1.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(papers_data, f, indent=4, ensure_ascii=False)

In [105]:
# Pretty-print the topic IDs kept for record 56. The fallback is an empty
# list so a missing key prints the same type ('topics' is a list).
print(json.dumps(papers_data[56].get('topics', []), indent=4))

[
    "https://openalex.org/T10586",
    "https://openalex.org/T11099",
    "https://openalex.org/T10191"
]


In [87]:
# Number of referenced works kept for record 116.
len(papers_data[116].get('referencedWorks'))

32

In [88]:
# Number of related works kept for record 116.
len(papers_data[116].get('relatedWorks'))

10

In [91]:
# Show the referenced-work IDs kept for record 116.
papers_data[116].get('referencedWorks')

['https://openalex.org/W2200971133',
 'https://openalex.org/W3039205832',
 'https://openalex.org/W2995010020',
 'https://openalex.org/W4386471863',
 'https://openalex.org/W4225754812',
 'https://openalex.org/W2563487472',
 'https://openalex.org/W4296079511',
 'https://openalex.org/W2975767248',
 'https://openalex.org/W4378833173',
 'https://openalex.org/W2101545882',
 'https://openalex.org/W4364322274',
 'https://openalex.org/W4385331824',
 'https://openalex.org/W3216774437',
 'https://openalex.org/W2084246939',
 'https://openalex.org/W2146404773',
 'https://openalex.org/W2155295407',
 'https://openalex.org/W2334470880',
 'https://openalex.org/W2079150870',
 'https://openalex.org/W1966312294',
 'https://openalex.org/W4310983610',
 'https://openalex.org/W4226051392',
 'https://openalex.org/W4214895168',
 'https://openalex.org/W2963914175',
 'https://openalex.org/W2905301806',
 'https://openalex.org/W4312731878',
 'https://openalex.org/W3169575318',
 'https://openalex.org/W4205342285',
 

In [89]:
# Show the related-work IDs kept for record 116.
papers_data[116].get('relatedWorks')

['https://openalex.org/W4360995134',
 'https://openalex.org/W4323768008',
 'https://openalex.org/W4248382324',
 'https://openalex.org/W3131574667',
 'https://openalex.org/W3023605104',
 'https://openalex.org/W2387529410',
 'https://openalex.org/W2383578611',
 'https://openalex.org/W2101105382',
 'https://openalex.org/W2039473718',
 'https://openalex.org/W1941703695']