Data Wrangling
----

# Outline

# Setup

In [1]:
import pandas as pd
import requests
import numpy as np
from time import sleep, time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from os import getcwd
# Report the kernel's current working directory; relative paths used in this
# notebook are resolved against it.
working_dir = getcwd()
print("Working directory: ", working_dir)

Working directory:  c:\Users\marks\coding\QualAbstracts\notebooks


# Filtering for Social Science and Humanities Articles

## Links to APIs
- Finding registration authority: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#4-which-ra
    - Note: the vast majority of academic articles are registered with Crossref
- Crossref lookup by doi: https://api.crossref.org/swagger-ui/index.html#/Works/get_works__doi_
    - Note: Crossref prefers that requests include a contact email address
    - Alternatively, download the full dataset (185 GB) as a torrent: https://www.crossref.org/blog/2023-public-data-file-now-available-with-new-and-improved-retrieval-options/
- DataCite: https://support.datacite.org/docs/api-get-doi 

In [2]:
# Load the article metadata table (doi, title, abstract, issn, subjects).
articles_md = pd.read_csv('../data/all_articles_notext.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
articles_md.info()
articles_md.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650037 entries, 0 to 1650036
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   doi       1650036 non-null  object
 1   title     1650033 non-null  object
 2   abstract  1649014 non-null  object
 3   issn      1548098 non-null  object
 4   subjects  686568 non-null   object
dtypes: object(5)
memory usage: 62.9+ MB
None


Unnamed: 0,doi,title,abstract,issn,subjects
0,10.1111/1467-8330.00227,Institutionalising future geographies of finan...,This paper provides a critical overview of rec...,'0066-4812',
1,10.1163/13822373-90002428,Landscaping Hispaniola Moreau de Saint-Méry's ...,This article focuses on Médéric Louis Élie Mor...,"'issn:1382-2373', '1382-2373'",
2,10.3354/meps09193,Modelling the effect of vertical mixing on bot...,Reliable estimates of in situ phytoplankton gr...,"'1616-1599', 'issn:0171-8630', 'issn:1616-1599...",
3,10.1177/1350508408095817,The Body Dances: Carnival Dance and Organization,Building on the work of Pierre Bourdieu and Ma...,'1350-5084',sociology
4,10.1016/s0022-4049(02)00180-9,Arithmeticity of Orbifold Generalised Triangle...,Maclachlan and Martin have proved that only fi...,'0022-4049',


In [3]:
# Keep only rows that can be keyed by DOI:
#   1. records without a DOI are useless, so drop them;
#   2. for duplicated DOIs keep the first occurrence only.
# The result is rebound to the same name instead of using inplace=True, which
# keeps the steps chainable and avoids mutating a frame that was already
# displayed by an earlier cell.
articles_md = (
    articles_md
    .dropna(subset=['doi'])
    .drop_duplicates(subset=['doi'])
)

## Registration authority

In [4]:
# Load the DOI -> registration authority ("ra") lookup and keep Crossref only.
#
# Rows with missing values are dropped BEFORE de-duplicating on DOI.
# De-duplicating first keeps the first row of each DOI even when its `ra` is
# missing; the subsequent dropna() would then discard that DOI entirely,
# although a later duplicate row carried a usable value.
doi_ra = (
    pd.read_csv('../data/RAs/all_prev.csv')
    .dropna()
    .drop_duplicates(subset='doi')
    # Normalise case so that e.g. 'Crossref' and 'crossref' compare equal.
    .assign(ra=lambda frame: frame['ra'].str.lower())
    .loc[lambda frame: frame['ra'] == 'crossref']
)
print(doi_ra.describe())

                              doi        ra
count                     1637243   1618438
unique                    1637240        14
top     10.29238/jnutri.v19i2.259  Crossref
freq                            2   1593310


In [5]:
# Restrict the articles to DOIs present in `doi_ra` (inner join on `doi`).
# Both frames were de-duplicated on `doi` upstream, so the join must be
# one-to-one; validate= makes pandas raise a MergeError instead of silently
# multiplying rows if that assumption is ever violated.
# NOTE: this cell rebinds `articles_md`; running it a second time merges again
# and yields suffixed `ra_x` / `ra_y` columns, so re-run from the cell that
# loads `articles_md` instead.
articles_md = articles_md.merge(doi_ra, on='doi', how='inner', validate='one_to_one')
# Bare last expression -> rich table display rather than wrapped plain text.
articles_md.head()

                             doi  \
0        10.1111/1467-8330.00227   
1      10.1163/13822373-90002428   
2              10.3354/meps09193   
3       10.1177/1350508408095817   
4  10.1016/s0022-4049(02)00180-9   

                                               title  \
0  Institutionalising future geographies of finan...   
1  Landscaping Hispaniola Moreau de Saint-Méry's ...   
2  Modelling the effect of vertical mixing on bot...   
3   The Body Dances: Carnival Dance and Organization   
4  Arithmeticity of Orbifold Generalised Triangle...   

                                            abstract  \
0  This paper provides a critical overview of rec...   
1  This article focuses on Médéric Louis Élie Mor...   
2  Reliable estimates of in situ phytoplankton gr...   
3  Building on the work of Pierre Bourdieu and Ma...   
4  Maclachlan and Martin have proved that only fi...   

                                                issn   subjects        ra  
0                                

# Reading in data

In [2]:
# Read only the first 10,000 rows of the gzip-compressed CSV as a preview.
df400k = pd.read_csv('../data/400k_articles.csv.gz', nrows=10_000)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) produces.
df400k.info()
df400k.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doi       10000 non-null  object
 1   title     10000 non-null  object
 2   fulltext  9995 non-null   object
 3   abstract  10000 non-null  object
 4   issn      9729 non-null   object
 5   subjects  2669 non-null   object
dtypes: object(6)
memory usage: 468.9+ KB
None


Unnamed: 0,doi,title,fulltext,abstract,issn,subjects
0,10.1111/1467-8330.00227,Institutionalising future geographies of finan...,1INSTITUTIONALISING FUTURE GEOGRAPHIES OF FINA...,This paper provides a critical overview of rec...,'0066-4812',
1,10.1163/13822373-90002428,Landscaping Hispaniola Moreau de Saint-Méry's ...,New West Indian Guide / Nieuwe West-Indische G...,This article focuses on Médéric Louis Élie Mor...,"'issn:1382-2373', '1382-2373'",
2,10.3354/meps09193,Modelling the effect of vertical mixing on bot...,MARINE ECOLOGY PROGRESS SERIES\nMar Ecol Prog ...,Reliable estimates of in situ phytoplankton gr...,"'1616-1599', 'issn:0171-8630', 'issn:1616-1599...",
3,10.1177/1350508408095817,The Body Dances: Carnival Dance and Organization,The Body Dances: Carnival Dance \nand Organi...,Building on the work of Pierre Bourdieu and Ma...,'1350-5084',sociology
4,10.1016/s0022-4049(02)00180-9,Arithmeticity of Orbifold Generalised Triangle...,,Maclachlan and Martin have proved that only fi...,'0022-4049',


In [4]:
# Read only the first 10,000 rows of the gzip-compressed CSV as a preview.
df200k = pd.read_csv('../data/200k_articles.csv.gz', nrows=10_000)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) produces.
df200k.info()
df200k.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doi       10000 non-null  object
 1   title     10000 non-null  object
 2   fulltext  9995 non-null   object
 3   abstract  10000 non-null  object
 4   issn      9729 non-null   object
 5   subjects  2669 non-null   object
dtypes: object(6)
memory usage: 468.9+ KB
None


Unnamed: 0,doi,title,fulltext,abstract,issn,subjects
0,10.1111/1467-8330.00227,Institutionalising future geographies of finan...,1INSTITUTIONALISING FUTURE GEOGRAPHIES OF FINA...,This paper provides a critical overview of rec...,'0066-4812',
1,10.1163/13822373-90002428,Landscaping Hispaniola Moreau de Saint-Méry's ...,New West Indian Guide / Nieuwe West-Indische G...,This article focuses on Médéric Louis Élie Mor...,"'issn:1382-2373', '1382-2373'",
2,10.3354/meps09193,Modelling the effect of vertical mixing on bot...,MARINE ECOLOGY PROGRESS SERIES\nMar Ecol Prog ...,Reliable estimates of in situ phytoplankton gr...,"'1616-1599', 'issn:0171-8630', 'issn:1616-1599...",
3,10.1177/1350508408095817,The Body Dances: Carnival Dance and Organization,The Body Dances: Carnival Dance \nand Organi...,Building on the work of Pierre Bourdieu and Ma...,'1350-5084',sociology
4,10.1016/s0022-4049(02)00180-9,Arithmeticity of Orbifold Generalised Triangle...,,Maclachlan and Martin have proved that only fi...,'0022-4049',


In [7]:
# Extract the first ISSN found in each raw `issn` string.
# An ISSN has the form NNNN-NNNC, where the final check character C is a digit
# or the letter X; the earlier pattern \d{4}-\d{4} could not match ISSNs ending
# in X and turned them into NaN. A lowercase x is accepted as well in case the
# source data is not normalised.
# With a single capture group, str.extract returns a one-column DataFrame
# (column label 0); rows without a match are NaN.
issn = df400k.issn.str.extract(r'(\d{4}-\d{3}[\dXx])')
issn.head()

Unnamed: 0,0
0,0066-4812
1,1382-2373
2,1616-1599
3,1350-5084
4,0022-4049
