Data Wrangling
----

# Outline

# Setup

In [5]:
import pandas as pd
import requests
import numpy as np
from datetime import datetime
from os import getcwd
import json
import gzip
# Show the current working directory; the relative '../data/...' paths used
# in later cells resolve against it.
print("Working directory: ", getcwd())

Working directory:  c:\Users\marks\coding\QualAbstracts\notebooks


# Filtering for Social Science and Humanities Articles

## Links to APIs
- Finding registration authority: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#4-which-ra
    - Note: the vast majority of academic articles are registered with Crossref
- Crossref lookup by doi: https://api.crossref.org/swagger-ui/index.html#/Works/get_works__doi_
    - Note: Crossref prefers that API requests include a contact email address
    - Alternatively, download the full 2023 public data file (185 GB) as a torrent: https://www.crossref.org/blog/2023-public-data-file-now-available-with-new-and-improved-retrieval-options/
- DataCite: https://support.datacite.org/docs/api-get-doi 

In [2]:
# Load the article metadata table (doi, title, abstract, issn, subjects).
articles_md = pd.read_csv('../data/all_articles_notext.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
articles_md.info()
articles_md.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650037 entries, 0 to 1650036
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   doi       1650036 non-null  object
 1   title     1650033 non-null  object
 2   abstract  1649014 non-null  object
 3   issn      1548098 non-null  object
 4   subjects  686568 non-null   object
dtypes: object(5)
memory usage: 62.9+ MB
None


Unnamed: 0,doi,title,abstract,issn,subjects
0,10.1111/1467-8330.00227,Institutionalising future geographies of finan...,This paper provides a critical overview of rec...,'0066-4812',
1,10.1163/13822373-90002428,Landscaping Hispaniola Moreau de Saint-Méry's ...,This article focuses on Médéric Louis Élie Mor...,"'issn:1382-2373', '1382-2373'",
2,10.3354/meps09193,Modelling the effect of vertical mixing on bot...,Reliable estimates of in situ phytoplankton gr...,"'1616-1599', 'issn:0171-8630', 'issn:1616-1599...",
3,10.1177/1350508408095817,The Body Dances: Carnival Dance and Organization,Building on the work of Pierre Bourdieu and Ma...,'1350-5084',sociology
4,10.1016/s0022-4049(02)00180-9,Arithmeticity of Orbifold Generalised Triangle...,Maclachlan and Martin have proved that only fi...,'0022-4049',


In [3]:
# records without DOIs are useless
articles_md.dropna(subset=['doi'], inplace=True)
# remove duplicated DOIs
articles_md.drop_duplicates(subset=['doi'], inplace=True)

## Registration authority

In [3]:
# DOI -> registration authority lookup table, reduced to the DOIs that are
# registered with Crossref. Duplicated DOIs are removed first, then any row
# containing a missing value.
doi_ra = (
    pd.read_csv('../data/doi_ra.csv')
    .drop_duplicates(subset='doi')
    .dropna()
    .loc[lambda ra_table: ra_table['ra'] == 'Crossref']
)
print(doi_ra.describe())

                            doi        ra
count                   1598295   1598295
unique                  1598295         1
top     10.1111/1467-8330.00227  Crossref
freq                          1   1598295


## Crossref data

In [6]:
# Read one gzip-compressed JSON shard of the Crossref data file and show
# its top-level keys.
with gzip.open('../data/crossref_data_2023_04/0.json.gz', mode='rb') as shard_file:
    cr_json0 = json.loads(shard_file.read())
cr_json0.keys()

dict_keys(['items'])

In [9]:
# The shard's only top-level key is 'items'; keep its value for the cells below.
cr_list0 = cr_json0['items']
# Display the first record to see which fields a Crossref item carries.
cr_list0[0]

{'URL': 'http://dx.doi.org/10.1063/1.1486445',
 'resource': {'primary': {'URL': 'http://aip.scitation.org/doi/10.1063/1.1486445'}},
 'member': '317',
 'score': 0.0,
 'created': {'date-parts': [[2002, 7, 28]],
  'date-time': '2002-07-28T22:05:02Z',
  'timestamp': 1027893902000},
 'ISSN': ['0021-9606', '1089-7690'],
 'container-title': ['The Journal of Chemical Physics'],
 'issued': {'date-parts': [[2002, 7, 22]]},
 'issue': '4',
 'prefix': '10.1063',
 'reference-count': 76,
 'indexed': {'date-parts': [[2023, 3, 8]],
  'date-time': '2023-03-08T23:53:53Z',
  'timestamp': 1678319633621},
 'author': [{'given': 'Jan',
   'family': 'Forsman',
   'sequence': 'first',
   'affiliation': []},
  {'given': 'Clifford E.',
   'family': 'Woodward',
   'sequence': 'additional',
   'affiliation': []},
  {'given': 'Ben C.',
   'family': 'Freasier',
   'sequence': 'additional',
   'affiliation': []}],
 'DOI': '10.1063/1.1486445',
 'is-referenced-by-count': 40,
 'published': {'date-parts': [[2002, 7, 22]]}

In [11]:
# Subject labels attached to the first record.
cr_list0[0]['subject']

['Physical and Theoretical Chemistry', 'General Physics and Astronomy']

In [29]:
# Flatten each Crossref record into one row: DOI, subjects, journal title.
# - np.nan replaces np.NaN, an alias that was removed in NumPy 2.0.
# - 'subjects' keeps the quoted, comma-separated string form (the list repr
#   without its brackets) so substring searches with .str.contains work on it;
#   records without a 'subject' key get NaN.
# - 'container-title' is read defensively: a record that lacks the key or
#   carries an empty list yields NaN instead of raising KeyError/IndexError.
cf_df0 = pd.DataFrame(
    [{
        'doi': item['DOI'],
        'subjects': str(item['subject'])[1:-1] if 'subject' in item else np.nan,
        'journal_title': (item.get('container-title') or [np.nan])[0],
    } for item in cr_list0]
)
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would only add a stray "None" line.
cf_df0.info()
cf_df0.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   doi            5000 non-null   object
 1   subjects       4742 non-null   object
 2   journal_title  5000 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB
None


Unnamed: 0,doi,subjects,journal_title
0,10.1063/1.1486445,"'Physical and Theoretical Chemistry', 'General...",The Journal of Chemical Physics
1,10.1021/cm011675j,"'Materials Chemistry', 'General Chemical Engin...",Chemistry of Materials
2,10.1108/01443579610125787,"'Management of Technology and Innovation', 'St...",International Journal of Operations &amp; Prod...
3,10.1108/01437730110380183,'Organizational Behavior and Human Resource Ma...,Leadership &amp; Organization Development Journal
4,10.1149/1.1392654,"'Materials Chemistry', 'Electrochemistry', 'Su...",Journal of The Electrochemical Society


In [25]:
# Collect every subject label in the shard and print the distinct values.
# The labels are read straight from the raw Crossref records: cf_df0.subjects
# stores each row's subjects as one string (NaN when absent), so extending a
# list with those values would add single characters and raise TypeError on
# the NaN entries.
subjects_list = [
    subject
    for item in cr_list0
    for subject in item.get('subject', [])  # records without subjects add nothing
]
subjects = pd.Series(subjects_list)
print(np.unique(subjects_list))

['Accounting' 'Acoustics and Ultrasonics'
 'Advanced and Specialized Nursing' 'Aerospace Engineering' 'Aging'
 'Agricultural and Biological Sciences (miscellaneous)'
 'Agronomy and Crop Science' 'Algebra and Number Theory' 'Analysis'
 'Analytical Chemistry' 'Anesthesiology and Pain Medicine'
 'Animal Science and Zoology' 'Anthropology' 'Applied Mathematics'
 'Applied Microbiology and Biotechnology' 'Applied Psychology'
 'Aquatic Science' 'Architecture' 'Artificial Intelligence'
 'Arts and Humanities (miscellaneous)' 'Astronomy and Astrophysics'
 'Atomic and Molecular Physics, and Optics' 'Automotive Engineering'
 'Behavioral Neuroscience' 'Biochemistry' 'Bioengineering' 'Biomaterials'
 'Biomedical Engineering' 'Biophysics' 'Biotechnology'
 'Building and Construction' 'Business and International Management'
 'Business, Management and Accounting (miscellaneous)' 'Cancer Research'
 'Cardiology and Cardiovascular Medicine' 'Catalysis' 'Cell Biology'
 'Cellular and Molecular Neuroscience' '

In [33]:
# Rows whose subject string mentions "liter" (any letter case); rows with no
# subjects (NaN) are treated as non-matches.
cf_df0.loc[lambda frame: frame['subjects'].str.contains('liter', case=False, na=False)]

Unnamed: 0,doi,subjects,journal_title
1927,10.1080/147538202320147830,"'Literature and Literary Theory', 'Cultural St...",Bulletin of Spanish Studies
2557,10.1016/s0304-422x(02)00008-6,"'Literature and Literary Theory', 'Linguistics...",Poetics
2626,10.1163/156853302760013866,"'Literature and Literary Theory', 'Linguistics...",Vetus Testamentum
3077,10.1080/13696810220146100,"'Literature and Literary Theory', 'Linguistics...",Journal of African Cultural Studies
3157,10.1163/156853302760013857,"'Literature and Literary Theory', 'Linguistics...",Vetus Testamentum
4730,10.1080/09670880220148348,"'Literature and Literary Theory', 'Sociology a...",Irish Studies Review


In [32]:
# IPython help lookup: shows the signature and docstring of Series.str.contains.
# NOTE(review): looks like leftover exploration - consider removing this cell
# from the finished notebook.
? pd.Series.str.contains

[1;31mSignature:[0m
 [0mpd[0m[1;33m.[0m[0mSeries[0m[1;33m.[0m[0mstr[0m[1;33m.[0m[0mcontains[0m[1;33m([0m[1;33m
[0m    [0mself[0m[1;33m,[0m[1;33m
[0m    [0mpat[0m[1;33m,[0m[1;33m
[0m    [0mcase[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mflags[0m[1;33m:[0m [1;34m'int'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mna[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mregex[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Test if pattern or regex is contained within a string of a Series or Index.

Return boolean Series or Index based on whether a given pattern or regex is
contained within a string of a Series or Index.

Parameters
----------
pat : str
    Character sequence or regular expression.
case : bool, default True
    If True, case sensitive.
flags : int, default 0 (no flag

# Reading in data

In [2]:
# Preview only the first 10,000 rows of the gzip-compressed 400k-article file,
# which also carries a 'fulltext' column.
df400k = pd.read_csv('../data/400k_articles.csv.gz', nrows=10_000)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df400k.info()
df400k.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doi       10000 non-null  object
 1   title     10000 non-null  object
 2   fulltext  9995 non-null   object
 3   abstract  10000 non-null  object
 4   issn      9729 non-null   object
 5   subjects  2669 non-null   object
dtypes: object(6)
memory usage: 468.9+ KB
None


Unnamed: 0,doi,title,fulltext,abstract,issn,subjects
0,10.1111/1467-8330.00227,Institutionalising future geographies of finan...,1INSTITUTIONALISING FUTURE GEOGRAPHIES OF FINA...,This paper provides a critical overview of rec...,'0066-4812',
1,10.1163/13822373-90002428,Landscaping Hispaniola Moreau de Saint-Méry's ...,New West Indian Guide / Nieuwe West-Indische G...,This article focuses on Médéric Louis Élie Mor...,"'issn:1382-2373', '1382-2373'",
2,10.3354/meps09193,Modelling the effect of vertical mixing on bot...,MARINE ECOLOGY PROGRESS SERIES\nMar Ecol Prog ...,Reliable estimates of in situ phytoplankton gr...,"'1616-1599', 'issn:0171-8630', 'issn:1616-1599...",
3,10.1177/1350508408095817,The Body Dances: Carnival Dance and Organization,The Body Dances: Carnival Dance \nand Organi...,Building on the work of Pierre Bourdieu and Ma...,'1350-5084',sociology
4,10.1016/s0022-4049(02)00180-9,Arithmeticity of Orbifold Generalised Triangle...,,Maclachlan and Martin have proved that only fi...,'0022-4049',


In [4]:
# Preview only the first 10,000 rows of the gzip-compressed 200k-article file.
df200k = pd.read_csv('../data/200k_articles.csv.gz', nrows=10_000)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df200k.info()
df200k.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doi       10000 non-null  object
 1   title     10000 non-null  object
 2   fulltext  9995 non-null   object
 3   abstract  10000 non-null  object
 4   issn      9729 non-null   object
 5   subjects  2669 non-null   object
dtypes: object(6)
memory usage: 468.9+ KB
None


Unnamed: 0,doi,title,fulltext,abstract,issn,subjects
0,10.1111/1467-8330.00227,Institutionalising future geographies of finan...,1INSTITUTIONALISING FUTURE GEOGRAPHIES OF FINA...,This paper provides a critical overview of rec...,'0066-4812',
1,10.1163/13822373-90002428,Landscaping Hispaniola Moreau de Saint-Méry's ...,New West Indian Guide / Nieuwe West-Indische G...,This article focuses on Médéric Louis Élie Mor...,"'issn:1382-2373', '1382-2373'",
2,10.3354/meps09193,Modelling the effect of vertical mixing on bot...,MARINE ECOLOGY PROGRESS SERIES\nMar Ecol Prog ...,Reliable estimates of in situ phytoplankton gr...,"'1616-1599', 'issn:0171-8630', 'issn:1616-1599...",
3,10.1177/1350508408095817,The Body Dances: Carnival Dance and Organization,The Body Dances: Carnival Dance \nand Organi...,Building on the work of Pierre Bourdieu and Ma...,'1350-5084',sociology
4,10.1016/s0022-4049(02)00180-9,Arithmeticity of Orbifold Generalised Triangle...,,Maclachlan and Martin have proved that only fi...,'0022-4049',


In [7]:
# Pull the first ISSN out of each raw 'issn' string. The last ISSN character
# is a check digit that can be 'X' as well as 0-9, so the pattern accepts
# both; a digits-only pattern would skip such ISSNs.
issn = df400k.issn.str.extract(r'(\d{4}-\d{3}[\dXx])')
issn.head()

Unnamed: 0,0
0,0066-4812
1,1382-2373
2,1616-1599
3,1350-5084
4,0022-4049
