# Kaggle arXiv Data Parser

[Source](https://www.kaggle.com/code/kaggleaccount2112/extract-explainable-aiml-articles-from-arxiv-data)

In [None]:
import json
import os
import re
from collections import defaultdict

import pandas as pd

# Paste your username where is says 'username'
# and your API key where it say 'key'

os.environ['KAGGLE_USERNAME'] = 'nickmccarty'
os.environ['KAGGLE_KEY'] = ''

# Install Kaggle packages

!pip install -q kaggle

# Download the data set

!kaggle datasets download -d Cornell-University/arxiv

# Remove the sample_data folder from our working directory

!rm -r sample_data

# If you get any errors, its likely due to conflicts in the Python versions and the Unix versions,
# but they should not be an issue. They are more warnings, then errors.
# As long as it downloads the data, you're fine.

!unzip arxiv.zip
!rm -r arxiv.zip

Downloading arxiv.zip to /content
100% 1.17G/1.18G [00:16<00:00, 126MB/s] 
100% 1.18G/1.18G [00:16<00:00, 78.4MB/s]


In [None]:
def _pair_up(prefixes, subjects):
    """Return every '<prefix> <subject>' phrase, iterating prefixes in the outer loop."""
    phrases = []
    for prefix in prefixes:
        phrases.extend(prefix + ' ' + subject for subject in subjects)
    return phrases


# Stand-alone keyword that is used as-is, without a descriptor in front of it
single_term_keywords = ['xai']

# Positive descriptors and the subject terms each of them is combined with
descriptors = ['explainable', 'transparent', 'interpretable']
terms = ['ai', 'machine learning', 'aiml']

# All descriptor/term phrases first, then the single-term keywords at the end
keywords = _pair_up(descriptors, terms) + single_term_keywords

# Descriptors for the opposite concept, combined with the same subject terms
descriptors_anti = ['black-box', 'black box', 'opaque', 'uninterpretable', 'non-transparent', 'inexplicable', 'unexplainable']
anti_keywords = _pair_up(descriptors_anti, terms)

# Show both phrase lists as comma-separated lines
print('Keywords: ' + ', '.join(keywords))
print('Anti-Keywords: ' + ', '.join(anti_keywords))

Keywords: explainable ai, explainable machine learning, explainable aiml, transparent ai, transparent machine learning, transparent aiml, interpretable ai, interpretable machine learning, interpretable aiml, xai
Anti-Keywords: black-box ai, black-box machine learning, black-box aiml, black box ai, black box machine learning, black box aiml, opaque ai, opaque machine learning, opaque aiml, uninterpretable ai, uninterpretable machine learning, uninterpretable aiml, non-transparent ai, non-transparent machine learning, non-transparent aiml, inexplicable ai, inexplicable machine learning, inexplicable aiml, unexplainable ai, unexplainable machine learning, unexplainable aiml


In [None]:
def _compile_phrase_patterns(phrases):
    """Build whole-term regular expressions for a list of phrases.

    Each phrase is escaped and wrapped in look-arounds so that it only matches
    when it is not glued to a neighbouring word character. Plain substring
    tests tag unrelated text, e.g. 'interpretable ai' is a substring of
    'interpretable aircraft'. Trade-off: suffixed forms such as 'xais' no
    longer match.

    Args:
        phrases: iterable of lower-case phrases.

    Returns:
        A list of ``(phrase, compiled_pattern)`` tuples in input order.
    """
    return [
        (phrase, re.compile(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'))
        for phrase in phrases
    ]


def _collapse_whitespace(text):
    """Replace every run of whitespace (including line breaks) with one space.

    Titles and abstracts in the metadata are wrapped over several lines, so a
    multi-word phrase can be split by a newline plus indentation.
    """
    return ' '.join(text.split())


def _matching_phrases(patterns, *texts):
    """Return the phrases from ``patterns`` found in at least one of ``texts``."""
    return [
        phrase
        for phrase, pattern in patterns
        if any(pattern.search(text) for text in texts)
    ]


# Compile the patterns once, outside the per-article loop
keyword_patterns = _compile_phrase_patterns(keywords)
anti_keyword_patterns = _compile_phrase_patterns(anti_keywords)

# Number of matching articles per keyword / anti-keyword
keyword_counts = defaultdict(int)
anti_keyword_counts = defaultdict(int)

# Articles that matched at least one keyword or anti-keyword
articles = []

# The snapshot holds one JSON object per line; read it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open('arxiv-metadata-oai-snapshot.json', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # Skip blank lines, which json.loads would reject
        if not line.strip():
            continue

        article = json.loads(line)

        # `or ''` also covers fields that are present but null
        title = (article.get('title') or '').lower()
        abstract = (article.get('abstract') or '').lower()

        # Match against whitespace-normalised copies of the text
        title_text = _collapse_whitespace(title)
        abstract_text = _collapse_whitespace(abstract)

        # keep a list of keywords that match for this article
        matching_keywords = _matching_phrases(keyword_patterns, abstract_text, title_text)
        matching_anti_keywords = _matching_phrases(anti_keyword_patterns, abstract_text, title_text)

        # increment count for each keyword found in the title or abstract
        for keyword in matching_keywords:
            keyword_counts[keyword] += 1

        for keyword in matching_anti_keywords:
            anti_keyword_counts[keyword] += 1

        if matching_keywords or matching_anti_keywords:
            # append the matching keywords to the article data
            article['matching_keywords'] = matching_keywords
            article['matching_anti_keywords'] = matching_anti_keywords
            articles.append(article)

        # print the titles found on the first 10 lines (lower-cased, original line breaks kept)
        if i < 10:
            print('Title:', title)

# convert the list of articles into a DataFrame
df = pd.DataFrame(articles)

Title: calculation of prompt diphoton production cross sections at tevatron and
  lhc energies
Title: sparsity-certifying graph decompositions
Title: the evolution of the earth-moon system based on the dark matter field
  fluid model
Title: a determinant of stirling cycle numbers counts unlabeled acyclic
  single-source automata
Title: from dyadic $\lambda_{\alpha}$ to $\lambda_{\alpha}$
Title: bosonic characters of atomic cooper pairs across resonance
Title: polymer quantum mechanics and its continuum limit
Title: numerical solution of shock and ramp compression for general material
  properties
Title: the spitzer c2d survey of large, nearby, insterstellar clouds. ix. the
  serpens yso population as observed with irac and mips
Title: partial cubes: structures, characterizations, and constructions


In [None]:
# Display the DataFrame of matched articles as the cell's output
# (the recorded output below shows only the first and last rows, with '...' in between)
df

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,matching_keywords,matching_anti_keywords
0,1301.4564,Rafael Najmanovich,Rafael Najmanovich,Protein flexibility upon ligand binding: Docki...,Thesis for the degree Doctor of Philosophy sub...,,,,q-bio.BM,http://arxiv.org/licenses/nonexclusive-distrib...,Side chain flexibility is an important facto...,"[{'version': 'v1', 'created': 'Sat, 19 Jan 201...",2013-01-22,"[[Najmanovich, Rafael, ]]",[xai],[]
1,1406.6200,Thijs van Ommen,Thijs van Ommen,Combining predictions from linear models when ...,"12 pages, 2 figures. To appear in Proceedings ...",,,,stat.ME cs.LG stat.ML,http://arxiv.org/licenses/nonexclusive-distrib...,Methods for combining predictions from diffe...,"[{'version': 'v1', 'created': 'Tue, 24 Jun 201...",2014-06-25,"[[van Ommen, Thijs, ]]",[xai],[]
2,1503.05526,Fabrice Rossi,"Tsirizo Rabenoro (SAMM), J\'er\^ome Lacaille, ...",Interpretable Aircraft Engine Diagnostic via E...,arXiv admin note: substantial text overlap wit...,Transactions on Machine Learning and Data Mini...,,,stat.ML cs.LG math.ST stat.AP stat.TH,http://arxiv.org/licenses/nonexclusive-distrib...,Detecting early signs of failures (anomalies...,"[{'version': 'v1', 'created': 'Wed, 18 Mar 201...",2015-03-19,"[[Rabenoro, Tsirizo, , SAMM], [Lacaille, Jérôm...",[interpretable ai],[]
3,1606.05798,Guolong Su,"Guolong Su, Dennis Wei, Kush R. Varshney, Dmit...",Interpretable Two-level Boolean Rule Learning ...,presented at 2016 ICML Workshop on Human Inter...,,,WHI 2016 submission,stat.ML cs.LG,http://arxiv.org/licenses/nonexclusive-distrib...,As a contribution to interpretable machine l...,"[{'version': 'v1', 'created': 'Sat, 18 Jun 201...",2016-06-21,"[[Su, Guolong, ], [Wei, Dennis, ], [Varshney, ...",[interpretable machine learning],[]
4,1606.07163,William Souillard-Mandar,"William Souillard-Mandar, Randall Davis, Cynth...",Interpretable Machine Learning Models for the ...,Presented at 2016 ICML Workshop on Human Inter...,,,,stat.ML cs.LG,http://arxiv.org/licenses/nonexclusive-distrib...,"The Clock Drawing Test (CDT) is a rapid, ine...","[{'version': 'v1', 'created': 'Thu, 23 Jun 201...",2016-06-24,"[[Souillard-Mandar, William, ], [Davis, Randal...",[interpretable machine learning],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,2308.00143,Guy Amir,"Shahaf Bassan, Guy Amir, Davide Corsi, Idan Re...",Formally Explaining Neural Networks within Rea...,To appear in Proc. 23rd Int. Conf. on Formal M...,,,,cs.AI cs.LG cs.LO,http://arxiv.org/licenses/nonexclusive-distrib...,Deep neural networks (DNNs) are increasingly...,"[{'version': 'v1', 'created': 'Mon, 31 Jul 202...",2023-08-02,"[[Bassan, Shahaf, ], [Amir, Guy, ], [Corsi, Da...","[explainable ai, xai]",[]
1616,2308.00184,Leopoldo Bertossi,Leopoldo Bertossi,Attribution-Scores in Data Management and Expl...,Paper associated to ADBIS23 tutorial. To appea...,,,,cs.DB cs.AI cs.LG,http://arxiv.org/licenses/nonexclusive-distrib...,We describe recent research on the use of ac...,"[{'version': 'v1', 'created': 'Mon, 31 Jul 202...",2023-08-02,"[[Bertossi, Leopoldo, ]]",[explainable machine learning],[]
1617,2308.00710,Igor Cherepanov,"Igor Cherepanov, David Sessler, Alex Ulmer, He...",Towards the Visualization of Aggregated Class ...,submitted to xaiworldconference2023,,,,cs.LG cs.AI cs.HC,http://creativecommons.org/licenses/by-nc-nd/4.0/,Deep learning (DL) models achieve remarkable...,"[{'version': 'v1', 'created': 'Sat, 29 Jul 202...",2023-08-03,"[[Cherepanov, Igor, ], [Sessler, David, ], [Ul...",[xai],[]
1618,2308.01475,Lili Zheng,"Genevera I. Allen, Luqin Gan, Lili Zheng",Interpretable Machine Learning for Discovery: ...,,,,,stat.ML cs.LG stat.ME,http://creativecommons.org/licenses/by/4.0/,New technologies have led to vast troves of ...,"[{'version': 'v1', 'created': 'Wed, 2 Aug 2023...",2023-08-04,"[[Allen, Genevera I., ], [Gan, Luqin, ], [Zhen...",[interpretable machine learning],[]
