In [2]:
import os
import json
import time as t
import pandas as pd
from tqdm import tqdm

In [2]:
def get_doaj_files(doaj_path='doaj_article_data_2021-04-07/'):
    """Return the path of every entry located directly inside ``doaj_path``.

    Args:
        doaj_path: Directory that holds the DOAJ dump files.

    Returns:
        list of str: ``doaj_path`` joined with each name reported by
        ``os.listdir``, in the order ``os.listdir`` returns them.
    """
    paths = []
    for entry_name in os.listdir(doaj_path):
        paths.append(os.path.join(doaj_path, entry_name))
    return paths

def get_file_content(file_path):
    """Read one JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to load.

    Returns:
        The Python object decoded from the file (whatever ``json.load``
        produces for its content).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform locale,
    # which mis-decodes non-ASCII titles/abstracts on non-UTF-8 systems.
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)
    
def extract_useful_info(paper_dict):
    """Pick the fields of interest out of one article record.

    Args:
        paper_dict: Article record. It must hold a ``'bibjson'`` mapping with
            at least the ``'title'`` and ``'abstract'`` keys.

    Returns:
        dict: Always contains ``'title'`` and ``'abstract'``. ``'year'`` is
        added when ``bibjson`` has it, and ``'subjects'`` (the ``'term'`` of
        every entry in ``bibjson['subject']``) when ``'subject'`` is present.

    Raises:
        KeyError: If ``'bibjson'``, ``'title'`` or ``'abstract'`` is missing,
            or a subject entry has no ``'term'``.
    """
    # Read from the argument itself rather than from a notebook-level variable,
    # so the result depends only on what the caller passes in.
    bibjson = paper_dict['bibjson']

    useful_info = {
        'title': bibjson['title'],
        'abstract': bibjson['abstract'],
    }

    # Optional fields: copied only when the record provides them.
    if 'year' in bibjson:
        useful_info['year'] = bibjson['year']
    if 'subject' in bibjson:
        useful_info['subjects'] = [e['term'] for e in bibjson['subject']]

    return useful_info

def save(file_path, content):
    """Append ``content`` to ``file_path`` as a single JSON line.

    Args:
        file_path: Destination file; created if missing, appended to otherwise.
        content: JSON-serialisable object to store.

    Raises:
        TypeError: If ``content`` cannot be serialised to JSON. Nothing is
            written to the file in that case.
    """
    # Serialise before touching the file: json.dump streams chunks as it goes,
    # so a failure half-way through would leave a truncated line in the output.
    line = json.dumps(content) + '\n'
    with open(file_path, 'a', encoding='utf-8') as out:
        out.write(line)

In [3]:
# Walk every file of the DOAJ dump, keep the useful fields of each article and
# route articles that cannot be processed to separate files for inspection.
tic = t.time()

files = get_doaj_files(doaj_path='doaj_article_data_2021-05-01/')

count_success = 0
count_fail = 0
count_fail_with_abs = 0

# Output files, one JSON object per line. `save` appends, so re-running this
# cell adds to whatever these files already contain.
processed_doaj = 'processed_doaj.json'
fail_doaj = 'fail_doaj.json'
fail_with_abs = 'fail_with_abs.json'

for f in tqdm(files):
    doaj_file = get_file_content(f)

    for paper in doaj_file:
        try:
            useful_info = extract_useful_info(paper)
            save(processed_doaj, useful_info)
            count_success += 1

        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit can still stop this long-running loop.
        except Exception:
            count_fail += 1
            save(fail_doaj, paper)

            # Failed although the record's repr mentions an 'abstract' key:
            # kept apart so these cases can be looked at separately.
            if "'abstract':" in str(paper):
                count_fail_with_abs += 1
                save(fail_with_abs, paper)

print('- success:', count_success)
print('- fail:', count_fail)
print('- fail_with_abs:', count_fail_with_abs)

tac = t.time()
duration = round((tac-tic)/60, 2)
print('Duration: {} min\n'.format(duration))

100%|██████████| 60/60 [07:19<00:00,  7.33s/it]

- success: 5439087
- fail: 550569
- fail_with_abs: 1060
Duration: 7.33 min






## Adapting format for embedding creation

In [40]:
# Base name (without extension) of the sampled JSON Lines file to load.
filename = 'sample_processed_doaj_1m_equiv_wiki' #'sample_mini'
json_path = filename+'.json'

# lines=True: the file is read as one JSON object per line.
df = pd.read_json(json_path, lines=True)
print(len(df))
df.head()

1000000


Unnamed: 0,title,abstract,year,subjects
0,Surgical technologists' knowledge and performa...,INTRODUCTION: The use of ionizing radiation in...,2020,"[Special aspects of education, Public aspects ..."
1,ASESORÍA ACADÉMICA UNIVERSITARIA: PERFIL DE CO...,Resumen:El artículo se refiere a las generalid...,2010,"[Education, Education (General)]"
2,Potential biomarkers of childhood brain tumor ...,Abstract Brain tumors are the most common soli...,2021,"[Medicine, Science]"
3,Un programa de formación continua con profesor...,En este artículo se presentan los resultados d...,2012,"[Special aspects of education, Theory and prac..."
4,Music in the Thought of Deconstruction / Decon...,This article critically speculates on points o...,2005,[Music]


In [41]:
# Strip the characters that would break a one-record-per-line, tab-separated
# file: quotes, carriage returns and tabs are dropped, newlines become spaces.
cleanup_patterns = {'"': '', '\n': ' ', '\r': '', '\t': ''}
df.replace(cleanup_patterns, regex=True, inplace=True)

# Write only title and abstract, without header or index.
# NOTE(review): newer pandas releases renamed `line_terminator` to
# `lineterminator` — confirm against the installed pandas version.
csv_path = filename+'.csv'
text_columns = df[['title', 'abstract']]
text_columns.to_csv(csv_path, index=False, header=False, sep='\t', line_terminator='\n')

## Shuffling and sampling
To shuffle and sample, it is better, faster and easier to use the Linux shell. The command below draws 600,000 random lines, i.e. ~10% of the total number of successfully processed articles:


```shell
shuf -n 600000 processed_doaj.json > sample_processed_doaj.json
```