## Filtering DOAJ database by subject

In [35]:
import pandas as pd

def doaj_subject_filter(filename, topics, output_path='filtered_doaj.csv'):
    """Keep the records whose ``subjects`` field matches any of ``topics``.

    The file is read as JSON Lines (one JSON object per line). The
    ``subjects`` column is cast to ``str`` before matching, so a non-string
    value is matched against its string form (a list cell, for example,
    becomes its repr such as ``"['Medicine', 'Medicine (General)']"``).

    Parameters
    ----------
    filename : str or path-like
        JSON Lines file that contains a ``subjects`` column.
    topics : str or iterable of str
        Regular-expression fragments; they are joined with ``|`` so a row is
        kept when at least one of them matches. Regex metacharacters that
        should match literally (parentheses, dots, ...) must be escaped by
        the caller. A single string is treated as one topic.
    output_path : str, path-like or None, default 'filtered_doaj.csv'
        Where the filtered frame is written as CSV. Pass ``None`` to skip
        writing.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with ``subjects`` as strings.

    Raises
    ------
    ValueError
        If ``topics`` is empty. An empty pattern matches every row, which
        would silently disable the filter.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(topics, str):
        topics = [topics]
    topics = list(topics)
    if not topics:
        raise ValueError('topics must contain at least one pattern')

    df = pd.read_json(filename, lines=True)
    df['subjects'] = df['subjects'].astype(str)

    searchfor = '|'.join(topics)
    print('Searching for:', searchfor)

    df = df.loc[df['subjects'].str.contains(searchfor, regex=True)]

    print('Dataframe size:', len(df))
    if output_path is not None:
        print('Saving to csv')
        df.to_csv(output_path, header=True, index=False)

    return df

### Example

In [36]:
# Each topic is a regular-expression fragment. Raw strings keep the
# backslashes that escape the literal parentheses without triggering
# Python's invalid-escape-sequence warning for "\(".
topics = [
    r"'Medicine'",
    r"'Public aspects of medicine'",
    r"'Social sciences \(General\)'",
]

df = doaj_subject_filter('sample_processed_doaj_600k.json', topics)

Searching for: 'Medicine'|'Public aspects of medicine'|'Social sciences \(General\)'
Dataframe size: 104389
Saving to csv


In [38]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,title,abstract,year,subjects
0,DEGENERACIÓN MACULAR RELACIONADA CON LA EDAD. ...,Se realizó un estudio transversal descriptivo ...,2006,"['Medicine', 'Medicine (General)']"
7,Association between perceived insufficient sle...,<p>Abstract</p> <p>Background</p> <p>Although ...,2013,['Public aspects of medicine']
8,Amniotic band syndrome: A case report and revi...,Amniotic band syndrome is a rare disorder with...,2018,['Medicine']
12,Avaliação da contaminação experimental de arei...,Procurou-se avaliar métodos e soluções tradici...,1991,"['Medicine', 'Public aspects of medicine']"
13,The principles of medical ethics and medical r...,In this paper I discuss the application of the...,1999,"['Medicine', 'Public aspects of medicine']"
