### Create text dataset of Wikipedia articles for specific topics
This notebook does the following:
- Creates a list of topics of interest by parsing a JSON file (substitute your own topic list).
- Extracts and saves the Wikipedia articles about those topics.

In [63]:
import json
import os

import wikipedia

In [71]:
# load doc into memory
def load_doc(filename, encoding='utf-8'):
    """Read a whole text file into memory and return its contents.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        The complete contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid text in ``encoding``.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [72]:
# Path of the JSON file holding the topic list (blank here; set before running).
SAGE_FIELDS_FILE = ""
# Location under which the extracted article text files are written.
SAGE_WIKI_DIR = ""

### Get specific topics

In [73]:
# Read the topics file and parse it; the top-level JSON keys are the fields.
text = load_doc(SAGE_FIELDS_FILE)
sage_fields_json = json.loads(text)
fields = [*sage_fields_json.keys()]

In [85]:
# Flatten the two-level topic mapping into two parallel lists:
# subFields[k] is a subfield name and containerField[k] is the field it sits under.
subFields = []
containerField = []
for field_name in fields:
    children = list(sage_fields_json[field_name].keys())
    subFields.extend(children)
    containerField.extend([field_name] * len(children))

In [86]:
# Subfield names are used as file names further down, so swap every '/' for a space.
subFields = [name.replace('/', ' ') for name in subFields]
subFields

['Ecology & Conservation',
 'Physical Geography',
 'Meteorology',
 'Environmental Chemistry, Substances & Processes',
 'Human Geography',
 'Planning',
 'General Geography, Earth & Environmental Science',
 'Environmental Sciences',
 'Environmental Policy & Law',
 'Environmental Health',
 'Research Methods for Geography, Earth & Environmental Science',
 'General Sociology',
 'Sociology of Education',
 'Social Research',
 'Social Policy',
 'Social Change & Transformation',
 'Sociology of Work & Labor Studies',
 'Sociology of Religion',
 'Area Studies',
 'Gender & Sexuality',
 'Social Sociological Theory',
 'Environmental, Urban & Regional Sociology',
 'Sociology of Sport & Leisure',
 'Sociology of Family',
 'Age & the Life Course',
 'Anthropology',
 'Sociology of Organizations, Institutions & Structure',
 'Political Sociology',
 'Race, Ethnicity & Migration',
 'Crime & Deviance',
 'Sociology of Arts & Culture',
 'Sociology of Health, Illness & the Body',
 'Social Interaction & Everyday Li

### Extract and save Wikipedia articles

In [124]:
# How many times the cleaned parent-field and subfield names are appended to
# each article (presumably to up-weight the topic's own words -- confirm).
CONTAINER_FIELD_REPEATS = 30
SUBFIELD_REPEATS = 60

# NOTE(review): clean_doc is not defined in the visible cells of this notebook;
# it must be defined or imported before this cell is run.

# For every subfield: fetch the matching Wikipedia page, append the repeated
# topic names to its text, and save the result as "<subfield>.txt" under
# SAGE_WIKI_DIR. Subfields with no usable page are collected in `missing`.
missing = []
for sf, cf in zip(subFields, containerField):
    try:
        p = wikipedia.page(sf)
    except wikipedia.exceptions.DisambiguationError as e:
        print ('Options: ',e.options)
        # Fall back to the first suggested option. That lookup can fail as
        # well (no options, another disambiguation, or no such page); record
        # the subfield as missing instead of aborting the whole run.
        try:
            p = wikipedia.page(e.options[0])
        except (IndexError,
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError):
            missing.append(sf)
            continue
    except wikipedia.exceptions.PageError:
        missing.append(sf)
        continue

    data = p.content
    cleanedsf = " ".join(clean_doc(sf))
    cleanedcf = " ".join(clean_doc(cf))
    data += " " + ' '.join([cleanedcf] * CONTAINER_FIELD_REPEATS)
    data += " " + ' '.join([cleanedsf] * SUBFIELD_REPEATS)

    filename = sf+'.txt'
    # os.path.join supplies the separator when SAGE_WIKI_DIR lacks a trailing
    # one; explicit UTF-8 keeps non-ASCII article text from failing to encode
    # on platforms whose default encoding is narrower.
    with open(os.path.join(SAGE_WIKI_DIR, filename), 'w', encoding='utf-8') as file:
        file.write(data)
    print(sf," : ",p.title)

Options:  ['conservation biology', 'Ecology and Society']
Ecology & Conservation  :  Conservation biology
Physical Geography  :  Physical geography
Meteorology  :  Meteorology
Environmental Chemistry, Substances & Processes  :  Environmental chemistry
Human Geography  :  Human geography
Planning  :  Planning
General Geography, Earth & Environmental Science  :  Earth science
Environmental Sciences  :  Environmental science
Environmental Policy & Law  :  Environmental policy
Environmental Health  :  Environmental health
Research Methods for Geography, Earth & Environmental Science  :  Earth science
General Sociology  :  Sociology
Sociology of Education  :  Sociology of education
Social Research  :  Social research
Social Policy  :  Social policy
Social Change & Transformation  :  Social change
Sociology of Work & Labor Studies  :  Sociology
Sociology of Religion  :  Sociology of religion
Area Studies  :  Area studies
Gender & Sexuality  :  Gender studies
Social Sociological Theory  :  So

In [125]:
# Subfields the extraction loop recorded as having no usable Wikipedia page.
missing

['Research Methods & Data Analysis in Psychology',
 'Counseling Setting   Client Groups',
 'Teaching Diverse Students',
 'Teacher Assistants Support',
 'Preservice Training']