# Frontiers for Young Minds – Article Classification

## Article Link Retrieval

In [None]:
from bs4 import BeautifulSoup

import pathlib

# Section slugs; the listing page for each is read from webpages/<slug>.htm.
sections = ['astronomy-and-space-science',
            'biodiversity',
            'earth-and-its-resources',
            'human-health',
            'mathematics',
            'neuroscience-and-psychology'
]

# Make sure the output directory exists before the first write.
out_dir = pathlib.Path('sections')
out_dir.mkdir(parents=True, exist_ok=True)

for sec in sections:
    page = sec + '.htm'
    fname = sec + '.txt'

    # Close the page as soon as it is parsed, and name the parser explicitly
    # so the result does not depend on which optional parsers are installed.
    with open('webpages/' + page, encoding='utf8') as html:
        bs = BeautifulSoup(html, 'html.parser')

    # Collect the target of every article link; anchors without an href are
    # skipped instead of being written out as the string "None".
    links = []
    for anchor in bs.find_all('a', class_="article-link"):
        href = anchor.get('href')
        if href:
            links.append(href)
    print(sec, len(links))

    # One link per line in sections/<slug>.txt.
    with open(out_dir / fname, 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % link for link in links)

## Web Scraping

In [None]:
import os, pathlib
from urllib.request import urlopen
from bs4 import BeautifulSoup

sections = ['astronomy-and-space-science',
            'biodiversity',
            'earth-and-its-resources',
            'human-health',
            'mathematics',
            'neuroscience-and-psychology'
]

base_dir = pathlib.Path('sections')
for sec in sections:
    sec_dir = base_dir / sec
    # exist_ok lets this cell be re-run after an earlier (or partial) scrape.
    os.makedirs(sec_dir, exist_ok=True)

    # Article URLs for this section, one per line; blank lines are ignored so
    # they never reach urlopen().
    with open(base_dir / (sec + '.txt'), encoding='utf-8') as f:
        links = [line.strip() for line in f if line.strip()]

    for link in links:
        # The output file is named after whatever follows 'frym.' in the URL.
        fname = link.partition('frym.')[2] + '.txt'

        # Parse inside the context manager so the HTTP response is closed.
        with urlopen(link) as response:
            bs = BeautifulSoup(response, 'html.parser')

        heading = bs.find('h1', class_="heading fulltext-heading")
        if heading is None:
            # Without the title heading the page is not in the expected
            # layout; report it and move on rather than abort the whole run.
            print('skipping (no title heading found):', link)
            continue
        title = heading.text

        # Only <p> elements that carry neither an id nor a class.
        paras = bs.find_all('p', id=False, class_=False)

        # Title first, then paragraphs up to (not including) the first one
        # that mentions 'conflict of interest'.
        paras_list = [title]
        for para in paras:
            if 'conflict of interest' in para.text:
                break
            paras_list.append(para.text)

        with open(sec_dir / fname, 'w', encoding='utf-8') as f:
            f.writelines("%s\n" % para for para in paras_list)

## Text Standardization ##

#### Text standardization function

In [None]:
import re
import string

# Bracketed spans that start with a digit, e.g. "[1]" or "[2, 3]".
# The character class stops at the first "]" so text between two separate
# citations in one sentence is kept.
_CITATION_RE = re.compile(r"\[\d[^\]]*\]")
# Parenthesised spans that start with "Figure", e.g. "(Figure 1A)".
_FIGURE_RE = re.compile(r"\(Figure[^)]*\)")
# Deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def standardize(fname):
    """Read a text file and return its content as one normalized string.

    The first two lines of the file are discarded. Each remaining line is
    split on "." and, in every piece, bracketed numeric citations and
    "(Figure ...)" references are removed. The pieces are then joined with
    single spaces, lower-cased, stripped of ASCII punctuation, and cleaned
    of the typographic apostrophe ("’s" and "’").

    Args:
        fname: Path of the UTF-8 text file to read.

    Returns:
        The normalized text as a single string.
    """
    with open(fname, encoding='utf-8') as f:
        paras = f.readlines()[2:]

    std_lines = []
    for para in paras:
        for line in para.rstrip("\n").split("."):
            line = _CITATION_RE.sub("", line)
            line = _FIGURE_RE.sub("", line)
            std_lines.append(line)

    # Join the pieces directly instead of taking the list's repr: the repr
    # turns tabs and non-printable characters into escape sequences whose
    # letters survive punctuation removal and glue words together.
    text = " ".join(std_lines).lower()
    text = text.translate(_PUNCT_TABLE)
    text = text.replace("’s", "")
    text = text.replace("’", "")
    return text

#### Text tokenization function

In [None]:
import nltk
# Download the NLTK 'stopwords' resource that stopwords.words('english') reads.
nltk.download('stopwords')
from nltk.corpus import stopwords

def tokenize(text):
    """Split text into tokens, dropping numbers and English stop words.

    Args:
        text: String to split on whitespace.

    Returns:
        A list of the whitespace-separated tokens of ``text``, in order,
        excluding tokens made up only of digits and tokens that appear in
        NLTK's English stop-word list.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per token.
    sws = set(stopwords.words('english'))
    return [t for t in text.split() if not t.isdigit() and t not in sws]

### Text standardization of text files from the following 4 sections:
 * **biodiversity**
 * **earth and its resources**
 * **human health**
 * **neuroscience and psychology**

In [None]:
import os, pathlib

secs_dir = pathlib.Path('sections')
frym_dir = pathlib.Path('frym')
# exist_ok lets this cell be re-run without failing on existing output.
os.makedirs(frym_dir, exist_ok=True)

sections = ['biodiversity',
            'earth-and-its-resources',
            'human-health',
            'neuroscience-and-psychology'
]

for sec in sections:
    out_dir = frym_dir / sec
    os.makedirs(out_dir, exist_ok=True)

    # Sorted for a reproducible processing order; sub-directories (such as
    # notebook checkpoint folders) are skipped because standardize() expects
    # a readable text file.
    for src in sorted((secs_dir / sec).iterdir()):
        if not src.is_file():
            continue

        text = standardize(src)
        tokens = tokenize(text)
        text = " ".join(tokens)

        # Same file name as the source, under frym/<section>/.
        with open(out_dir / src.name, 'w', encoding='utf-8') as f:
            f.write(text)