In [1]:
import os
import json
import numpy as np
from collections import defaultdict
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import pos_tag

In [2]:
# Part-of-speech tags that `is_noun` accepts as nouns.
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']


def is_noun(pos):
    """Return True if `pos` is one of the tags in `noun_tags`, otherwise False."""
    return pos in noun_tags

In [3]:
def extract_nouns(text):
    """Return the lower-cased nouns found in `text`, joined by single spaces.

    The text is split into sentences with `sent_tokenize`; each sentence is
    tokenized with `word_tokenize` and tagged with `pos_tag`. Tokens whose
    tag passes `is_noun` are kept, in their original order.
    """
    collected = []
    for sent in sent_tokenize(text):
        tagged_tokens = pos_tag(word_tokenize(sent))
        collected.extend(token.lower() for token, tag in tagged_tokens if is_noun(tag))
    return ' '.join(collected)

In [4]:
def get_categories(categories):
    """Split a whitespace-separated category string into specific and top-level categories.

    Parameters
    ----------
    categories : str
        Whitespace-separated category identifiers, e.g. ``'cs.LG stat.ML'``.

    Returns
    -------
    tuple of (list of str, list of str)
        The specific categories in input order, and the distinct top-level
        categories (the part of each identifier before the first ``'.'``)
        in order of first appearance.
    """
    specific_categories = categories.split()

    # Deduplicate while keeping first-appearance order. Going through a plain
    # set() made the resulting order depend on string hashing, which can vary
    # between interpreter runs, so regenerated datasets were not reproducible.
    seen = set()
    top_level_categories = []
    for category in specific_categories:
        top_level = category.split('.')[0]
        if top_level not in seen:
            seen.add(top_level)
            top_level_categories.append(top_level)

    return specific_categories, top_level_categories

In [7]:
def generate_dataset(input_file, output_file):
    """Convert a JSON-lines file of arXiv records into a JSON-lines noun-feature file.

    Each non-blank line of `input_file` must be a JSON object with an
    ``'arXiv'`` key whose value holds ``'categories'``, ``'title'`` and
    ``'abstract'`` strings. For every such record one JSON object is written
    to `output_file` (one per line) with the keys:

    - ``'categories'`` / ``'top_level_categories'``: result of `get_categories`
    - ``'nouns'``: nouns of the title followed by nouns of the abstract,
      as produced by `extract_nouns`, separated by a single space.

    Parameters
    ----------
    input_file : str
        Path of the JSON-lines file to read.
    output_file : str
        Path of the JSON-lines file to write; overwritten if it exists.
    """
    # Both handles are managed by `with` so they are flushed and closed even
    # if a record fails to parse. The input is opened first so a missing
    # input file does not truncate an existing output file.
    with open(input_file, 'r') as source, open(output_file, 'w') as sink:
        for line in source:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            input_row = json.loads(line)['arXiv']

            specific, top_level = get_categories(input_row['categories'])
            title_nouns = extract_nouns(input_row['title'])
            abstract_nouns = extract_nouns(input_row['abstract'])

            output_row = {
                'categories': specific,
                'top_level_categories': top_level,
                # strip() removes the separator when either side is empty.
                'nouns': (title_nouns + ' ' + abstract_nouns).strip(),
            }

            json.dump(output_row, sink)
            sink.write('\n')

In [8]:
# Training set: 2016
# Test set: 2017

DATA_DIR = os.path.join('..', 'data-by-year')
NOUN_FEATURES_DIR = os.path.join('..', 'features', 'nouns')

# Create the output directory (including the intermediate 'features' level)
# only when it is missing, so this cell works both on a fresh checkout and
# on re-runs without having to comment the directory creation in or out.
if not os.path.isdir(NOUN_FEATURES_DIR):
    os.makedirs(NOUN_FEATURES_DIR)

for year in ('2016', '2017'):
    generate_dataset(os.path.join(DATA_DIR, year + '.json'), os.path.join(NOUN_FEATURES_DIR, year + '.json'))