In [1]:
import json
import pandas as pd
import numpy as np
import glob
import os
import re
from tqdm import tqdm
import nltk
import random
from nltk.tokenize import word_tokenize,sent_tokenize

# Every JSON file under data/train is one training document.
train_example_paths = glob.glob('data/train/*.json')
# Derive document ids from the file names. splitext keeps ids that contain
# dots intact, and the suffix filter skips stray non-JSON entries that would
# otherwise turn into ids with no matching file.
train_example_names = [os.path.splitext(fn)[0]
                       for fn in os.listdir('data/train')
                       if fn.endswith('.json')]

metadata = pd.read_csv('data/train.csv')
docIdx = train_example_names.copy()

import string

##### STEP 1: Make a list of the known labels provided to us

# NOTE: text_cleaning is defined in a later cell of this notebook; that cell
# has to be executed before this one on a fresh kernel.
# dropna() keeps missing cells from being turned into the literal label 'nan'.
temp_1 = [text_cleaning(x) for x in metadata['dataset_label'].dropna()]
temp_2 = [text_cleaning(x) for x in metadata['dataset_title'].dropna()]
temp_3 = [text_cleaning(x) for x in metadata['cleaned_label'].dropna()]

# De-duplicate, and discard labels that cleaned down to nothing but
# whitespace: such a label is a substring of practically every sentence.
existing_labels = temp_1 + temp_2 + temp_3
existing_labels = {l.lower() for l in existing_labels if l.strip()}

# Sort labels by length in descending order; ties are broken alphabetically
# so the order (and therefore which label is matched first) is reproducible.
existing_labels = sorted(existing_labels, key=lambda l: (-len(l), l))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozano\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def load_train_example_by_name(name):
    """Load one training document from ``data/train/<name>.json``.

    Parameters
    ----------
    name : str
        Document id, i.e. the file name without the ``.json`` extension.

    Returns
    -------
    The object decoded from the JSON file by ``json.load``.
    """
    doc_path = os.path.join('data/train', name + '.json')
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, which is not UTF-8 on every system.
    with open(doc_path, encoding='utf-8') as f:
        data = json.load(f)
    return data

def text_cleaning(text):
    """Normalise free text into lower-case, space-separated tokens.

    Steps, in order:
      1. whitespace-delimited tokens containing 'http' or 'www' become
         'specialwebsite';
      2. bracketed numeric references such as ``[12]`` become
         'specialreference';
      3. every run of non-alphanumeric characters becomes one space;
      4. four-digit sequences starting with 19 or 20 become 'specialyear';
      5. remaining digits are removed;
      6. whitespace is collapsed and trimmed, and the text is lower-cased.

    The website and reference substitutions run before the punctuation is
    stripped: once brackets, dots and slashes are gone those patterns can
    no longer be recognised.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text without leading/trailing spaces.
    """
    text = str(text)

    # Replace websites while the URL is still a single token
    text = ' '.join('specialwebsite' if 'http' in t or 'www' in t else t
                    for t in text.split())

    # Replace numeric citation markers like [12] while the brackets exist
    text = re.sub(r'\[[0-9]+\]', ' specialreference ', text)

    # remove unnecessary literals
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)

    # Remove years
    text = re.sub(r'(19|20)[0-9][0-9]', ' specialyear ', text)

    # remove other digits
    text = re.sub(r'\d+', ' ', text)

    # remove extra spaces (including the ones the substitutions introduced)
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

## Extract Information

In [5]:
# Module-level accumulators filled while the documents are processed.

# Sentence pools (not written to by process_doc).
pos_sentences, neg_sentences = [], []

# One entry per processed document: section indices, document-wide sentence
# indices and cleaned labels of every label hit.
doc_label_section_idx, doc_label_sentence_idx, doc_label_list = [], [], []

# One entry per processed document: title of the section holding the first
# label hit, and that label.
first_label_sec_name, first_labels = [], []

# One entry per processed document: number of sections.
n_secs = []

def process_doc(doc_id):
    """Locate every known label in one document and record where it occurs.

    The document is loaded with ``load_train_example_by_name`` and each
    section's ``'text'`` is split into sentences with ``sent_tokenize``.
    Every sentence is cleaned with ``text_cleaning`` and tested against each
    entry of the module-level ``existing_labels``.

    Results are appended to module-level lists (nothing is returned):
      * ``n_secs``                 - number of sections in the document;
      * ``doc_label_section_idx``  - list of section indices, one per hit;
      * ``doc_label_sentence_idx`` - list of document-wide sentence indices,
                                     one per hit;
      * ``doc_label_list``         - list of matched labels, one per hit;
      * ``first_label_sec_name`` / ``first_labels`` - section title and label
        of the first hit, or ``'NOT FOUND'`` for both when nothing matched.

    Parameters
    ----------
    doc_id : str
        Document id passed on to ``load_train_example_by_name``.
    """
    doc_json = load_train_example_by_name(doc_id)
    this_doc_label_section_idx = []
    this_doc_label_sentence_idx = []
    this_doc_label_list = []
    i_doc_sent = -1  # sentence counter running across all sections
    doc_first_label = True
    n_secs.append(len(doc_json))

    for i_sec, section in enumerate(doc_json):
        for sentence in sent_tokenize(section['text']):
            i_doc_sent += 1
            clean_sentence = text_cleaning(sentence)

            for clean_label in existing_labels:
                if clean_label not in clean_sentence:
                    continue

                if doc_first_label:
                    first_label_sec_name.append(section['section_title'])
                    first_labels.append(clean_label)
                    doc_first_label = False

                this_doc_label_section_idx.append(i_sec)
                this_doc_label_sentence_idx.append(i_doc_sent)
                this_doc_label_list.append(clean_label)
                # Drop the matched text so labels tried later cannot match
                # inside it again (this relies on existing_labels being
                # ordered longest-first).
                clean_sentence = clean_sentence.replace(clean_label, '')

    doc_label_section_idx.append(this_doc_label_section_idx)
    doc_label_sentence_idx.append(this_doc_label_sentence_idx)
    doc_label_list.append(this_doc_label_list)
    if doc_first_label:
        # Keep the per-document lists aligned when no label was found.
        first_label_sec_name.append('NOT FOUND')
        first_labels.append('NOT FOUND')

In [7]:
# Run process_doc over every document id; it appends its results to the
# module-level accumulator lists, so re-running this cell appends again.
for doc_id in tqdm(docIdx):
    process_doc(doc_id)

100%|██████████| 14316/14316 [08:34<00:00, 27.82it/s]
pos size: 0
neg size: 0



## Get Section Title Info

In [81]:
def process_sec_name(text):
    """Normalise a section title for comparison.

    Runs of non-letter characters become a single space, tokens of length
    one are dropped, and the result is lower-cased.

    Parameters
    ----------
    text : any
        Section title; it is converted with ``str()`` first.

    Returns
    -------
    str
        The normalised title; an empty string when no token of two or more
        letters remains.
    """
    text = re.sub(r'[^A-Za-z]+', ' ', str(text)).strip() # remove unnecessary literals

    # remove extra spaces (raw string: "\s" is an invalid escape otherwise)
    text = re.sub(r"\s+", " ", text)

    # drop single-letter tokens
    text = ' '.join([t for t in text.split(' ') if len(t) > 1])

    return text.lower()

# Count how often each raw section title held a document's first label hit.
title_counts = pd.Series(first_label_sec_name).value_counts()
section_order = title_counts.to_frame().reset_index()
section_order.columns = ['sec_name', 'cnt']

# Normalise the titles, then merge the rows whose normalised titles coincide.
section_order['sec_name'] = section_order['sec_name'].apply(process_sec_name)
section_order = section_order.groupby('sec_name')['cnt'].sum().to_frame().reset_index()
section_order.columns = ['sec_name', 'cnt']

# Titles that normalised to an empty string carry no information.
has_name = section_order['sec_name'].str.len() > 0
section_order = section_order.loc[has_name]

# Consolidate entries that contain 'data' or 'sample' into one 'data' row.
mentions_data = section_order['sec_name'].str.contains('data')
mentions_sample = section_order['sec_name'].str.contains('sample')
sec_cons = mentions_data | mentions_sample
count_sum_data = section_order.loc[sec_cons, 'cnt'].sum()
section_order = section_order.loc[~sec_cons].reset_index(drop = True)
section_order.loc[len(section_order)] = ['data', count_sum_data]

# Keep only titles seen more than 10 times, most frequent first.
frequent = section_order['cnt'] > 10
section_order = section_order.loc[frequent]
section_order = section_order.sort_values(by = 'cnt', ascending = False).reset_index(drop = True)

section_order.to_csv('data/section_order.csv', index = False)

In [82]:
# Display the consolidated section-title counts computed above.
section_order

Unnamed: 0,sec_name,cnt
0,introduction,2176
1,data,1784
2,abstract,1656
3,discussion,469
4,methods,238
5,background,128
6,materials and methods,109
7,participants,101
8,results,95
9,subjects,70


## Sort Sections

In [84]:
def sort_doc_sections(doc_secs):
    """Order a document's sections by how label-rich their titles tend to be.

    Each section title is normalised with ``process_sec_name`` and compared
    with the names in the module-level ``section_order`` table. A table row
    matches when its name contains the title or the title contains its name;
    the section's score is the sum of ``cnt`` over all matching rows.

    Parameters
    ----------
    doc_secs : list of dict
        Sections; every dict must have the field ``'section_title'``. Each
        dict is modified in place: a ``'score'`` entry is added/overwritten.

    Returns
    -------
    list of dict
        A new list holding the same dicts, highest score first.
    """
    known_names = section_order.sec_name

    for sec in doc_secs:
        section_title = process_sec_name(sec['section_title'])

        if not section_title:
            # An empty title is a substring of every known name, so it would
            # collect the sum of all counts and be ranked first. Treat it as
            # unmatched instead.
            sec['score'] = 0
            continue

        # Plain substring tests in both directions; regex=False keeps the
        # title from being interpreted as a regular expression.
        matches = known_names.str.contains(section_title, regex=False) | \
                  known_names.apply(lambda name: name in section_title)

        # sum scores of all matches (0 when nothing matched)
        sec['score'] = section_order.loc[matches, 'cnt'].sum()

    return sorted(doc_secs, key = lambda x: x['score'], reverse= True)

[{'section_title': 'Introduction', 'score': 2176},
 {'section_title': 'Data Analysis and Statistics', 'score': 1784},
 {'section_title': 'Abstract', 'score': 1656},
 {'section_title': 'Discussion', 'score': 493},
 {'section_title': 'Results', 'score': 131},
 {'section_title': 'Characteristics of Participants Survey at 3-months N (%)',
  'score': 101},
 {'section_title': 'Conclusion', 'score': 76},
 {'section_title': 'Methodology', 'score': 61},
 {'section_title': 'Study Design', 'score': 15},
 {'section_title': 'English and ICT', 'score': 0},
 {'section_title': 'Lebanon', 'score': 0},
 {'section_title': 'Gender Inequality in Lebanon', 'score': 0},
 {'section_title': "Lebanon's ICT Sector", 'score': 0},
 {'section_title': 'DOT Lebanon ICT Training Program', 'score': 0},
 {'section_title': 'Study Procedure', 'score': 0},
 {'section_title': 'Limitations', 'score': 0},
 {'section_title': 'Income Generation Opportunity Status', 'score': 0},
 {'section_title': 'Gender Inequality', 'score': 0

In [76]:
# Build a titles-only view of the first training document as demo input for
# sort_doc_sections.
doc_json = [
    {'section_title': section['section_title']}
    for section in load_train_example_by_name(train_example_names[0])
]

In [85]:
# Rank the demo sections; the call also writes a 'score' key into each dict of doc_json.
sort_doc_sections(doc_json)

[{'section_title': 'Introduction', 'score': 2176},
 {'section_title': 'Data Analysis and Statistics', 'score': 1784},
 {'section_title': 'Abstract', 'score': 1656},
 {'section_title': 'Discussion', 'score': 493},
 {'section_title': 'Results', 'score': 131},
 {'section_title': 'Characteristics of Participants Survey at 3-months N (%)',
  'score': 101},
 {'section_title': 'Conclusion', 'score': 76},
 {'section_title': 'Methodology', 'score': 61},
 {'section_title': 'Study Design', 'score': 15},
 {'section_title': 'English and ICT', 'score': 0},
 {'section_title': 'Lebanon', 'score': 0},
 {'section_title': 'Gender Inequality in Lebanon', 'score': 0},
 {'section_title': "Lebanon's ICT Sector", 'score': 0},
 {'section_title': 'DOT Lebanon ICT Training Program', 'score': 0},
 {'section_title': 'Study Procedure', 'score': 0},
 {'section_title': 'Limitations', 'score': 0},
 {'section_title': 'Income Generation Opportunity Status', 'score': 0},
 {'section_title': 'Gender Inequality', 'score': 0