In [2]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import json
import glob
from copy import deepcopy
from collections import defaultdict
from functools import partial

import pandas as pd
import numpy as np

from nltk import sent_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

import unidecode

from tqdm.notebook import tqdm
import string

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

%matplotlib inline

# Root folder for all raw inputs. On Kaggle, switch to the commented-out value.
# input_path = '/kaggle/input/'
input_path = 'data/raw_data/'
# Competition data sits in its own sub-folder; the trailing slash is kept so
# file and folder names can be appended directly.
basepath = f'{input_path}coleridgeinitiative-show-us-the-data/'

def clean_text(txt):
    """Lower-case every string in ``txt`` and replace each run of characters
    other than ASCII letters and digits with a single space.

    Returns a new list holding one cleaned string per element of ``txt``.
    """
    cleaned = []
    for piece in txt:
        lowered = piece.lower()
        cleaned.append(re.sub('[^A-Za-z0-9]+', ' ', lowered))
    return cleaned

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Competition metadata: the labelled training table and the submission template.
train_df = pd.read_csv(f'{basepath}train.csv')
sample_sub = pd.read_csv(f'{basepath}sample_submission.csv')

In [4]:
# Quick look at the first five rows (head() defaults to n=5).
train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [5]:
# Folders holding the per-publication JSON files (named by Id) for each split.
train_files_path = f'{basepath}train/'
test_files_path = f'{basepath}test/'
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """Read one publication's JSON file and return its content as sentences.

    The file is expected to hold a list of section objects, each with a
    ``section_title`` and a ``text`` field. Every requested field is
    transliterated to ASCII with ``unidecode`` and split with ``sent_tokenize``.

    Parameters
    ----------
    filename : str
        File name without the ``.json`` extension.
    train_files_path : str
        Directory that contains the file. The default is whatever
        ``train_files_path`` was bound to when this function was defined.
    output : str
        ``'text'`` returns only sentences from the section bodies, ``'head'``
        only sentences from the section titles; any other value returns both,
        with each section's title sentences preceding its body sentences.

    Returns
    -------
    list of str
        Sentences in document order.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # JSON is normally UTF-8; being explicit avoids depending on the platform's
    # default text encoding (which is not UTF-8 everywhere).
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    want_titles = output != 'text'
    want_bodies = output != 'head'

    sentences = []
    for section in sections:
        # Tokenise only the fields that will be returned, and each only once:
        # sentence splitting dominates the cost of this function.
        # `or ''` lets a missing or null field contribute no sentences instead
        # of crashing unidecode.
        if want_titles:
            title = unidecode.unidecode(section.get('section_title') or '')
            sentences.extend(sent_tokenize(title))
        if want_bodies:
            body = unidecode.unidecode(section.get('text') or '')
            sentences.extend(sent_tokenize(body))
    return sentences

In [6]:
%%time
tqdm.pandas()   # registers `progress_apply` on pandas objects so the slow per-file read below shows a progress bar
# One list of sentences per publication, loaded from <Id>.json in the training folder.
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

  0%|          | 0/19661 [00:00<?, ?it/s]

Wall time: 16min 7s


In [7]:
%%time
tqdm.pandas()
# Same loader as for the training set, pointed at the test folder via `partial`.
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_files_path))

  0%|          | 0/4 [00:00<?, ?it/s]

Wall time: 420 ms


In [8]:
# Normalise every sentence of both corpora with clean_text; train first, then test.
for frame in (train_df, sample_sub):
    frame['cleaned_text'] = frame['text'].progress_apply(clean_text)

  0%|          | 0/19661 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Extra dataset titles from gov_data.csv, turned into word-bounded regex
# alternatives in the same cleaned form as the document text.
MAX_ADDITIONAL_TITLES = 500  # only the first rows of the file are used

# Built from input_path so that switching the data root moves this file too.
gov_titles = pd.read_csv(input_path + 'additional_data/gov_data.csv')
# Missing titles are dropped up front; len() and .lower() would fail on NaN.
raw_titles = sorted(
    gov_titles.iloc[:MAX_ADDITIONAL_TITLES, :].title.dropna().astype(str),
    key=len,
)
# Strip after cleaning: r'\b' next to a space only matches when a word
# character sits on the other side of that space, so an unstripped title that
# ended in punctuation would be missed at the end of a sentence.
cleaned_titles = [
    re.sub('[^A-Za-z0-9]+', ' ', title.lower()).strip() for title in raw_titles
]
# Titles that clean down to nothing are skipped: r'\b\b' matches the empty
# string at every word boundary and would flood the results.
additional_title = [r'\b' + title + r'\b' for title in cleaned_titles if title]

In [10]:
# One word-bounded pattern per distinct training label. Surrounding whitespace
# is stripped first: r'\b' next to a space only matches when a word character
# sits on the other side of that space, so 'label ' would be missed whenever
# the label is followed by punctuation or ends a sentence.
labels_list = [
    r'\b' + label.strip() + r'\b'
    for label in train_df.cleaned_label.unique()
    if isinstance(label, str) and label.strip()
]
# Regex alternation is ordered: the first alternative that matches wins, even
# when a later one would match more text. Longest patterns therefore go first,
# so a short label cannot shadow a longer label that it is a prefix of.
# dict.fromkeys drops duplicates while keeping a deterministic order.
labels_pattern = '|'.join(
    sorted(dict.fromkeys(labels_list + additional_title), key=len, reverse=True)
)

In [11]:
# Compile once; the alternation is large and is applied to every document.
labels_regex = re.compile(labels_pattern)


def _matched_labels(sentences):
    """Return the distinct label strings found in one document.

    ``sentences`` is the document's list of cleaned sentences; they are joined
    with single spaces before matching. The result is ordered by length, with
    ties broken alphabetically so the order is the same on every run.
    """
    document = ' '.join(sentences)
    return sorted(set(labels_regex.findall(document)), key=lambda s: (len(s), s))


def _drop_nested(found):
    """Drop every label that is a substring of a later entry of ``found``.

    ``found`` must be ordered by length (shortest first), so "later" means
    "at least as long".
    """
    return [
        label
        for i, label in enumerate(found)
        if not any(label in longer for longer in found[i + 1:])
    ]


# Training documents: keep only the outermost labels, sorted alphabetically.
labels = [
    sorted(_drop_nested(_matched_labels(text)))
    for text in tqdm(train_df.cleaned_text)
]

# Test documents: every distinct match, shortest first.
# NOTE(review): unlike the training labels, nested matches are not removed
# here; confirm that this difference is intended.
sub_labels = [_matched_labels(text) for text in tqdm(sample_sub.cleaned_text)]

  0%|          | 0/19661 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Attach to each training document the de-nested, alphabetically sorted labels found in its text.
train_df['complete_lower_labels']=labels  

In [18]:
def get_text_labels(row):
    """Label every sentence of one document with the dataset names it mentions.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'cleaned_text'`` (list of cleaned sentences) and
        ``'complete_lower_labels'`` (label strings found in the document).

    Returns
    -------
    list of str
        One entry per sentence: the matching labels joined by ``'|'`` in the
        order they appear in ``complete_lower_labels``, or ``''`` when the
        sentence mentions none of them.
    """
    # Compile once per document rather than once per sentence.
    # Matching is whole-word, like the document-level search that produced the
    # labels; a bare substring test would let a short label such as 'nces'
    # fire inside 'sciences'.
    matchers = []
    for label in row['complete_lower_labels']:
        core = label.strip()
        if not core:
            # A blank label would match every sentence.
            continue
        matchers.append((label, re.compile(r'\b' + re.escape(core) + r'\b')))

    return [
        '|'.join(label for label, pattern in matchers if pattern.search(sentence))
        for sentence in row['cleaned_text']
    ]

In [20]:
# Row-wise pass (axis=1): each document gets a list with one '|'-joined label string per sentence.
train_df['text_labels']=train_df.progress_apply(get_text_labels,axis=1)

  0%|          | 0/19661 [00:00<?, ?it/s]

In [21]:
# Flatten the per-document sentence lists into one sentence-level training set.
training_texts = []
for doc_sentences in train_df.cleaned_text:
    for sentence in doc_sentences:
        training_texts.append(sentence.strip())

# text_labels holds one entry per sentence, so flattening it in the same
# document order keeps each label string aligned with its sentence.
training_labels = []
for doc_labels in train_df['text_labels']:
    training_labels.extend(doc_labels)

# Test sentences stay grouped by document (one inner list per submission row).
test_texts = []
for doc_sentences in sample_sub.cleaned_text:
    test_texts.append([sentence.strip() for sentence in doc_sentences])

In [22]:
# Sentence-level training table: one row per (sentence, label string) pair.
train_rows = list(zip(training_texts, training_labels))
processed_train_df = pd.DataFrame(train_rows, columns=['text', 'label'])

# The test table starts with one row per document (a list of sentences plus
# its Id) and is then exploded to one row per sentence.
test_rows = list(zip(test_texts, sample_sub.Id))
processed_test_df = pd.DataFrame(test_rows, columns=['text', 'Id']).explode('text')

# Persist both tables without the index column.
processed_train_df.to_csv('processed_train_df.csv', index=False)
processed_test_df.to_csv('processed_test_df.csv', index=False)