In [1]:
import json
import pandas as pd
import numpy as np
import glob
import os
import re
from tqdm import tqdm

# Ids of the test documents: the basenames of the ``*.json`` files in data/test.
# - Only ``.json`` entries are kept, so stray entries (e.g. hidden checkpoint
#   directories) do not turn into bogus ids such as ''.
# - ``os.path.splitext`` strips just the final extension, so a name that itself
#   contains dots is not truncated at the first dot.
# - Sorting makes the processing order independent of the directory listing order.
test_example_names = sorted(
    os.path.splitext(fn)[0]
    for fn in os.listdir('data/test')
    if fn.endswith('.json')
)

### Utils

In [2]:
def load_test_example_by_name(name):
    """Load one test document from ``data/test/<name>.json``.

    Args:
        name: File name without the ``.json`` extension.

    Returns:
        The decoded JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    doc_path = os.path.join('data/test', name + '.json')
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(doc_path, encoding='utf-8') as f:
        return json.load(f)

def delete_file(filename):
    """Remove ``filename`` if it currently exists; otherwise do nothing."""
    if not os.path.exists(filename):
        return
    os.remove(filename)

### Feature Extraction Class

In [3]:
import re

# Pre-compiled pattern matching any run of one or more whitespace characters;
# used by make_single_whitespace to collapse such runs.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

def get_features(s):
    """Pair every token with its target, row by row.

    Args:
        s: Object exposing ``s['TOKEN']`` and ``s['TARGET']``, each with
            ``.values.tolist()`` (e.g. a pandas DataFrame).

    Returns:
        A list of ``(token, target)`` tuples in row order.
    """
    tokens = s['TOKEN'].values.tolist()
    targets = s['TARGET'].values.tolist()
    return list(zip(tokens, targets))

import string
# Every character of string.punctuation as its own list element.
puncs = list(string.punctuation)

def mask_numbers(text):
    """Mask every run of ASCII digits in ``text`` with '#' characters.

    Each run of digits is replaced by the same number of '#' characters,
    padded with one space on either side (so ``'abc123'`` becomes
    ``'abc ### '``).

    Args:
        text: The string to mask.

    Returns:
        The masked string.
    """
    def _hashes_for(match):
        # One '#' per digit in the matched run, surrounded by single spaces.
        run_length = len(match.group())
        return " " + "#" * run_length + " "

    return re.sub(r'[0-9]+', _hashes_for, text)

def make_single_whitespace(text):
    """Collapse each whitespace run in ``text`` to one space and trim both ends."""
    collapsed = _RE_COMBINE_WHITESPACE.sub(" ", text)
    return collapsed.strip()

class TextFeatureExtractor:
    """Converts token tables into per-token feature dictionaries.

    ``transform`` handles a single document at inference time and
    ``fit_transform`` builds (features, labels) pairs for the train and
    validation file sets.
    """

    def __init__(self, *args, **kwargs):
        """Create the extractor.

        Positional and keyword arguments are accepted for compatibility, but
        no tunable parameter is currently defined (``def_args`` is empty), so
        none of them is stored.
        """
        # Supported parameters and their defaults; currently none.
        def_args = dict()

        # Copy each supported parameter onto the instance, using the default
        # when the caller did not pass it.
        for k, def_val in def_args.items():
            setattr(self, k, kwargs.get(k, def_val))

    def transform(self, x):
        """Replace ``x['output']`` with a one-element list of document features.

        Args:
            x: Dict whose ``'output'`` entry is a table with a ``TOKEN``
                column (it is passed to ``_df2features``).

        Returns:
            The same dict object ``x``, updated in place, with ``x['output']``
            set to ``[features_of_the_document]``.
        """
        x.update({'output': [_df2features(x['output'])]})
        return x

    def _mask_numbers(self, text):
        """Mask digit runs in ``text``; thin wrapper around ``mask_numbers``."""
        # Delegate so the masking rule is defined in exactly one place.
        return mask_numbers(text)

    def fit_transform(self, data, train_filenames, val_filenames):
        """Build features and labels for the train and validation sets.

        Args:
            data: Mapping from a file name to that file's token table, which
                must provide ``TOKEN`` and ``TARGET`` columns (presumably a
                dict of pandas DataFrames; it only needs ``data[f].copy()``
                to return such a table).
            train_filenames: Keys of ``data`` that form the training set.
            val_filenames: Keys of ``data`` that form the validation set.

        Returns:
            Dict with keys ``'train_data'`` and ``'val_data'``; each value is
            a tuple ``(X, y)`` where ``X`` holds one feature list per document
            and ``y`` the matching label list.

        Raises:
            AssertionError: If any ``TOKEN`` value is null.
        """
        self.train_filenames = train_filenames
        self.val_filenames = val_filenames

        # Explicit mapping instead of looking attributes up by name in
        # self.__dict__.
        filenames_by_set = {'train': train_filenames, 'val': val_filenames}
        output = {}

        # Process each set
        for setname, filenames in filenames_by_set.items():
            docs = []
            for f in tqdm(filenames):
                # Work on a copy: the cleaning below overwrites columns, and
                # the caller's tables in ``data`` must keep their raw values.
                df_slice = data[f].copy()

                assert not df_slice['TOKEN'].isnull().any(), 'All tokens must have a value'
                df_slice['TARGET'] = df_slice['TARGET'].fillna('OTHER')
                df_slice['TOKEN'] = df_slice['TOKEN'].values.astype('U')
                df_slice['TOKEN'] = df_slice['TOKEN'].apply(mask_numbers)
                df_slice['TOKEN'] = df_slice['TOKEN'].apply(make_single_whitespace)

                docs.append(get_features(df_slice))

            X = [_doc2features(s) for s in tqdm(docs)]
            y = [_doc2labels(s) for s in tqdm(docs)]

            assert len(X) == len(y)

            output[f'{setname}_data'] = (X, y)

        return output

class DocGetter(object):
    """Groups a token table by ``FILENAME`` into per-file (token, target) lists.

    Attributes set by the constructor:
        n_doc: Initialised to 1.
        data: The table passed in.
        empty: Initialised to False.
        grouped: Result of applying the pairing function to each
            ``FILENAME`` group.
        docs: The values of ``grouped`` as a plain list.
    """

    def __init__(self, data):
        def pair_tokens_with_targets(group):
            # (token, target) tuples for the rows of one file, in row order.
            tokens = group['TOKEN'].values.tolist()
            targets = group['TARGET'].values.tolist()
            return list(zip(tokens, targets))

        self.n_doc = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby('FILENAME').apply(pair_tokens_with_targets)
        self.docs = list(self.grouped)

def _text2features(text):
    """Returns a list of examples.

    Splits ``text`` into tokens and builds one feature dict per token.

    The split uses the pattern ``\\w+|[^\\w\\s]+`` (runs of word characters, or
    runs of punctuation), which is the rule behind NLTK's
    ``wordpunct_tokenize`` that this function originally referenced, so no
    extra dependency is needed.

    NOTE(review): unlike ``_df2features``, no number masking or whitespace
    normalisation is applied here -- confirm whether callers expect it.

    Args:
        text: Raw text of one document.

    Returns:
        A list with one feature dict per token (empty for empty text).
    """
    tokens = re.findall(r'\w+|[^\w\s]+', text)
    # Wrap each token in a 1-tuple, the entry shape _doc2features also passes
    # to _word2features.
    words = [(tok,) for tok in tokens]
    return [_word2features(words, i) for i in range(len(words))]

def _df2features(df):
    """Returns a list of examples.

    Cleans the ``TOKEN`` column of ``df`` (cast to str, digit runs masked,
    whitespace collapsed) and builds one feature dict per row.

    The cleaning is done on values extracted from ``df``; the caller's table
    is left untouched.

    Args:
        df: Table with a ``TOKEN`` column (must support ``df.columns`` and
            ``df['TOKEN'].values``).

    Returns:
        A list with one feature dict per row of ``df``, in row order.

    Raises:
        AssertionError: If ``df`` has no ``TOKEN`` column.
    """
    assert 'TOKEN' in df.columns

    # Same unicode cast as before, but on a detached list instead of
    # assigning back into ``df``.
    raw_tokens = df['TOKEN'].values.astype('U').tolist()
    words = [(make_single_whitespace(mask_numbers(tok)),) for tok in raw_tokens]

    return _doc2features(words)

def _word2features(words, i):
    word = words[i]
    if not isinstance(word, str):
        word = word[0]
    
    # isfirstname = fe_isname.is_firstname(word)
    # islastname = fe_isname.is_lastname(word)
    # isname = isfirstname or islastname
    
    # if isfirstname and islastname:
    #     word = '#NAME#'
    # elif isfirstname:
    #     word = '#FIRSTNAME#'
    # elif islastname:
    #     word = '#LASTNAME#'

    digit_count = sum(c=='#' for c in word)
    length = len(word)
    assert length > 0, "All tokens must have length > 0"

    features = {
        #'bias': 1.0,
        #'word.index': i
    }
    
    # Add line & word indices
    # features.update({
    #     'word.lineindex': li,
    #     'word.wordindex': wi,
        
    # })
    
    # if isname:
    #     features.update({
    #     'word.lower()': word
        
    # })
    #else:
    # If all digits
    if word.isdigit():
        features.update({
            'isd': True,
            'dct': digit_count,
            '4dg': digit_count == 4,
            'dgr': 1.0,
            'len': length
        })
    else: # Not all digit
        features.update({
            'isd': False,
            'dgr': digit_count / length,
            'dct': digit_count,
            'len': length,
            'wor': word
        })#'pnc': np.mean(np.array([c in puncs for c in word]))

    if i > 0:
        word_other = words[i-1][0]
        features.update({
            '-1': word_other,
            #'-1:word.isupper()': word1.isupper()
        })
        if i > 1:
            word_other = words[i-2][0]
            features.update({
                '-2': word_other,
                #'-2:word.isupper()': word2.isupper()
            })
            if i > 2:
                word_other = words[i-3][0]
                features.update({
                    '-3': word_other,
                    #'-3:word.isupper()': word_other.isupper()
                })
                if i > 3:
                    word_other = words[i-4][0]
                    features.update({
                        '-4': word_other,
                        #'-4:word.isupper()': word_other.isupper()
                    })
                    if i > 4:
                        word_other = words[i-5][0]
                        features.update({
                            '-5': word_other
                        })
                        if i > 5:
                            word_other = words[i-6][0]
                            features.update({
                                '-6': word_other
                            })
                            
    else:
        features['BOS'] = True
    if i < len(words)-1:
        word_other = words[i+1][0]
        features.update({
            '+1':  word_other,
            #'+1:word.isupper()': word1.isupper()
        })
        if i < len(words)-2:
            word_other = words[i+2][0]
            features.update({
                '+2':  word_other,
                #'+2:word.isupper()': word2.isupper()
            })
            if i < len(words)-3:
                word_other = words[i+3][0]
                features.update({
                    '+3':  word_other,
                    #'+3:word.isupper()': word_other.isupper()
                })
                
                if i < len(words)-4:
                    word_other = words[i+4][0]
                    features.update({
                        '+4':  word_other,
                        #'+4:word.isupper()': word_other.isupper()
                    })
        
        
    else:
        features['EOS'] = True
    return features

def _doc2features(doc):
    """Returns a list of examples.

    Keeps only the first element of every entry in ``doc`` (wrapped in a
    1-tuple) and builds one feature dict per position with
    ``_word2features``.
    """
    tokens_only = [(entry[0],) for entry in doc]
    return [_word2features(tokens_only, idx) for idx, _ in enumerate(tokens_only)]

def _doc2labels(doc):
    return [s[-1] for s in doc]
def _doc2tokens(doc):
    return [s[0] for s in doc]

### Preprocessing Functions

In [4]:
def preprocess_tokenize_doc(doc_json):
    """Turn one loaded document into a flat list of cleaned tokens.

    The ``'text'`` of every section is cleaned with ``clean_text``, the
    results are joined with spaces, whitespace is collapsed, and the text is
    split on single spaces.

    Args:
        doc_json: Iterable of sections, each indexable by ``'text'``.

    Returns:
        List of token strings; an empty list when the document has no text
        left after cleaning (rather than a list holding one empty token).
    """
    # Iterate the argument itself, not a global that happens to be named `doc`.
    doc_text = ' '.join(clean_text(sec['text']) for sec in doc_json)
    doc_text = make_single_whitespace(doc_text)

    # ''.split(' ') would give [''], i.e. one zero-length token.
    return doc_text.split(' ') if doc_text else []

def clean_text(txt):
    """Lower-case ``str(txt)`` and replace each run of characters other than
    ASCII letters and digits with a single space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

### Load Model

In [6]:
import joblib

# Load the serialized model used for prediction in the submission loop.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading -- only load a 'pipeline_model.joblib' that comes from a trusted source.
model = joblib.load('pipeline_model.joblib')

### Create Submission

In [7]:
# Extractor whose transform() converts each test document's token table into
# the feature structure passed to model.predict.
feature_extractor = TextFeatureExtractor()

In [22]:
# Predict for every test document and collect one prediction string per id.
ids = []
test_preds = []
for test_id in test_example_names:

    # Load the raw document and flatten it into cleaned tokens
    doc = load_test_example_by_name(test_id)
    doc_tokens = preprocess_tokenize_doc(doc)

    # Wrap the tokens in a single-column frame and extract features
    x = feature_extractor.transform({'output': pd.DataFrame({'TOKEN': doc_tokens})})

    # Predict; only the first returned sequence is used, cast to ints
    # (presumably one label per token -- verify against the model)
    pred = np.array([int(p) for p in model.predict(x['output'])[0]])

    # Keep the tokens at the positions predicted as 1
    pred_tokens = [doc_tokens[i] for i, label in enumerate(pred) if label == 1]

    ids.append(test_id)
    test_preds.append(' '.join(pred_tokens))

sub_df = pd.DataFrame(
    {'Id': ids, 'PredictionString': test_preds},
    columns=['Id', 'PredictionString'],
)

In [24]:
# Limit how many rows pandas renders, then display the submission frame.
pd.set_option('display.max_rows', 25)
sub_df

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,the alzheimer s disease neuroimaging initiativ...
1,2f392438-e215-4169-bebf-21ac4ff253e1,the trends in international mathematics and sc...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,noaa storm surge inundation slosh model
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes
