In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize
#from fuzzywuzzy import fuzz

# Per-document JSON files of each split. Sorting fixes the document order, which
# otherwise depends on the file system and changes every result derived from it.
train_example_paths = sorted(glob.glob('data/train/*.json'))
test_example_paths = sorted(glob.glob('data/test/*.json'))

# Document ids are the file names without the ".json" extension. Taking them
# from the globbed paths (instead of a raw os.listdir) skips stray non-JSON
# entries such as ".ipynb_checkpoints", and splitext only strips the final
# extension, so an id that itself contains a dot is kept intact.
train_example_names = [
    os.path.splitext(os.path.basename(path))[0] for path in train_example_paths
]
test_example_names = [
    os.path.splitext(os.path.basename(path))[0] for path in test_example_paths
]

# Label metadata for the training documents (read later for the known dataset labels).
metadata = pd.read_csv('data/train.csv')

# Ids of the documents to process; a copy, so changing it leaves train_example_names alone.
docIdx = train_example_names.copy()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozano\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def load_train_example_by_name(name):
    """Load one training document from ``data/train/<name>.json``.

    Args:
        name: Document id, i.e. the JSON file name without its extension.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    doc_path = os.path.join('data/train', name + '.json')
    # JSON text is UTF-8; without an explicit encoding open() uses the locale
    # default (e.g. cp1252 on Windows) and can fail on or garble non-ASCII text.
    with open(doc_path, encoding='utf-8') as f:
        return json.load(f)

## Text cleaning helpers and known dataset labels

In [3]:
# Basic text cleaning

# Note that I removed lower from the original code here
def text_cleaning(text):
    """Reduce ``text`` to ASCII letters and digits separated by single spaces.

    Case is left untouched; callers lowercase the result themselves where
    they need case-insensitive matching.

    Args:
        text: Input string.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Collapse every whitespace run to one space. The pattern is a raw string:
    # "\s" in a plain literal is an invalid escape sequence and raises a warning.
    text = re.sub(r"\s+", " ", text)
    # Delete ASCII punctuation outright, so "data-set" becomes "dataset"
    # rather than "data set".
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Any remaining run of characters outside A-Z, a-z, 0-9 becomes one space.
    return re.sub('[^A-Za-z0-9]+', ' ', text).strip()

def text_preprocess(text):
    """Swap numeric citation markers and years for placeholder tokens.

    A bracketed number such as ``[12]`` becomes ``specialreference``. Any
    ``19`` or ``20`` followed by two digits becomes ``specialyear``, including
    when those four digits sit inside a longer number.
    """
    substitutions = (
        (r'\[[0-9]+]', 'specialreference'),     # citation markers like [7]
        (r'(19|20)[0-9][0-9]', 'specialyear'),  # four-digit years 1900-2099
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)
    return text

In [4]:
import string

##### STEP 1: Make a list of the known labels provided to us

# Every spelling of a dataset name found in the metadata, cleaned with the same
# function as the sentences and lowercased so matching is case-insensitive.
# Building a set removes duplicates across the three columns.
label_columns = ['dataset_label', 'dataset_title', 'cleaned_label']
existing_labels = {
    text_cleaning(raw_label).lower()
    for column in label_columns
    for raw_label in metadata[column]
}

# An empty label is a substring of every sentence and would mark all of them
# as containing a dataset mention.
existing_labels.discard('')

# Sort labels by length in descending order so a long name is matched before
# any shorter label it contains. Ties are broken alphabetically: relying on set
# iteration order made the order of equal-length labels differ between runs.
existing_labels = sorted(existing_labels, key=lambda label: (-len(label), label))

In [5]:
# Module-level accumulators filled by process_doc: cleaned sentences that
# contained a known label (stored with the label text removed) and cleaned
# sentences that contained none.
pos_sentences = []
neg_sentences = []

def process_doc(doc_id):
    """Split one training document into sentences and sort them into the two lists.

    Each sentence is preprocessed, cleaned and lowercased, then checked for
    every entry of ``existing_labels`` as a plain substring. Matching sentences
    are appended to ``pos_sentences`` with all matched labels stripped out; the
    rest go to ``neg_sentences``.

    Args:
        doc_id: Document id accepted by ``load_train_example_by_name``.

    Returns:
        None. Results are appended to the module-level lists.
    """
    doc_json = load_train_example_by_name(doc_id)
    doc_text = ' '.join(sec['text'] for sec in doc_json)

    # Tokenize sentencewise
    for sentence in sent_tokenize(doc_text):
        clean_sentence = text_cleaning(text_preprocess(sentence)).lower()

        has_label = False
        # Labels are checked in list order, and each removal changes the text
        # that later labels are matched against.
        for clean_label in existing_labels:
            if clean_label in clean_sentence:
                has_label = True

                # Remove label from the text, or model will overfit
                clean_sentence = clean_sentence.replace(clean_label, '')

        if has_label:
            pos_sentences.append(clean_sentence)
        else:
            neg_sentences.append(clean_sentence)

#get_doc(docIdx[0])[0]

## Create Dataset for All Documents

In [6]:
# process_doc appends to the module-level lists, so empty them first: running
# this cell a second time would otherwise add every sentence again.
pos_sentences.clear()
neg_sentences.clear()

for doc_id in tqdm(docIdx):
    process_doc(doc_id)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

100%|██████████| 14316/14316 [05:01<00:00, 47.44it/s]pos size: 55891
neg size: 4123030



## Save Dataset

In [7]:
import pickle

# Create the output folder on first use; opening a file for writing does not
# create missing parent directories.
os.makedirs('data/sentence_classification_data', exist_ok=True)

# Persist both sentence lists so later sessions can skip the slow document pass.
with open('data/sentence_classification_data/pos.pkl', 'wb') as f:
    pickle.dump(pos_sentences, f)

with open('data/sentence_classification_data/neg.pkl', 'wb') as f:
    pickle.dump(neg_sentences, f)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

## Load Dataset

In [None]:
import pickle

# Read the two pickled sentence lists back into memory.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open(f'data/sentence_classification_data/pos.pkl', 'rb') as pos_file:
    pos_sentences = pickle.load(pos_file)

with open(f'data/sentence_classification_data/neg.pkl', 'rb') as neg_file:
    neg_sentences = pickle.load(neg_file)

# Report how many sentences each list holds.
print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

## Create Dataset (TF-IDF features and labels)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

print('Creating TF-IDF vectorizer...')
# Positive sentences come first, negatives after; the label vector below
# depends on this order.
all_sentences = pos_sentences + neg_sentences
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the matrix in a single pass
# instead of tokenizing the whole corpus twice (fit, then transform).
X = vectorizer.fit_transform(all_sentences)

print('Creating data arrays...')
# Label 1 = sentence contained a known dataset label, 0 = it did not.
# The original assigned 1 to the slice *after* the positives, which marked
# every negative sentence as class 1 and every positive as class 0.
# NOTE(review): anything that consumed the old, inverted labels must be re-run.
y = np.zeros(len(all_sentences))
y[:len(pos_sentences)] = 1

Creating TF-IDF vectorizer...
Creating data arrays...


In [11]:
from sklearn.model_selection import train_test_split

print('Splitting data...')
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Bundle both splits into one dict of (features, labels) pairs.
data = dict(
    train_data=(X_train, y_train),
    val_data=(X_val, y_val),
)



Splitting data...


## Train Model

In [None]:
from mlmodels import sklearn_model

from mlmodels.search.hparameters import lr_params

# NOTE(review): delete_file is neither defined nor imported in this notebook, so
# this line raises NameError on a fresh kernel. Presumably it removes results of
# an earlier search — confirm where it lives and import it.
delete_file('trials_sklearn')

model = sklearn_model.SklearnModel1(minimize_metric=False)

# Use lr_params, the hyper-parameter module imported for this cell. The
# original referenced crf_params, which is never imported here and raised
# NameError. Assumes lr_params exposes search_space / search_fixed like
# crf_params did — TODO confirm.
res = model.search(data, lr_params.search_space, lr_params.search_fixed, num_iter=25)

# Merge the fixed settings into the best parameters found by the search.
best_hparams = res['best_params']
best_hparams.update(lr_params.search_fixed)