Setting autoreload to automatically pickup changes in local packages imported from `../src` directory 

In [1]:
%load_ext autoreload
%autoreload 2

### Imports

In [2]:
import numpy as np
import pandas as pd
import sys
import ipyplot
sys.path.append("../src")

from df_helpers import (
    create_dataframe_from_files,
    create_dataframe_from_pd,
    read_ocr_from_json,
    clean_text_column
)

## Creating a dataframe

We assume that pdfs have been already converted into images and ocr'd with tesseract.    
Output of all these operations (.PDF/.PNG/.JSON) should ideally be under a single folder in local env.

#### Use code below to create dataframe based on the files present in the directory
```python
df_raw = create_dataframe_from_files(
    data_dir="../../../../datasets/500k/equal_split",
    file_ext='.png',
    first_page_only=True,
#     load_ground_truth="../output/df_5sup_pred.csv",
    dropna=True)
```

#### Use code below to create a datframe based on an existing DF (loaded from csv file)

In [15]:
df = pd.read_csv("../../../datasets/800/MSFT_MVP_Field_Output_Sample800.csv")
df = df.sample(frac=1)
df_raw = create_dataframe_from_pd(
    (
        df[
            df.groupby(['ProviderName'])['FileName']
            .transform('count') >= 201 # select a specific subset of providers with more than 200 invoices
        ]
#         [
#             df.groupby(['ProviderName'])['FileName']
#             .transform('count') > 50 # create a lower boundry for number of invoices per provider
#         ]
        .groupby(['ProviderName'])
        .head(150)
#         .apply(lambda x: x.sample(50, replace=False)) # take N randomly sampled invoices for each provider
    ),
    data_dir="../../../datasets/800/data_all" # point out the directory where files are located
)

Rows count before removing NaN values:  5100
Rows count after removing NaN values:  5100


In [None]:
df_raw.head(1)

### Reading OCR + cleaning/filtering text

In [17]:
# read ocr data from json file for each row
df_ocr = read_ocr_from_json(
    df_raw)
# clean ocr text for each row
df_clean = clean_text_column(
    df_ocr,
    chars_regex="[a-zA-Z]+",
    keep='all',
    min_words_count=20)
# df_clean = df_clean.sample(frac=1)

Removing empty texts:  55
Total rows count:  5045


In [19]:
from text_features import fuzzy_replace
from multiprocess import Pool
from functools import partial

with open("./invoice_vocabulary.txt", 'r') as file:
    invoice_vocabulary = file.read().replace('\n', ' ').split()    

all_text = df_clean['TextClean'].values

# logical=True counts threads, but we are interested in cores
max_pool = 20
pool = Pool(max_pool)
pool_outputs = pool.map(
    partial(
        fuzzy_replace,
        query_list = invoice_vocabulary,
        threshold=85,
        whitelist=True),
    all_text
)
pool.close()
pool.join()
pool.terminate()

df_clean['TextWhite'] = pool_outputs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
df_clean.head(1)

In [21]:
df_clean = df_clean.sample(frac=1)

### Set parameters for clustering/labeling

In [63]:
SELECTED_SUPPLIER = 'new'

#TFIDF
MAX_FEAT = 500
NGRAM_RANGE = (2, 4)

#PCA
USE_PCA = True
PCA_COMP = 30

#Scaler
USE_SCALER = False

### Run cleaned text through TFIDFProcessor to generate features vectors
Depending on params it may or may not use TFIDFVectorizer, PCA and StandardScaler transformers

In [64]:
from text_features import TFIDFProcessor

tfidf_proc = TFIDFProcessor(
    ngram_range=NGRAM_RANGE,
    max_feat=MAX_FEAT,
    use_pca=USE_PCA,
    pca_components=PCA_COMP,
    use_scaler=USE_SCALER,
    verbose=2,
)
# df_clean = df_clean['LayoutType'].values != '-1'
# X = df_clean["TextClean"].values
X = df_clean["TextWhite"].values
# IMG_PATHS = np.asarray([path.replace('\\','/')[3:] for path in df_clean["ImagePath"].values])
IMG_PATHS = df_clean["ImagePath"].values
X_feat = tfidf_proc.fit_pipeline(X)

TFIDF shape: (5045, 500)
PCA shape: (5045, 30)


### Apply clustering with DBSCAN alg
**eps** - element-wise distance threshold to determine if elements are close enough to be in the same cluster. The higher the final features size (PCA components number) the higher eps value should be   
**min_samples** - min samples in the same "neighborhood" to be considered a cluster. The higher this value will be the more well represented layout types we will pick up.

In [72]:
from clustering import fit_dbscan

labels, dbscan = fit_dbscan(X_feat, eps=0.30, min_samples=35, plot=False, n_jobs=-1)

Estimated number of clusters: 39
Estimated number of noise points: 508


In [54]:
# this is how to sideload and reuse previously known layout types (if present)
put_mask = df_clean['LayoutType'].values != '-1'
labels[put_mask] = df_clean['LayoutType'].values[put_mask]

In [73]:
# lists detected clusters + value counts for each
for i in np.unique(labels):
    print('Cluster %s count: ' % i, sum(labels == i))

Cluster -1 count:  508
Cluster 00 count:  242
Cluster 01 count:  117
Cluster 02 count:  149
Cluster 03 count:  136
Cluster 04 count:  379
Cluster 05 count:  297
Cluster 06 count:  136
Cluster 07 count:  135
Cluster 08 count:  142
Cluster 09 count:  113
Cluster 10 count:  42
Cluster 11 count:  131
Cluster 12 count:  100
Cluster 13 count:  129
Cluster 14 count:  57
Cluster 15 count:  136
Cluster 16 count:  127
Cluster 17 count:  79
Cluster 18 count:  137
Cluster 19 count:  146
Cluster 20 count:  143
Cluster 21 count:  86
Cluster 22 count:  91
Cluster 23 count:  104
Cluster 24 count:  83
Cluster 25 count:  119
Cluster 26 count:  107
Cluster 27 count:  138
Cluster 28 count:  58
Cluster 29 count:  146
Cluster 30 count:  142
Cluster 31 count:  50
Cluster 32 count:  64
Cluster 33 count:  36
Cluster 34 count:  72
Cluster 35 count:  40
Cluster 36 count:  53
Cluster 37 count:  37
Cluster 38 count:  38


In [330]:
# use this to change the label for all elements from specific cluster to another label
labels[labels=='11'] = '-1'

In [168]:
# use this to change labels for specific elements from a specific cluster
from clustering import change_labels

labels = change_labels(
    labels.copy(), 
    cluster_name='16',
    idx_to_change=[1], 
    target_labels=['-1']
)

#### Plotting with IPyPlot package for (much) better performance

In [None]:
# Print top N samples for each cluster in a separate interactive tab
ipyplot.plot_class_tabs(IMG_PATHS, labels, max_imgs_per_tab=10, img_width=220)

In [None]:
# Print top N samples for specific cluster
ipyplot.plot_images(IMG_PATHS[labels=='10'], max_images=20, img_width=220)

In [None]:
# Print first element from each cluster
# takes in labels and coresponding IMG_PATHS
ipyplot.plot_class_representations(IMG_PATHS, labels, img_width=400)

In [337]:
df_clean['LayoutType'] = labels
df_clean['LayoutType'] = df_clean['LayoutType'].apply(lambda x: x if (x == '-1') else 'layout_type_' + x)