# OCR - Date Extraction - Classification

We want to find the following entities in the collection of PDFs:
1. Belegdatum = Rechnungsdatum 
2. Abrechnungsperiode = JJJJMM von TTMMJJJJ des Leistungsdatums (Lieferdatum)
Or
3. Referenznummer = Rechnungsnummer 

First, we only consider "Belegdatum"

## Steps
    * Load data into pandas
    * find dates using regex
    * Maybe first use only first page (using all just takes longer)
    * get span of text before (maybe after) found date
    * tokenize span
    * build vectorizer
    * learn model using token words vector
    * Predict
    * use other features: 
        * number of white space around found date
        * position of match wrt page (upper part, lower part, left, right)
        * (on which page found)
        * How close to other found dates and in which order (first found date)

    * First determine the language of the document, then use the date format accordingly
    
### Possible Tweaks
    * Not only use tokens left and right in same line but tokens nearby: "Lieferdatum" could be above in table
    * Modify image before ocr
    * Use different detection method for pdfs with text layer compared to scanned images
    * Use different tesseract options
    * Use table detection of documents
    * Use lexicon approach
    * Use HOCR output: hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')

## Load Data

In [None]:
import pandas as pd
import glob
import os

from PIL import Image
from pdf2image import convert_from_path
import pytesseract


# Adding tesseract custom options
custom_config = r'--oem 1 --psm 4'

In [None]:
# Load the evaluation sheet and add placeholder columns for the OCR results.
df_auswertung = pd.read_excel('../data/01_raw/Auswertung.XLSX')
df_auswertung['pdf_number_pages'] = 0
df_auswertung['pdf_text'] = ''

# List the invoice PDFs and derive a numeric document id from each file name.
files = glob.glob("../data/01_raw/KTA AI Rechnungen/*.pdf")
df_files = pd.DataFrame({'file_path': files})
df_files['file_name'] = df_files['file_path'].apply(lambda x: os.path.split(x)[1])
# int() raises ValueError for any PDF whose name (without extension) is not a number.
df_files['file_number'] = df_files['file_name'].apply(lambda x: int(os.path.splitext(x)[0]))

# Join sheet rows to files on the document number; the two sheet columns
# listed in drop() are removed afterwards.
df_full = (pd.merge(df_auswertung, df_files, left_on='Dokumentnummer', right_on='file_number')
          .drop(['Dokumentnummer', 'Dokumentenbezeichnung'], axis=1))

# Train Test Split
# 80% of the rows (fixed seed) go to df; the remaining rows form df_validate.
df = df_full.sample(frac=0.8, random_state=42)
df_validate = df_full.drop(df.index)

In [None]:
print(len(df))
df.head()

In [None]:
import re
def clean_text(s):
    """Replace each whitespace run that is followed by a newline with one space.

    The pattern needs at least one whitespace character *before* the
    newline, so a lone ``\\n`` directly after text is left untouched.
    """
    trailing_ws_and_newline = re.compile(r'\s+\n')
    return trailing_ws_and_newline.sub(' ', s)

def doOcr(file_path, config=None):
    """Run OCR on every page of a PDF and return the page count and text.

    The PDF is rendered with ``convert_from_path`` and each rendered page is
    passed to ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the PDF to process.
        config: Tesseract option string. When omitted, the module-level
            ``custom_config`` is read at call time, so reassigning that
            global in a later cell still takes effect.

    Returns:
        A ``(number_of_pages, text)`` tuple. ``text`` is the concatenation of
        all page texts, each prefixed with ``"\\n\\n\\n "``; it is ``''`` when
        the PDF yields no pages.
    """
    if config is None:
        config = custom_config

    # The file name is deliberately not parsed here: the former
    # int(file_base_name) result was unused and only made OCR fail with a
    # ValueError for PDFs whose name is not a number.
    page_texts = [
        pytesseract.image_to_string(page_image, config=config)
        for page_image in convert_from_path(file_path)
    ]

    # Join once instead of growing a string page by page.
    text = "".join(f"\n\n\n {page_text}" for page_text in page_texts)
    return len(page_texts), text

In [None]:
# Experiment with a different page segmentation mode on a single document.
# NOTE(review): this rebinds the module-level custom_config that doOcr reads
# on every call, so all later doOcr calls in this session use '--psm 13' too
# unless custom_config is reset -- confirm that this is intended.
custom_config = r'--oem 1 --psm 13'
temp = doOcr("../data/01_raw/KTA AI Rechnungen/5672950.pdf")
temp

In [None]:
df_test = df.head(2).copy()
df_test[['pdf_number_pages', 'pdf_text']] = df_test.apply(lambda r: doOcr(r['file_path']), axis=1, result_type='expand')
df_test['pdf_text'] = df_test['pdf_text'].apply(clean_text)

df_test

## Spacy

In [None]:
import spacy
nlp = spacy.load('de_core_news_md')

## Customize Spacy Tokenizer
Make sure that date formats including / - . infixes survive tokenization.

In [None]:
test_doc = '''we have an invoice no:123451\n,as well as 
a date 2020/11/20, another 11/20/2020 09-06-2020 09.06.2020.'''

nlp = spacy.load("de_core_news_md")
# Modify tokenizer
suffixes = list(nlp.Defaults.suffixes)
# Add a literal dot to the suffix patterns so that a trailing "." is matched
# by the tokenizer's suffix search (e.g. the final dot of "09.06.2020.").
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
suffixes.append(r'\.')
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
# modify tokenizer infix patterns
infixes = (LIST_ELLIPSES + LIST_ICONS + [
        # EDIT: Removed hypen \- : r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[0-9])[+\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)
infix_re = spacy.util.compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

doc = nlp.make_doc(test_doc)
print([t for t in doc])

## Find matching dates and get features
Parse dates from text and get all date matches with left and right span.

In [None]:
from spacy.matcher import Matcher
import dateparser

test_doc = '''date 2020-09-11 2012.01.09 2020/12/31 we have an invoice no:123451\n,as well as 
a , 2011-10-20  20-April-2020 1211.12.31 11.13.12 another 12/14/2020 13/12/2020 09-06-2020 09.06.2020.'''

matcher = Matcher(nlp.vocab)
# tested: https://regexr.com/32t3r
# mm/dd/yyyy m/d/yy
#pattern1 = [{"TEXT": {"REGEX": r"^(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$"}}]
# dd/mm/yyyy d/m/yy
#pattern2 = [{"TEXT": {"REGEX": r"^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{1}(1[0-2]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$"}}]
# yyyy/mm/dd
#pattern5 = [{"TEXT": {"REGEX": r"^(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$|^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{1}(1[0-2]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$"}}]
pattern5 = [{"TEXT": {"REGEX": r"^(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$|^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{1}(1[0-2]|0?[1-9]))[.\-\/]{1}(?:[0-9]{2})?[0-9]{2}$"}}]
pattern3 = [{"TEXT": {"REGEX": r"^(?:[1-9]{1}[0-9]{3})[.\-\/]{1}(?:(1[0-2]|0?[1-9])[.\-\/]{1}(3[01]|[12][0-9]|0?[1-9]))$"}}]
# dd-Mon-yyyy e.g. 20-Jun-2020
months = r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)"
pattern4 = [{"TEXT": {"REGEX": fr"^(?:(3[01]|[12][0-9]|0?[1-9])[.\-\/]{{1}}({months}))[.\-\/]{{1}}(?:[0-9]{{2}})?[0-9]{{2}}$"}}]

            
#matcher.add("Date: (mm/dd/yyyy m/d/yy)", None, pattern1)
#matcher.add("Date: (dd/mm/yyyy d/m/yy)", None, pattern2)
matcher.add("Date: (__/__/yyyy _/_/yy)", None, pattern5)
matcher.add("Date: (yyyy/mm/dd)", None, pattern3)
matcher.add("Date: (dd-Mon-yyyy)", None, pattern4)

from datetime import datetime

def parse_date(string: str) -> datetime:
    """Parse a date string, trying German first and then any language.

    ``dateparser.parse`` is called with ``languages=['de']``; only when that
    result is falsy is the string parsed again without a language
    restriction. The second result is returned as-is, so callers must be
    prepared for a falsy value when neither attempt succeeds.
    """
    # TODO: use match_id to better parse the date, also use language
    from dateparser import parse as parse_with_dateparser

    german_result = parse_with_dateparser(string, languages=['de'])
    return german_result or parse_with_dateparser(string)
    
def get_date_matches_from_text(text: str, n_lefts:int=2, n_rights:int=1) -> list:
    """Find date-like tokens in ``text`` and collect their surrounding tokens.

    The text is tokenized with the module-level ``nlp`` and scanned with the
    module-level ``matcher``. Every match is parsed with ``parse_date``.

    Args:
        text: Raw text to search.
        n_lefts: Number of tokens before the match to keep as left context.
        n_rights: Number of tokens after the match to keep as right context.

    Returns:
        A list with one dict per parseable match, with the keys
        ``match_id`` (name of the matcher rule), ``match_date`` (formatted
        as ``YYYY-MM-DD``), ``text_left`` and ``text_right``. Matches that
        ``parse_date`` cannot turn into a date are skipped.
    """
    doc = nlp.make_doc(text)
    all_matches = []

    for match_id, start, end in matcher(doc):
        match_date = parse_date(doc[start:end].text)
        if not match_date:
            # The regex accepted the token but it is not a parseable date;
            # skip it instead of failing on .strftime of a missing value.
            continue

        # Left context ends where the match starts (identical to the former
        # end-1 bound for single-token matches, and correct for longer ones).
        span_left = doc[max(0, start - n_lefts):start]
        span_right = doc[end:min(len(doc), end + n_rights)]
        all_matches.append({
            'match_id': nlp.vocab.strings[match_id],
            'match_date': match_date.strftime('%Y-%m-%d'),
            'text_left': span_left.text,
            'text_right': span_right.text,
        })

    return all_matches

doc = nlp.make_doc(test_doc)
print([t for t in doc])

df_matches = get_date_matches_from_text(test_doc, 4)
df_matches[:2]

## Detect Language

In [None]:
from spacy_langdetect import LanguageDetector
#
text = 'This is an english text.'
text1 = 'Das ist ein deutscher text.'
text2 = 'Esto es un texto espanol.'

import spacy
# Use a separate pipeline object for language detection. Rebinding the global
# `nlp` here would replace the pipeline whose tokenizer was customised for
# dates and which get_date_matches_from_text reads at call time.
nlp_lang = spacy.load('de_core_news_md')
nlp_lang.add_pipe(LanguageDetector(), name='language_detector', last=True)
doc = nlp_lang(text1)

doc._.language

## Use sample pdf

In [None]:
test_pdf = df.sample(2, random_state=0)
test_pdf[['pdf_number_pages', 'pdf_text']] = test_pdf.apply(lambda r: doOcr(r['file_path']), axis=1, result_type='expand')

In [None]:
#!open "../data/01_raw/KTA AI Rechnungen/5672190.pdf"

In [None]:
test_pdf.head()

# Model

In [None]:
df_train = pd.read_csv('../data/05_model_input/train_formatted.csv')
print(len(df_train), df_train.value_counts('label'))
df_train.head(2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn_pandas import DataFrameMapper

vectorizer_text = TfidfVectorizer(analyzer='char',
                                  lowercase=True,
                                  ngram_range=(1,2),
                                  max_df = 0.8,
                                  max_features=1000,
                                  min_df=3,
                                  strip_accents=None, #'unicode',
                                  norm='l2',
                                  sublinear_tf=True)

# Pipeline
# Model Selection
classifier = LogisticRegression(penalty='l2')

# Feature Selection
features = vectorizer_text

#feature_selection = SelectKBest(chi2, k=100),
#feature_selection = SelectFromModel(RandomForestClassifier(n_estimators=100)),
feature_selection = None

from sklearn.preprocessing import FunctionTransformer

pipe = Pipeline([
    ('features', DataFrameMapper([
        ('text', vectorizer_text),
#        (['ratioNumAlph'], sklearn.preprocessing.StandardScaler()),
    ])),
    ('feature_selection', feature_selection),
    ('clf', classifier),
])

In [None]:
# Fit on the training frame and score the same rows (in-sample evaluation).
pipe.fit(df_train, df_train['label'])

df_eval = df_train.copy()
df_eval['prediction'] = pipe.predict(df_train)

# Position of each predicted label within pipe.classes_.
classes = list(pipe.classes_)
df_eval['prediction_int'] = df_eval['prediction'].apply(lambda l: classes.index(l))

# Append the predict_proba output as extra columns named 0, 1, ...
# concat aligns on the index -- assumes df_train has a default integer index
# (TODO confirm).
df_eval = pd.concat([df_eval, pd.DataFrame(pipe.predict_proba(df_eval))],axis=1)
# Probability of the predicted class, looked up via the integer column name.
df_eval['predict_probab'] = df_eval.apply(lambda r: r[r['prediction_int']], axis=1)

# NOTE(review): this overwrites the column computed just above with the
# probability in column 0 of predict_proba for every row.
df_eval['predict_probab'] = pipe.predict_proba(df_eval)[:,0]

print("All dates:")
# Number of scored rows and number of distinct documents among them.
print(len(df_eval), len(pd.unique(df_eval['file_number'])))

df_eval.head()

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

def get_single_date_for_doc(g):
    """Pick the single most probable date row for one document group.

    Candidate rows are chosen in this order of preference:
    rows predicted as 'Belegdatum', else rows labelled 'Belegdatum', else
    all rows of the group. From the candidates, the row with the highest
    ``predict_probab`` is returned.
    """
    candidates = g
    # Try the filters in priority order; the first non-empty subset wins.
    for condition in ("prediction == 'Belegdatum'", "label == 'Belegdatum'"):
        subset = g.query(condition)
        if len(subset) != 0:
            candidates = subset
            break

    best_index = candidates['predict_probab'].idxmax()
    return candidates.loc[best_index]
        
df_docs = df_eval.groupby('file_number').apply(get_single_date_for_doc)

print("All documents:")
print(len(df_docs), len(pd.unique(df_docs['file_number'])))

    # TP: Belegdatum predicted correctly
    # FP: Belegdatum predicted incorrectly
    # TN: Correct absence of Belegdatum (maybe ocr problem)
    # FN: No Belegdatum predicted (missing result)
    
print('tn:',len(df_docs.query("label == 'other_date' & prediction == 'other_date'")),
      'fp:',len(df_docs.query("label == 'other_date' & prediction == 'Belegdatum'")),
      'fn:',len(df_docs.query("label == 'Belegdatum' & prediction == 'other_date'")),
      'tp:',len(df_docs.query("label == 'Belegdatum' & prediction == 'Belegdatum'")))

tn, fp, fn, tp = confusion_matrix(df_docs['label'], df_docs['prediction'],
                 labels=['other_date','Belegdatum']).ravel()

## Determine threshold

In [None]:
import numpy as np
def find_threshold(df_, p=0.95):
    """Find the lowest probability cut-off whose trusted rows reach accuracy ``p``.

    Thresholds from 0 to 1 are tried in steps of 0.01. For each threshold the
    rows whose column-0 probability from the module-level ``pipe`` is at
    least the threshold are treated as predicted 'Belegdatum', and their
    accuracy against ``df_['label']`` is computed (rounded to 2 decimals).

    Args:
        df_: Frame accepted by ``pipe.predict_proba`` that has a ``label``
            column. It is not modified.
        p: Required accuracy among the trusted rows.

    Returns:
        ``(n_correct, n_trusted, accuracy)`` for the first threshold that
        reaches ``p``, or ``None`` when no threshold does.
    """
    # NOTE(review): column 0 is taken as the 'Belegdatum' probability, as in
    # the rest of this notebook -- confirm against pipe.classes_.
    predict_proba = pipe.predict_proba(df_)[:, 0]
    labels = df_['label']

    for t in np.arange(0, 1.01, 0.01):
        trusted = predict_proba >= t
        n_docs_we_trust_tuned = int(trusted.sum())
        if n_docs_we_trust_tuned == 0:
            # Thresholds only increase, so no later one can select rows;
            # stop rather than score an empty selection.
            break

        trusted_labels = labels[trusted]
        # Every trusted row is, by construction, predicted as 'Belegdatum'.
        trusted_predictions = ['Belegdatum'] * n_docs_we_trust_tuned
        acc = round(accuracy_score(trusted_labels, trusted_predictions), 2)
        if acc >= p:
            n_correct_docs_tuned = accuracy_score(
                trusted_labels, trusted_predictions, normalize=False)
            return n_correct_docs_tuned, n_docs_we_trust_tuned, acc

    return None

find_threshold(df_eval)

## Evaluation

    # TP: Belegdatum predicted correctly
    # FP: Belegdatum predicted incorrectly
    # TN: Correct absence of Belegdatum (maybe ocr problem)
    # FN: No Belegdatum predicted (missing result)

In [None]:
import sys
sys.path.append("../src/cc_ocr_date_extract/pipelines/ml/")
import nodes
from nodes import evaluate_model

import importlib
importlib.reload(nodes);

In [None]:
from pprint import pprint
pprint(evaluate_model(pipe, df_train))

#tn: 38 fp: 11 fn: 60 tp: 199

## Cross validate

In [None]:
def train_cross_validate(pipe, X, y):
    """Cross-validate ``pipe`` with 5 folds and summarise every score array.

    Macro precision and macro recall are evaluated (train scores included).
    Each entry returned by ``cross_validate`` is reduced to a string of the
    form ``"<mean> +- <std>"`` with both values rounded to 2 decimals.
    """
    from sklearn.metrics import make_scorer, recall_score
    from sklearn.model_selection import cross_validate

    macro_recall = make_scorer(recall_score, average='macro')
    raw_scores = cross_validate(
        pipe,
        X,
        y,
        scoring={'prec_macro': 'precision_macro', 'rec_macro': macro_recall},
        cv=5,
        return_train_score=True,
    )

    summary = {}
    for score_name, fold_values in raw_scores.items():
        mean = fold_values.mean().round(2)
        spread = fold_values.std().round(2)
        summary[score_name] = f"{mean} +- {spread}"
    return summary
    
pprint(train_cross_validate(pipe, df_train.dropna(), df_train.dropna()['label']))

## Plot

In [None]:
df_test = pd.read_csv('../data/05_model_input/test_formatted.csv')
from sklearn.metrics import plot_roc_curve
plot_roc_curve(pipe, df_test, df_test['label']);

In [None]:
# Work on a copy with a fresh 0..n-1 index so the probability columns built
# from the predict_proba output line up row by row.
df_ = df_test.reset_index(drop=True).copy()
df_['prediction'] = pipe.predict(df_test)
# Score the frame once and reuse the result for both probability columns.
proba_by_class = pd.DataFrame(pipe.predict_proba(df_))
df_['prediction_probab_belegdatum'] = proba_by_class[0]
df_['prediction_probab_other_date'] = proba_by_class[1]
df_.hist(['prediction_probab_belegdatum','prediction_probab_other_date'], bins=50);

from sklearn.metrics import plot_confusion_matrix
disp = plot_confusion_matrix(pipe, df_test, df_test['label'])

In [None]:
# https://scikit-learn.org/stable/modules/calibration.html#calibration

In [None]:
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve,  CalibratedClassifierCV

plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

prob_pos = pipe.predict_proba(df_)[:,1]
fraction_of_positives, mean_predicted_value = calibration_curve(df_test['label'], prob_pos, n_bins=20)

ax1.plot(mean_predicted_value, fraction_of_positives, "s-")

ax2.hist(prob_pos, range=(0, 1), bins=20, histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')

ax2.set_xlabel("Vorhersage")
ax2.set_ylabel("Anzahl")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()
plt.show()

## Inspect results

In [None]:
prediction = pipe.predict(df_train.dropna())

df_eval = df_train.dropna().reset_index(drop=True).copy()
df_eval['prediction'] = prediction
df_eval['prediction_probab'] = pd.DataFrame(pipe.predict_proba(df_eval))[0]

In [None]:
# number duplicate training examples
print("training examples:", len(df_train))
print("number duplicates:", len(df_train)-len(df_train.drop_duplicates('text')))

In [None]:
df_max_pred_probab = (df_eval.sort_values('prediction_probab')
                      .groupby(['file_number','prediction'])
                      .last()
                      .reset_index())


df_tt = (df_max_pred_probab[df_max_pred_probab
                       .groupby('file_number')
                       .apply(lambda x: ~x['label'].isin(['Belegdatum']))
                       .reset_index()['label']])
df_tt[df_tt['file_number'].duplicated()]

## One Example

In [None]:
!open "../data/01_raw/KTA AI Rechnungen/5672563.pdf" ## very many pages!

In [None]:
!open "../data/01_raw/KTA AI Rechnungen/5671953.pdf"

In [None]:
df_train.query('file_number == 5671953')

In [None]:
df_eval.query('file_number == 5671953')

In [None]:
df.query('file_number == 5671953')