In [None]:
## This takes the token file and does a number of things:
# - rejects documents with too few tokens (need OCR) or no ground truth
# - normalizes page numbers to 0..1 (currently disabled: that code is commented out in process_doc)
# - provides fuzzy matching scores for each token vs ground truth tokens

import csv
import decimal
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
#from util import is_dollar_amount, normalize_dollars

# Number of documents successfully written to the training CSV; incremented by
# process_doc.
output_docs = 0

# Data in filings that we want to find.
# We output a column for each one of these, indicating how close the token is to
# the correct answer.
# Earlier target sets, kept for reference (other possible targets included
# 'committee', 'agency', 'callsign'):
#targets = ["gross_amount", "contract_number", "committee"]
targets = ["Wages, tips and other compensation", "Federal Income tax withheld", "Social Security wages", "Social security tax withheld"]
# Target names are used as column names, so spaces become underscores.
targets = [t.replace(' ', '_') for t in targets]

# Directory holding the input CSVs; edit this single constant to relocate the data.
DATA_DIR = "C:/Users/HP/Documents/work/DeepForm/deepform-master"

# Ground-truth answers, one row per filing.
filings = pd.read_csv(f"{DATA_DIR}/myfilings.csv")

# Extracted tokens, one row per token.
incsv = pd.read_csv(f"{DATA_DIR}/my-tokens.csv")

outcols = ["slug", "x0", "y0", "x1", "y1", "token"] + targets
# newline="" is required by the csv module; without it every row is followed by
# a blank line on Windows. The handle is kept in `outfile` so it can be closed
# (outfile.close()) once all documents are processed, which flushes the data.
outfile = open("mytrain.csv", mode="w", encoding="utf-8", newline="")
outcsv = csv.DictWriter(outfile, fieldnames=outcols)
outcsv.writeheader()


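# Computes the fuzzy match score between the target answer and every n-gram of
# up to max_n consecutive tokens in the document. The answer may span multiple
# tokens, so we pick the n-gram length with the best match and assign that score
# to every token of each best-matching n-gram.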
def multi_token_target_match(answer, tokens, target, max_n, anstok):
    """Score each token by the best fuzzy match of any n-gram against ``anstok``.

    Every run of 1..max_n consecutive tokens is concatenated without a
    separator and compared to ``anstok`` with ``fuzz.ratio`` (divided by 100).
    The n-gram length whose best ratio is highest wins; ``np.argmax`` picks the
    shortest length on ties. Each token covered by an n-gram of the winning
    length that reaches the top ratio receives that ratio; all other tokens
    keep a score of 0.

    ``answer`` and ``target`` are part of the signature but are not read here.
    Diagnostic details are printed to stdout.

    Returns a 1-D numpy array holding one score per token.
    """
    token_count = len(tokens)
    # One row per n-gram length, one column per starting index.
    ratioslist = np.zeros((max_n, token_count))
    best_match = [0] * max_n
    best_idx = [0] * max_n

    for row in range(max_n):
        width = row + 1
        # Only starts that leave room for a full n-gram of this width.
        for start in range(token_count - row):
            candidate = "".join(map(str, tokens[start : start + width]))
            ratio = fuzz.ratio(anstok, candidate) / 100.0
            ratioslist[row, start] = ratio
            # Strict comparison keeps the earliest start when ratios tie.
            if ratio > best_match[row]:
                best_match[row] = ratio
                best_idx[row] = start

    print("best_match array: " + str(best_match))
    winning_row = np.argmax(best_match)
    best_len = winning_row + 1
    top_ratio = best_match[winning_row]
    first_hit = best_idx[winning_row]
    print("Best choice for number of tokens: " + str(best_len))
    print(
        "Best Match Token Sequence: "
        + str(tokens[first_hit : first_hit + best_len])
    )

    # Collect every start index of the winning length that reaches the top ratio.
    hit_starts = []
    for start, ratio in enumerate(ratioslist[winning_row, :]):
        if ratio == top_ratio:
            hit_starts.append(start)
    print("Target Occurs at Indices: " + str(hit_starts))

    # Spread the top ratio over all tokens of each matching n-gram.
    scores = np.zeros(token_count)
    for start in hit_starts:
        scores[start : start + best_len] = top_ratio

    return scores


def target_match(answer, tokens, target, max_n):
    """Score every token of one document against the ground-truth answer.

    Parameters
    ----------
    answer : the ground-truth value for ``target``; converted with ``str``.
    tokens : iterable of the document's tokens; each one is converted with
        ``str`` and lowercased before matching.
    target : name of the field being matched; printed, and used to choose the
        matching strategy.
    max_n : longest n-gram length handed to ``multi_token_target_match``.

    Returns one score per token (a list from the single-token branch, a numpy
    array from ``multi_token_target_match``).
    """
    print()
    print("target: " + target)
    print("answer: " + str(answer))
    # Remove spaces and make the answer lower case so it can be compared with
    # concatenated, lowercased tokens.
    anstok = str(answer).lower().replace(" ", "")
    # Coerce with str() first: tokens loaded from CSV may be numeric, and
    # numbers have no .lower(). multi_token_target_match coerces the same way.
    tokens = [str(token).lower() for token in tokens]

    # NOTE(review): the target names built at the top of the notebook have
    # spaces replaced by underscores, so this comparison looks like it can never
    # be true and every target takes the multi-token path below. The branch also
    # needs is_dollar_amount / normalize_dollars, whose import is commented out;
    # restore that import before making this branch reachable.
    if target == "Wages, tips and other compensation":

        scores = []
        max_n = 1
        for token in tokens:
            if is_dollar_amount(anstok) and is_dollar_amount(token):
                try:
                    scores.append(
                        fuzz.ratio(normalize_dollars(anstok), normalize_dollars(token))
                        / 100.0
                    )
                except decimal.InvalidOperation:
                    # not a number, maybe a date?
                    scores.append(fuzz.ratio(anstok, token) / 100.0)
            else:
                scores.append(fuzz.ratio(anstok, token) / 100.0)

    else:
        scores = multi_token_target_match(answer, tokens, target, max_n, anstok)

    return scores

In [15]:
# Display the target names after spaces were replaced with underscores.
targets

['Wages,_tips_and_other_compensation',
 'Federal_Income_tax_withheld',
 'Social_Security_wages',
 'Social_security_tax_withheld']

In [16]:
def process_doc(slug, rows, max_n):
    """Score one document's tokens against its ground truth and write them out.

    Parameters
    ----------
    slug : str
        Document identifier; matched as a literal substring of the
        ``pdf_file`` column of the global ``filings`` frame.
    rows : DataFrame, or anything ``pd.DataFrame`` accepts
        The token rows belonging to this document; must contain a ``token``
        column.
    max_n : int
        Longest n-gram length forwarded to ``target_match``.

    The document is skipped (with a printed reason) when it has fewer than 10
    tokens, when ``slug`` does not match exactly one row of ``filings``, or when
    any target answer is missing. Otherwise one score column per entry of the
    global ``targets`` is added and every row is written through the global
    ``outcsv`` writer, and the global ``output_docs`` counter is incremented.
    """
    global output_docs
    print()
    print()
    print("--------------------------------")
    print(f"Processing {slug} with {len(rows)} tokens")
    if len(rows) < 10:
        # probably needs OCR
        print(f"Skipping {slug} because it has only {len(rows)} tokens")
        return

    # Make the filings column names agree with the underscored target names.
    filings.columns = [c.replace(' ', '_') for c in filings.columns]
    # regex=False: slugs are file names such as "DAT W2 (1).pdf"; read as a
    # regular expression, "(1)" is a capture group and "." a wildcard, so the
    # literal name would never match. na=False keeps rows with a missing
    # pdf_file from putting NaN into the boolean mask.
    answers = filings[filings.pdf_file.str.contains(slug, regex=False, na=False)]

    print("printing answers here 1", answers)

    if len(answers) != 1:
        print(f"Skipping {slug} because it matches {len(answers)} rows")
        return
    answers = answers.iloc[0]

    print("\n\n printing answers here 2\n\n", answers)
    print("type of answers", type(answers))

    if answers[targets].isnull().any():
        print(
            f"Skipping {slug} because it is missing answers for "
            f"{[t for t in targets if pd.isnull(answers[t])]}"
        )
        return

    # Work on a private copy so the score columns added below never alias the
    # caller's frame (rows may be a groupby slice).
    df = pd.DataFrame(rows).copy()

    # Page normalization is currently disabled:
    # page = pd.to_numeric(df["page"])
    # maxpage = page.max()
    # if maxpage:  # avoid div/0 for one page docs
    #     df["page"] = page / maxpage  # last page = 1.0

    for t in targets:
        # The value of the answer and an array of the tokens for that slug.
        df[t] = target_match(
            answers[t], df["token"].fillna(""), t, max_n
        )

    for _, row in df.iterrows():
        outcsv.writerow(row.to_dict())

    output_docs += 1

In [17]:
# --- Main ---
# Accumulate all rows with the same slug
# active_rows = []
# active_slug = None
# input_docs = 0
# max_n = 5
#    for row in incsv:
#     if row["slug"] != active_slug:
#         if active_slug:
#             process_doc(active_slug, active_rows, max_n)
#             input_docs += 1
#         active_slug = row["slug"]
#         active_rows = [row]
#     else:
#         active_rows.append(row)

# print(f"Input documents {input_docs}")
# print(f"Output documents {output_docs}")


# --- Main ---
# Accumulate all rows with the same slug
# Names left over from the earlier row-by-row accumulator; the groupby loop
# below does not update them.
active_rows = []
input_docs = 0
# Longest n-gram length (in tokens) tried when matching an answer.
max_n = 3

# groupby("slug") yields each document's rows together, replacing the manual
# accumulation of rows per slug. Processing stops after 201 documents.
n = 0
for slug, group in incsv.groupby("slug"):
    process_doc(slug, group, max_n)
    n += 1
    if n >= 201:
        break
# print(f"Input documents {input_docs}")
# print(f"Output documents {output_docs}")



--------------------------------
Processing CAPRUS W2.pdf with 2430 tokens
printing answers here 1 Empty DataFrame
Columns: [pdf_file, Wages,_tips_and_other_compensation, Federal_Income_tax_withheld, Social_Security_wages, Social_security_tax_withheld]
Index: []
Skipping CAPRUS W2.pdf because it matches 0 rows


--------------------------------
Processing DAT W2 (1).pdf with 2575 tokens
printing answers here 1 Empty DataFrame
Columns: [pdf_file, Wages,_tips_and_other_compensation, Federal_Income_tax_withheld, Social_Security_wages, Social_security_tax_withheld]
Index: []
Skipping DAT W2 (1).pdf because it matches 0 rows


--------------------------------
Processing DAT W2-2.pdf with 128 tokens
printing answers here 1 Empty DataFrame
Columns: [pdf_file, Wages,_tips_and_other_compensation, Federal_Income_tax_withheld, Social_Security_wages, Social_security_tax_withheld]
Index: []
Skipping DAT W2-2.pdf because it matches 0 rows


--------------------------------
Processing F5 W2.pdf wit

  return func(self, *args, **kwargs)


best_match array: [1.0, 0.72, 0.67]
Best choice for number of tokens: 1
Best Match Token Sequence: ['135405.59']
Target Occurs at Indices: [91, 168, 170, 172]

target: Federal_Income_tax_withheld
answer: 14431.03
best_match array: [1.0, 0.94, 0.62]
Best choice for number of tokens: 1
Best Match Token Sequence: ['14431.03']
Target Occurs at Indices: [92, 169, 171, 173]

target: Social_Security_wages
answer: 132900.0
best_match array: [0.86, 0.7, 0.67]
Best choice for number of tokens: 1
Best Match Token Sequence: ['132900']
Target Occurs at Indices: [97, 186, 188, 190]

target: Social_security_tax_withheld
answer: 8239.8
best_match array: [1.0, 0.92, 0.63]
Best choice for number of tokens: 1
Best Match Token Sequence: ['8239.8']
Target Occurs at Indices: [98, 187, 189, 191]


--------------------------------
Processing Intel W2.pdf with 192 tokens
printing answers here 1        pdf_file  Wages,_tips_and_other_compensation  \
3  Intel W2.pdf                           106468.39   

   Fed