## Invoice Extraction

In [1]:
# importing libraries
import pandas as pd
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import re
import io
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [2]:
# Set the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
# Load a new spacy model
nlp = spacy.blank("en")
db = DocBin()  # Create a DocBin object

In [4]:
def pdf_to_images(pdf_path):
    images = convert_from_path(pdf_path)
    return images

def extract_text_and_confidence(image): 
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    text_confidence = {}
    for i in range(len(data['text'])):
        word = data['text'][i].strip()
        if word and int(data['conf'][i]) > 0:
            text_confidence[word] = int(data['conf'][i])
    return text_confidence

def get_text_confidence_from_pdf(pdf_path):
    images = pdf_to_images(pdf_path)
    all_text_confidence = []
    for page_number, image in enumerate(images, start=1):
        text_confidence = extract_text_and_confidence(image)
        page_data = {'Page': page_number, 'Text_Confidence': text_confidence}
        all_text_confidence.append(page_data)
    return all_text_confidence

def calculate_field_confidence(df, text_confidence_data):
    field_confidence = []

    for _, row in df.iterrows():
        field = row['Field']
        data = row['Data']
        
        field_conf = 0
        total_words = len(data.split())
        
        for page_data in text_confidence_data:
            text_confidence = page_data['Text_Confidence']
            
            for word in data.split():
                if word in text_confidence:
                    field_conf += text_confidence[word]
        
        avg_confidence = field_conf / total_words if total_words > 0 else 0
        field_confidence.append(avg_confidence)
    
    return field_confidence

def extract_text_from_images(images):
    texts = []
    for image in images:
        # Use BytesIO to handle image in memory
        with io.BytesIO() as img_buffer:
            image.save(img_buffer, format='JPEG')
            img_buffer.seek(0)
            img = Image.open(img_buffer)
            img_text = pytesseract.image_to_string(img)
            texts.append(img_text)
    
    return texts

In [5]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\1.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'INVOICE\n\nInvoice Number INV-0012\n\nInvoice Date 13/7/2020\n\nBilling Information\nCompany\n‘ABC Company\n\nName\nRichard Glenn\n\nAddress\n\n4275 Crummit Lane, Happy Village\nNew York, NY, 281502\n\nEmail\njohn@example.com\n\nProducts\n\nDescription\n\nQuantity\n\nShipping Information\n\nName\nRichard Glenn\n\nAddress\n\n4275 Crummit Lane, Happy Village\nNew York, NY, 281502\n\nCotton Male T-shirt\n\nTShirt: M 2\n$84.00\nSubtotal\nShipping\nTax\nTotal\n\nAdditional Notes\n\n‘TERMS AND CONDITIONS.\n\n$42.00\n\n$84.00\n$52.00\n$0.00\n\n$136.00\n\n1. The Seler shall not be liable to the Buyer directly or indirectly for any loss or damage suffered by the Buyer\n\n2. The Seller warrants the product for one (1) year from the date of shipment.\n\n3. Any purchase order received by the seller will be interpreted as accepting this offer and the sale offer in writing. The buyer may purchase\nthe product in this offer only under the Terms and Conditions ofthe Seller included in this offer.\n\n

In [6]:
# Load the training data
with open('train_data1.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [7]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data1.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 3249.76it/s]


In [8]:
# Initialize config file for training
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;4m[i] Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data1.spacy --paths.dev ./train_data1.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     70.19    0.00    0.00    0.00    0.00
100     200         29.94   1389.65  100.00  100.00  100.00    1.00
200     400          0.00      0.00  100.00  100.00  100.00    1.00
367     600          0.00      0.00  100.00  100.00  100.00    1.00
567     800          0.00      0.00  100.00  100.00  100.00    1.00
767    1000          0.00      0.00  100.00  100.00  100.00    1.00
967    1200          0.00      0.00  100.00  100.00  100.00    1.00
1167    1400          0.00      0.00  100.00  100.00  100.00    1.00
1367    1600          0.00      0.00  100.00  100.00  100.00    1.00
1567    1800          0.00      0.00  1

In [10]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [11]:
doc = nlp_ner(extracted_text)

In [12]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,INV-0012
1,INVOICE DATE,13/7/2020
2,NAME,Richard Glenn
3,ADDRESS,"4275 Crummit Lane, Happy Village"
4,ADDRESS,"New York, NY, 281502"
5,EMAIL,john@example.com
6,NAME,Richard Glenn
7,ADDRESS,"4275 Crummit Lane, Happy Village"
8,ADDRESS,"New York, NY, 281502"
9,SUBTOTAL,$84.00


In [13]:
# Specify the indices of the rows to be removed
indices_to_remove = [7,9]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,INV-0012
1,INVOICE DATE,13/7/2020
2,NAME,Richard Glenn
3,ADDRESS,"4275 Crummit Lane, Happy Village"
4,ADDRESS,"New York, NY, 281502"
5,EMAIL,john@example.com
6,NAME,Richard Glenn
7,ADDRESS,"New York, NY, 281502"
8,SUBTOTAL,$84.00
9,SHIPPING,$52.00


In [14]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,INVOICE NUMBER,INV-0012,90.0
1,INVOICE DATE,13/7/2020,90.0
2,NAME,Richard Glenn,95.5
3,ADDRESS,"4275 Crummit Lane, Happy Village",94.2
4,ADDRESS,"New York, NY, 281502",87.75
5,EMAIL,john@example.com,88.0
6,NAME,Richard Glenn,95.5
7,ADDRESS,"New York, NY, 281502",87.75
8,SUBTOTAL,$84.00,95.0
9,SHIPPING,$52.00,89.0


In [15]:
df.to_csv('1.csv',index=False)

In [None]:
######################################################################################

In [16]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\2.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'COMPANY Simple Invoice\n\nNant 19/6/2020\n\nInvoice #: INV-00004\n\nBill to:\n\nCompany Name\n‘ABC Company\nAddress\n\n4121 Barfield Lane\nIndianapolis, IN, 46225\nPhone Number\n\n(855) 555-5555\n\nRnd\nrich.glen@noemail.com\n\nProduct 1 1 $26.69\nProduct 2 1 $53.28\nProduct 3 1 9133.44\n\nTotal $213.51\n\nIyouhave any questions about this invoice, please contact,\n5555555555,\n‘example@example.com\n\n'

In [17]:
# Load the training data
with open('train_data2.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [18]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data2.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 3038.91it/s]

Skipping entity





In [19]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data2.spacy --paths.dev ./train_data2.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     81.33    0.00    0.00    0.00    0.00
 87     200         48.68   1471.51  100.00  100.00  100.00    1.00
187     400          0.00      0.00  100.00  100.00  100.00    1.00
287     600          0.00      0.00  100.00  100.00  100.00    1.00
464     800          0.00      0.00  100.00  100.00  100.00    1.00
664    1000          0.00      0.00  100.00  100.00  100.00    1.00
864    1200          0.00      0.00  100.00  100.00  100.00    1.00
1064    1400          0.00      0.00  100.00  100.00  100.00    1.00
1264    1600          0.00      0.00  100.00  100.00  100.00    1.00
1464    1800          0.00      0.00  1

In [20]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [21]:
doc = nlp_ner(extracted_text)

In [22]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,INVOICE DATE,19/6/2020
1,INVOICE NUMBER,INV-00004
2,ADDRESS,4121 Barfield Lane
3,ADDRESS,"Indianapolis, IN, 46225"
4,PHONE NUMBER,(855) 555-5555
5,EMAIL,rich.glen@noemail.com
6,TOTAL,$213.51
7,PHONE NUMBER,5555555555
8,EMAIL,example@example.com


In [23]:
# Specify the indices of the rows to be removed
indices_to_remove = [4]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,INVOICE DATE,19/6/2020
1,INVOICE NUMBER,INV-00004
2,ADDRESS,4121 Barfield Lane
3,ADDRESS,"Indianapolis, IN, 46225"
4,EMAIL,rich.glen@noemail.com
5,TOTAL,$213.51
6,PHONE NUMBER,5555555555
7,EMAIL,example@example.com


In [24]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,INVOICE DATE,19/6/2020,82.0
1,INVOICE NUMBER,INV-00004,88.0
2,ADDRESS,4121 Barfield Lane,90.0
3,ADDRESS,"Indianapolis, IN, 46225",93.666667
4,EMAIL,rich.glen@noemail.com,89.0
5,TOTAL,$213.51,96.0
6,PHONE NUMBER,5555555555,0.0
7,EMAIL,example@example.com,0.0


In [25]:
df.to_csv('2.csv',index=False)

In [None]:
#####################################################################################################

In [26]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\3.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'East Repair Inc. INVOICE\n\n1912 Harvest Lane\nNew York, NY 12210\n\nBill To Ship To Invoice # us-001\nsoba smith John Son Invoice Date 1102/2019\n2 Court Square 3787 Pineview Drive\nNew York, NY 12210 ‘Cambridge, MA 12210 Pos 2312/2019\nDue Date 26/02/2019\nary DESCRIPTION UNIT PRICE ‘AMOUNT\n\n1 Front and rear brake cables 100.00 100.00\n\n2 | New set of pedal arms. 15.00 30.00\n\n3 | Labor Shs 5.00 15.00\n\nSubtotal 145.00\n\nSales Tax 6.25% 9.06\n\nTOTAL $154.06\n\nSmith\n\nTerms & Conditions\nPayment is due within 15 days\n\nPlease make checks payable to: East Repair Inc.\n\n'

In [27]:
# Load the training data
with open('train_data3.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [28]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data3.spacy")

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 111.67it/s]

Skipping entity





In [29]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data3.spacy --paths.dev ./train_data3.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     65.22    0.00    0.00    0.00    0.00
 51     200         60.45   1932.92  100.00  100.00  100.00    1.00
111     400          0.26      0.31  100.00  100.00  100.00    1.00
189     600          0.00      0.00  100.00  100.00  100.00    1.00
284     800          0.00      0.00  100.00  100.00  100.00    1.00
384    1000          0.00      0.00  100.00  100.00  100.00    1.00
556    1200          0.00      0.00  100.00  100.00  100.00    1.00
756    1400          0.00      0.00  100.00  100.00  100.00    1.00
956    1600          0.00      0.00  100.00  100.00  100.00    1.00
1156    1800          0.00      0.00  100

In [30]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [31]:
doc = nlp_ner(extracted_text)

In [34]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
data = pd.DataFrame(entities, columns=['Field', 'Data'])
data

Unnamed: 0,Field,Data
0,COMPANY,East Repair Inc.
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210"
2,NAME,smith
3,NAME,John
4,INVOICE DATE,1102/2019
5,ADDRESS,"2 Court Square 3787 Pineview Drive\nNew York, ..."
6,ADDRESS,"‘Cambridge, MA"
7,PO NUMBER,Pos
8,SALES TAX,%
9,TOTAL,$


In [35]:
# Specify the indices of the rows to be removed
indices_to_remove = [2,6,7,8,9,10]

# Remove the rows
df1 = data.drop(indices_to_remove)

# Reset the index if needed
df1 = df1.reset_index(drop=True)
df1

Unnamed: 0,Field,Data
0,COMPANY,East Repair Inc.
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210"
2,NAME,John
3,INVOICE DATE,1102/2019
4,ADDRESS,"2 Court Square 3787 Pineview Drive\nNew York, ..."


In [36]:
text = ' '.join(texts)

patterns = {
    'P.O.#': r'Pos\s*(\S+)',
    'Due Date': r'Due Date\s*(\d{2}/\d{2}/\d{4})',
    'Subtotal': r'Subtotal\s*([\d.]+)',
    'Invoice #': r'Invoice #\s*(\S+)',
    'Sales Tax 6.25%': r'Sales Tax 6.25%\s*([\d.]+)',
}

data = {}
for field, pattern in patterns.items():
    match = re.search(pattern, texts[0], re.DOTALL)
    if match:
        data[field] = match.group(1).strip().replace('\n',' ')

df2 = pd.DataFrame(list(data.items()), columns=['Field', 'Data'])
df2

Unnamed: 0,Field,Data
0,P.O.#,2312/2019
1,Due Date,26/02/2019
2,Subtotal,145.00
3,Invoice #,us-001
4,Sales Tax 6.25%,9.06


In [37]:
df=pd.concat([df1,df2],axis=0)
df

Unnamed: 0,Field,Data
0,COMPANY,East Repair Inc.
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210"
2,NAME,John
3,INVOICE DATE,1102/2019
4,ADDRESS,"2 Court Square 3787 Pineview Drive\nNew York, ..."
0,P.O.#,2312/2019
1,Due Date,26/02/2019
2,Subtotal,145.00
3,Invoice #,us-001
4,Sales Tax 6.25%,9.06


In [38]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)

df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,COMPANY,East Repair Inc.,91.333333
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210",89.714286
2,NAME,John,87.0
3,INVOICE DATE,1102/2019,79.0
4,ADDRESS,"2 Court Square 3787 Pineview Drive\nNew York, ...",85.1
0,P.O.#,2312/2019,89.0
1,Due Date,26/02/2019,41.0
2,Subtotal,145.00,86.0
3,Invoice #,us-001,0.0
4,Sales Tax 6.25%,9.06,94.0


In [39]:
df.to_csv('3.csv',index=False)

In [None]:
###############################################################################

In [48]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\4.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'COMPANY NAME\n\nSLOGAN GOES HERE\n\nFORWARDING AGENT\nDHL\n\nTRACKING NUMBER\n55888800000998700\n\nSOLD BY\n\nLOCAL STORE\n\n255 Commercial Street\n25880 New York, US\n(555) 1000 255 6678\ninfo@localstore.com\n\nEORI: PT100003456566\n\nConveyor Belt 25"\n\nCountry of origin: US 88565.2252\n\nPole with bracket\n\nCountry of origin: US 88565.2545\n\nPole with bracket\n\nCountry of origin: US. 88565.2545\n\nPole with bracket\n\nCountry of origin: US 88565.2545\n\nPole with bracket\n\nCountry of origin: US 88565.2545\n\nInsurance: NOT INCLUDED\nReason for export: SALE\n\nIncoterms: DAP\n\nDescription of the goods\n(number of packages, units, weight, etc.):\n\n© Pallet 1200x800x1500mm (15.2kg) 1pc\n© Carton Box150x200x100mm (15.2kg) 3pcs\n\n‘TémplateLAB\nCOMMERCIAL\n\nINVOICE\n\nInternational Sales Operations\n\nDATE INVOICE NUMBER\n14/08/2023 F1000876/23\nPAID BY ORDER ID\n\nCREDIT CARD X001525\n\nBILLTO\n\nIMPORTING COMPANY\n\n100 Mighty Bay\n\n125863 Rome, IT\n00 39 5658 444 52474\ninfo

In [41]:
# Load the training data
with open('train_data4.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [42]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data4.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 2312.10it/s]


In [43]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data4.spacy --paths.dev ./train_data4.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     65.08    0.00    0.00    0.00    0.00
 38     200        670.63   3069.38  100.00  100.00  100.00    1.00
 84     400          0.70      1.76  100.00  100.00  100.00    1.00
144     600          0.00      0.00  100.00  100.00  100.00    1.00
211     800          0.01      0.01  100.00  100.00  100.00    1.00
299    1000          0.00      0.00  100.00  100.00  100.00    1.00
399    1200          0.00      0.00  100.00  100.00  100.00    1.00
506    1400          0.00      0.00  100.00  100.00  100.00    1.00
706    1600          0.00      0.00  100.00  100.00  100.00    1.00
906    1800          0.00      0.00  100.

In [44]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [49]:
doc = nlp_ner(extracted_text)

In [50]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,PO NUMBER,DHL
1,TRACKING NUMBER,55888800000998700
2,ADDRESS,255 Commercial Street
3,ADDRESS,"25880 New York, US"
4,PHONE NUMBER,(555) 1000 255 6678
5,WEBSITE,info@localstore.com
6,EORI,PT100003456566
7,PRODUCT,Conveyor Belt 25
8,PRODUCT,Country of origin: US
9,HS CODE,88565.2252


In [51]:
# Specify the indices of the rows to be removed
indices_to_remove = [22,29,30,31,32,33,34,35,36,37,38,39]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,PO NUMBER,DHL
1,TRACKING NUMBER,55888800000998700
2,ADDRESS,255 Commercial Street
3,ADDRESS,"25880 New York, US"
4,PHONE NUMBER,(555) 1000 255 6678
5,WEBSITE,info@localstore.com
6,EORI,PT100003456566
7,PRODUCT,Conveyor Belt 25
8,PRODUCT,Country of origin: US
9,HS CODE,88565.2252


In [52]:
# Get text and confidence scores from the PDF
text_confidence_data = get_text_confidence_from_pdf(pdf_path)

# Calculate confidence scores for each field in the DataFrame
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)

df

Unnamed: 0,Field,Data,Confidence
0,PO NUMBER,DHL,94.0
1,TRACKING NUMBER,55888800000998700,91.0
2,ADDRESS,255 Commercial Street,94.666667
3,ADDRESS,"25880 New York, US",88.5
4,PHONE NUMBER,(555) 1000 255 6678,95.5
5,WEBSITE,info@localstore.com,92.0
6,EORI,PT100003456566,91.0
7,PRODUCT,Conveyor Belt 25,64.0
8,PRODUCT,Country of origin: US,89.0
9,HS CODE,88565.2252,92.0


In [53]:
df.to_csv('4.csv',index=False)

In [None]:
###########################################################################

In [64]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\5.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'©. INVOICE\n\nINVOICE # DATE OF ISSUE\n1000-15088 12/06/2023\nBILL TO\n\nClient Name\nStreet address\nCity, State, Country\n\nTemplateLAB\n\nCLEANING PERIOD INCLUDED\n01/05/2023-31/05/2023\n\nCLEANING SERVICES\n2001 Street Name\n\nCity, State, Country, ZiP code\n(000) 123 456 7890\ncleaningservices@email.com\n\nZIP Code cleaningservices123.com\nDESCRIPTION UNIT COST Qty AMOUNT\nCurtain Cleani\niii: $40.00 3 120.00\nSuperior dry cleaning on-site\nG Cleani\neSRitencanend $50.00 2 100.00\nEco-friendly cleaning by using products that are non-toxic, biodegradable, and safe\nP. Washi\nresets svasmng $110.00 4 110.00\nJet washer to deliver a powerful water stream to remove dirt and clean surfaces\nChi Ss i\nile da As $105.00 1 105.00\nChimney sweeping to prevent soot build-up, which is a fire hazard\nCeili id Wall Cleani\nening an an ieaning $35.00 8 280.00\nRemoving dirt, oil, and other grime on walls and ceilings\nSanitization Servi\nanitization Services $60.00 3 {80008\nUsing Hydrogen per

In [65]:
# Load the training data
with open('train_data5.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [66]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data5.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1968.65it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [67]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data5.spacy --paths.dev ./train_data5.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     76.52    0.00    0.00    0.00    0.00
 22     200       5250.56   4728.31   95.37   94.50   96.26    0.95
 46     400         16.49    127.55   99.06  100.00   98.13    0.99
 73     600          8.78     87.33   99.06  100.00   98.13    0.99
103     800         74.69    124.70   99.07   98.17  100.00    0.99
136    1000        158.10    128.17   99.06  100.00   98.13    0.99
173    1200        486.26     56.09   99.53   99.07  100.00    1.00
212    1400         65.41     12.39  100.00  100.00  100.00    1.00
249    1600          0.00      0.00  100.00  100.00  100.00    1.00
294    1800        270.66     12.26  100.

In [68]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [69]:
doc = nlp_ner(extracted_text)

In [70]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,1000-15088
1,CLEANING PERIOD,01/05/2023-31/05/2023
2,PHONE NUMBER,(000) 123 456 7890
3,SUBTOTAL,$
4,TOTAL,$929.50
5,TAX,10
6,TAX,$84.50
7,TOTAL,$
8,ACCOUNT NUMBER,0123 0000 1111 2323\nSort Code 25-88-00


In [71]:
# Specify the indices of the rows to be removed
indices_to_remove = [3,5,7]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,1000-15088
1,CLEANING PERIOD,01/05/2023-31/05/2023
2,PHONE NUMBER,(000) 123 456 7890
3,TOTAL,$929.50
4,TAX,$84.50
5,ACCOUNT NUMBER,0123 0000 1111 2323\nSort Code 25-88-00


In [72]:
text = texts[0]

# Define regex patterns
patterns = { 
    'Street address': r'CLEANING SERVICES\n([\s\S]*?)\n\n',  
    'Email': r'(\S+@\S+\.\S+)', 
    'Website': r'ZIP Code\s+(\S+)'  
}

# Extract data
data = {}
for field, pattern in patterns.items():
    match = re.search(pattern, text)
    if match:
        data[field] = match.group(1).strip()  # Use group(1) for other fields

# Convert to DataFrame
df2 = pd.DataFrame(list(data.items()), columns=['Field', 'Data'])

df2


Unnamed: 0,Field,Data
0,Street address,2001 Street Name
1,Email,cleaningservices@email.com
2,Website,cleaningservices123.com


In [73]:
data=pd.concat([df,df2],axis=0)

In [74]:
# Get text and confidence scores from the PDF
text_confidence_data = get_text_confidence_from_pdf(pdf_path)

# Calculate confidence scores for each field in the DataFrame
data['Confidence'] = calculate_field_confidence(data, text_confidence_data)
data

Unnamed: 0,Field,Data,Confidence
0,INVOICE NUMBER,1000-15088,96.0
1,CLEANING PERIOD,01/05/2023-31/05/2023,92.0
2,PHONE NUMBER,(000) 123 456 7890,96.0
3,TOTAL,$929.50,96.0
4,TAX,$84.50,96.0
5,ACCOUNT NUMBER,0123 0000 1111 2323\nSort Code 25-88-00,96.0
0,Street address,2001 Street Name,96.0
1,Email,cleaningservices@email.com,90.0
2,Website,cleaningservices123.com,91.0


In [75]:
data.to_csv('5.csv',index=False)

In [None]:
#################################################################################

In [76]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\6.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'06/10/2021\n\n. INVO-005\nfong"Y ~~ Sample Invoice\nle\nBilling Information Shipping Information\nCompany Name Name\nABC Company John Smith Sam K. Smith\nAddress Address\n111 Pine Street, Suite 1815 111 Pine Street, Suite 1815\nSan Francisco, CA, 94111 San Francisco, CA, 94111\nPhone Number\n(123) 123-1232\nEmail\nJohn@example.com\nDescription Quantity Unit Price Total\nProduct/Service 1 ‘Sink 2 100 $200\nProduct/Service2 Nest Smart Filter 1 150 $150\nProduct/Service 3 Labor Fee 1 50 $50\nProduct/Service 4 Service Fee 1 25 $25\n\nTotal: $425\n\n\n'

In [77]:
# Load the training data
with open('train_data6.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [78]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data6.spacy")

100%|██████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 2465.58it/s]


In [83]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data6.spacy --paths.dev ./train_data6.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     75.57    1.77    1.85    1.69    0.02
 21     200       2151.23   4513.23   95.80   95.00   96.61    0.96
 44     400        179.09    228.17   99.15  100.00   98.31    0.99
 70     600         18.19     89.05   99.15  100.00   98.31    0.99
 98     800        178.38    123.17   99.15  100.00   98.31    0.99
131    1000        274.80    135.03   99.15  100.00   98.31    0.99
165    1200        445.95    147.22   99.15  100.00   98.31    0.99
202    1400        279.72    142.08   99.15  100.00   98.31    0.99
236    1600        369.53    124.32   99.15  100.00   98.31    0.99
281    1800       1033.34    160.01  100.

In [84]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [85]:
doc = nlp_ner(extracted_text)

In [86]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,PRODUCT,06/10/2021\n\n.
1,INVOICE NUMBER,INVO-005
2,COMPANY,ABC Company
3,NAME,John Smith
4,NAME,Sam K. Smith
5,ADDRESS,"111 Pine Street, Suite 1815"
6,ADDRESS,"111 Pine Street, Suite 1815"
7,ADDRESS,"San Francisco, CA, 94111"
8,ADDRESS,"San Francisco, CA, 94111"
9,PHONE NUMBER,(123) 123-1232\nEmail\n


In [87]:
# Specify the indices of the rows to be removed
indices_to_remove = [0,6,8,10,11,12,13,14]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,INVO-005
1,COMPANY,ABC Company
2,NAME,John Smith
3,NAME,Sam K. Smith
4,ADDRESS,"111 Pine Street, Suite 1815"
5,ADDRESS,"San Francisco, CA, 94111"
6,PHONE NUMBER,(123) 123-1232\nEmail\n
7,TOTAL,Total: $425


In [88]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,INVOICE NUMBER,INVO-005,67.0
1,COMPANY,ABC Company,80.0
2,NAME,John Smith,76.0
3,NAME,Sam K. Smith,92.0
4,ADDRESS,"111 Pine Street, Suite 1815",93.6
5,ADDRESS,"San Francisco, CA, 94111",93.25
6,PHONE NUMBER,(123) 123-1232\nEmail\n,62.666667
7,TOTAL,Total: $425,85.0


In [89]:
df.to_csv('6.csv',index=False)

In [None]:
##################################################################################

In [93]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\7.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'INVOICE\n\nEast Repair Inc.\n1912 Harvest Lane\nNew York, NY 12210\n\nBILLTO SHIP TO INVOICE # US-001\n\nJohn Smith John Smith INVOICE DATE 1102/2019\n2 Court Square 3787 Pineview Drive Pow\n\nNew York, NY 12210 Cambridge, MA 12210 be 23f2i2010\nDUE DATE 26/02/2019\n\nay DESCRIPTION UNIT PRICE ‘AMOUNT\n\n1 Front and rear brake cables 100.00 100.00\n\n2 New set of pedal arms 18.00 30.00\n\n3 Labor 3hrs 5.00 16.00\n\nSubtotal 148,00\n\nSales Tax 6.25% 9.06\n\nTOTAL $154.06\n\nSmith.\n\noh b Payment is due within 15 days\nyou Please make checks payable to: East Repair Inc.\n\n'

In [90]:
# Load the training data
with open('train_data7.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [91]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data7.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 3250.56it/s]


In [97]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data7.spacy --paths.dev ./train_data7.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     72.08    3.40    3.85    3.05    0.03
 19     200       5632.13   4605.60   93.89   93.89   93.89    0.94
 41     400        102.73    284.65   98.45  100.00   96.95    0.98
 66     600         59.99    205.20   97.69   98.45   96.95    0.98
 93     800        114.59    223.56   98.46   99.22   97.71    0.98
124    1000        250.79    247.53   98.46   99.22   97.71    0.98
157    1200       1400.44    311.62   98.45  100.00   96.95    0.98
190    1400        495.75    232.78   98.46   99.22   97.71    0.98
226    1600        429.78    151.61   99.24   99.24   99.24    0.99
271    1800        253.48    167.77   99.

In [98]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [99]:
doc = nlp_ner(extracted_text)

In [100]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,COMPANY,East Repair
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210"
2,INVOICE NUMBER,US-001
3,NAME,John Smith
4,INVOICE DATE,1102/2019
5,ADDRESS,2 Court Square 3787 Pineview Drive Pow
6,ADDRESS,"New York, NY 12210 Cambridge, MA 12210"
7,DUE DATE,26/02/2019
8,SUBTOTAL,14800
9,SALES TAX,%


In [101]:
# Specify the indices of the rows to be removed
indices_to_remove = [9,10,11,12]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,COMPANY,East Repair
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210"
2,INVOICE NUMBER,US-001
3,NAME,John Smith
4,INVOICE DATE,1102/2019
5,ADDRESS,2 Court Square 3787 Pineview Drive Pow
6,ADDRESS,"New York, NY 12210 Cambridge, MA 12210"
7,DUE DATE,26/02/2019
8,SUBTOTAL,14800


In [102]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,COMPANY,East Repair,95.5
1,ADDRESS,"1912 Harvest Lane\nNew York, NY 12210",78.428571
2,INVOICE NUMBER,US-001,0.0
3,NAME,John Smith,67.5
4,INVOICE DATE,1102/2019,78.0
5,ADDRESS,2 Court Square 3787 Pineview Drive Pow,85.0
6,ADDRESS,"New York, NY 12210 Cambridge, MA 12210",74.285714
7,DUE DATE,26/02/2019,25.0
8,SUBTOTAL,14800,0.0


In [103]:
df.to_csv('7.csv',index=False)

In [None]:
##############################################################################

In [15]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\8.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'Invoice\n\nYOUR INFORMATION\n\nJohn Smith\n123 Main Street, Anytown, USA\n\njohnsmith@example.com\n\nISSUED ON\n\n12/10/2023\n\nITEM DESCRIPTION\n\nDesign and Planning\nConstruction Services\nInstallation\nMaintenance\n\nConsultation\n\n@\n\nYOUR LOGO\n\nCLIENT INFORMATION\n\nJane Doe\n456 Elm Street, Anycity, USA\n\njanedoe@example.com\n\nDUE DATE\n\n12/11/2023\n\nINIT PRICE\n\n$500\n$1500\n$300\n$200\n$250\n\nTotal Amount Due: $3,450\n\nes\n\n\\\n\nThank you for choosing our services! We appreciate the opportunity to work with you. Please make the payment\nwithin 30 days of receiving this invoice. If you have any questions or concerns regarding the invoice, feel free to\ncontact us at the provided email address. We look forward to serving you again in the future.\n\n\n'

In [16]:
# Load the training data
with open('train_data8.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [17]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data8.spacy")

100%|██████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 9028.64it/s]


In [18]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data8.spacy --paths.dev ./train_data8.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     65.15   17.39   14.29   22.22    0.17
200     200         39.60   1498.61  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00    1.00
1800    1800          0.00      0.00 

In [19]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [20]:
doc = nlp_ner(extracted_text)

In [21]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,TOTAL,INFORMATION\n\n
1,NAME,John Smith
2,ADDRESS,"123 Main Street, Anytown, USA"
3,EMAIL,johnsmith@example.com
4,DUE DATE,ISSUED
5,INVOICE DATE,12/10/2023
6,TOTAL,ITEM DESCRIPTION
7,ADDRESS,CLIENT INFORMATION\n\nJane Doe\n456 Elm Street...
8,EMAIL,janedoe@example.com
9,DUE DATE,12/11/2023


In [22]:
# Specify the indices of the rows to be removed
indices_to_remove = [0,4,6,7,10,12,13,14,15,16,17,18,19]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,NAME,John Smith
1,ADDRESS,"123 Main Street, Anytown, USA"
2,EMAIL,johnsmith@example.com
3,INVOICE DATE,12/10/2023
4,EMAIL,janedoe@example.com
5,DUE DATE,12/11/2023
6,TOTAL,"$3,450"


In [23]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,NAME,John Smith,96.0
1,ADDRESS,"123 Main Street, Anytown, USA",95.4
2,EMAIL,johnsmith@example.com,91.0
3,INVOICE DATE,12/10/2023,92.0
4,EMAIL,janedoe@example.com,80.0
5,DUE DATE,12/11/2023,75.0
6,TOTAL,"$3,450",96.0


In [24]:
df.to_csv('8.csv',index=False)

In [None]:
############################################################################

In [25]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\9.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'From\nSaldo Apps\n\nJohn smith\nwiz@saldoappscom\n0296070807\nsaldoapps.com\n\nFirst str,28~-32, Chicago, USA\n\nHourly Invoice\n\n01\n” stat 2022\nst 9th 2028\n\nBill to\nShepard corp.\n\nshepard@maiicom\n80208979507\nNorth str, $2, Chicago, USA\n\nShip to\n\nNorth str, 32, Chicago, USA\n“Track #: ROB0296979597\n\nPrototype 2023045000\n\nPrototype-bosed programming «ste\nof enjct-aientad programming\n\nDesign 2023048000\n\nPayment instruction\n\nPaypal erait\nwiz@saldoapps.com\n\nMake checks payable to\nsdonn smith\n\nfeank Transfer\nRouting (aBA)-asni20084\nNotes\n\nPrototype-based programming isa sive of\n‘object-oriented programming in which behaviour\n\nzosox 2050% 2028045000\n\n20so% 2050% ——20.280,45000\n‘subtotal: uso 2000.00\nDiscount (20%): usp. 000\nshipping Cort usp.apo\nSoles Tox ‘usp 450.00\nTotat sp 8,480.00\n‘Amount pai usa\nBalance Due: usp 8,480.00\n\n'

In [26]:
# Load the training data
with open('train_data9.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [27]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data9.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 4071.48it/s]

Skipping entity
Skipping entity





In [28]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data9.spacy --paths.dev ./train_data9.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     83.14    0.00    0.00    0.00    0.00
100     200         52.33   1958.63  100.00  100.00  100.00    1.00
271     400          0.00      0.00  100.00  100.00  100.00    1.00
471     600          0.00      0.00  100.00  100.00  100.00    1.00
671     800          0.00      0.00  100.00  100.00  100.00    1.00
871    1000          0.00      0.00  100.00  100.00  100.00    1.00
1071    1200          0.00      0.00  100.00  100.00  100.00    1.00
1271    1400          0.00      0.00  100.00  100.00  100.00    1.00
1471    1600          0.00      0.00  100.00  100.00  100.00    1.00
1671    1800          0.00      0.00  

In [29]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [30]:
doc = nlp_ner(extracted_text)

In [31]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,ADDRESS,From\nSaldo
1,NAME,John smith
2,WEBSITE,wiz@saldoappscom
3,PHONE NUMBER,0296070807
4,WEBSITE,saldoapps.com
5,ADDRESS,"First str,28~-32, Chicago, USA"
6,COMPANY,Hourly Invoice
7,TRACKING NUMBER,”
8,TOTAL,9th 2028
9,COMPANY,Shepard corp


In [32]:
# Specify the indices of the rows to be removed
indices_to_remove = [0,6,7,8,13,14,15,16,17,18,19,22,24,25,26,27,28,29,33]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,NAME,John smith
1,WEBSITE,wiz@saldoappscom
2,PHONE NUMBER,0296070807
3,WEBSITE,saldoapps.com
4,ADDRESS,"First str,28~-32, Chicago, USA"
5,COMPANY,Shepard corp
6,EMAIL,shepard@maiicom
7,PHONE NUMBER,80208979507
8,ADDRESS,"North str, $2, Chicago, USA"
9,COMPANY,Paypal erait


In [33]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)

df

Unnamed: 0,Field,Data,Confidence
0,NAME,John smith,60.0
1,WEBSITE,wiz@saldoappscom,15.0
2,PHONE NUMBER,0296070807,0.0
3,WEBSITE,saldoapps.com,53.0
4,ADDRESS,"First str,28~-32, Chicago, USA",69.5
5,COMPANY,Shepard corp,48.0
6,EMAIL,shepard@maiicom,39.0
7,PHONE NUMBER,80208979507,0.0
8,ADDRESS,"North str, $2, Chicago, USA",69.6
9,COMPANY,Paypal erait,8.5


In [34]:
df.to_csv('9.csv',index=False)

In [None]:
#########################################################################

In [35]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\10.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'Date: 2020-02-04\nInvoice #: 1234567\nPO #: 12-19203\n\nTO Widget Tec\n\n988 1688 1 562 988 1688\n\nSHIPPING\nMETHOD\n\nSHIPPING\n\nJOB TERMS\n\nStandard\n\nImplementation\n\nSignature\n\n1TB Hard Drive\n\nServices\n\n3545 Long Beach Bivd.\nLong Beach, CA 90807\n\nCustomer ID: WT 1045\n\n| 2016-01-04\n\nUNIT PRICE\n\nDELIVERY DATE\n\nADVANCED\nSYSTEMS\n\nOnward. Lioward.\n\nSHIPTO Widget Tec\n\n35 mn ch Blvd.\n\nCustomer [D: WT1045\n\nTERMS DUE DATE\n\nPAYMENT |\n\n| Due on receipt | 2016-01-18\n\nLINE TOTAL\n\nDISCOUNT\n\n$106 $636\n\n$158\n\n$158\n\nSUBTOTAL $794\nSALES TAX $63.52\nSHIPPING $794\nTOTAL $944.52\n\nMake all checks payable to Advanced Systems\n\nThank you for your business!\n\n'

In [36]:
# Load the training data
with open('train_data10.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [37]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data10.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 2651.60it/s]


In [38]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data10.spacy --paths.dev ./train_data10.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     87.26    0.00    0.00    0.00    0.00
 80     200        114.48   2819.85   97.96   97.96   97.96    0.98
180     400         10.12     54.26  100.00  100.00  100.00    1.00
280     600          0.00      0.00  100.00  100.00  100.00    1.00
423     800          0.00      0.00  100.00  100.00  100.00    1.00
623    1000          0.00      0.00  100.00  100.00  100.00    1.00
823    1200          0.06      0.05  100.00  100.00  100.00    1.00
1023    1400          1.97      3.52  100.00  100.00  100.00    1.00
1223    1600          0.00      0.00  100.00  100.00  100.00    1.00
1423    1800          0.00      0.00  1

In [39]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [40]:
doc = nlp_ner(extracted_text)

In [41]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,INVOICE DATE,2020-02-04
1,INVOICE NUMBER,1234567
2,PO NUMBER,12-19203
3,COMPANY,Widget Tec
4,PHONE NUMBER,1 562 988 1688
5,COMPANY,Standard\n\n
6,NAME,TB Hard
7,ADDRESS,3545 Long Beach Bivd
8,ADDRESS,"Long Beach, CA 90807"
9,CUSTOMER ID,WT 1045


In [42]:
# Specify the indices of the rows to be removed
indices_to_remove = [5,6,11,12,14,16,17,18,19]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,INVOICE DATE,2020-02-04
1,INVOICE NUMBER,1234567
2,PO NUMBER,12-19203
3,COMPANY,Widget Tec
4,PHONE NUMBER,1 562 988 1688
5,ADDRESS,3545 Long Beach Bivd
6,ADDRESS,"Long Beach, CA 90807"
7,CUSTOMER ID,WT 1045
8,DATE,2016-01-04
9,COMPANY,Widget Tec


In [43]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,INVOICE DATE,2020-02-04,95.0
1,INVOICE NUMBER,1234567,96.0
2,PO NUMBER,12-19203,90.0
3,COMPANY,Widget Tec,88.5
4,PHONE NUMBER,1 562 988 1688,94.0
5,ADDRESS,3545 Long Beach Bivd,71.25
6,ADDRESS,"Long Beach, CA 90807",95.5
7,CUSTOMER ID,WT 1045,30.0
8,DATE,2016-01-04,96.0
9,COMPANY,Widget Tec,88.5


In [44]:
df.to_csv('10.csv',index=False)

In [None]:
#########################################################################################

In [45]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\11.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'DATA Invoice\nDESIGN\n\nInvoice Number: DD703\nEmpower Your World Date: 2020-02-01\nPO Number: 12-3456\n\nTo Widget Tec\n3545 Long Beach Blvd.\n\nLong Beach, CA, 90807\n\n1 562 988 1688\n\nCustomer ID: WT420\n\nPayment Terms\n\nJeremy Anderson #DD703 Due on receipt\n\nDescription\n\n3 Xerox Scanner\n\n$913 $2739\n20 Filing Cabinet $363 $7260\nSubtotal $9999\nSales Tax $799.92\nShipping $9999\nTotal $10859.92\n\nMake all checks payable to Data Design\n\nThank you for your business!\n\nData Design | 3329 Marcel Avenue, Long Beach, CA 90807 | p: 1 562 988 1688 | info@datadesign.com\n\n'

In [46]:
# Load the training data
with open('train_data11.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [47]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data11.spacy")

100%|████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 3295.02it/s]


In [48]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data11.spacy --paths.dev ./train_data11.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     82.34    1.56    1.59    1.54    0.02
 60     200        189.11   3135.52   98.46   98.46   98.46    0.98
127     400         18.32    114.06  100.00  100.00  100.00    1.00
220     600          0.00      0.00  100.00  100.00  100.00    1.00
320     800          0.00      0.00  100.00  100.00  100.00    1.00
420    1000          0.00      0.00  100.00  100.00  100.00    1.00
603    1200          0.00      0.00  100.00  100.00  100.00    1.00
803    1400          0.00      0.00  100.00  100.00  100.00    1.00
1003    1600          0.00      0.00  100.00  100.00  100.00    1.00
1203    1800          0.00      0.00  10

In [49]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [50]:
doc = nlp_ner(extracted_text)

In [51]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,DD703
1,INVOICE DATE,2020-02-01
2,PO NUMBER,12-3456
3,COMPANY,Widget Tec
4,ADDRESS,3545 Long Beach Blvd
5,ADDRESS,"Long Beach, CA, 90807"
6,PHONE NUMBER,1 562 988 1688
7,CUSTOMER ID,WT420
8,NAME,Jeremy Anderson
9,JOB ID,DD703


In [52]:
# Specify the indices of the rows to be removed
indices_to_remove = [9,10,11,13,16]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,INVOICE NUMBER,DD703
1,INVOICE DATE,2020-02-01
2,PO NUMBER,12-3456
3,COMPANY,Widget Tec
4,ADDRESS,3545 Long Beach Blvd
5,ADDRESS,"Long Beach, CA, 90807"
6,PHONE NUMBER,1 562 988 1688
7,CUSTOMER ID,WT420
8,NAME,Jeremy Anderson
9,SUBTOTAL,$9999


In [53]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
# Calculate confidence scores for each field in the DataFrame
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,INVOICE NUMBER,DD703,92.0
1,INVOICE DATE,2020-02-01,96.0
2,PO NUMBER,12-3456,96.0
3,COMPANY,Widget Tec,96.0
4,ADDRESS,3545 Long Beach Blvd,72.0
5,ADDRESS,"Long Beach, CA, 90807",96.0
6,PHONE NUMBER,1 562 988 1688,88.75
7,CUSTOMER ID,WT420,91.0
8,NAME,Jeremy Anderson,0.0
9,SUBTOTAL,$9999,96.0


In [54]:
df.to_csv('11.csv',index=False)

In [None]:
##################################################################

In [55]:
# Paths
pdf_path = 'D:\\Assignment\\TestDatav2\\12.pdf'

# Convert PDF to images
images = pdf_to_images(pdf_path)

# Extract text from images
texts = extract_text_from_images(images)
# Store the extracted texts in a single string variable
extracted_text = ""
for text in texts:
    extracted_text += text + "\n"  # Adding a newline for separation between texts
extracted_text

'i, LOGIC\n\nCORPORATION\n\nLogic Corporation\n6523 Russell Street, Long Beach, CA 90807\n1 562 988 1688\n\ninfo(Wlogicorp.com\n\nTO Widget Tec\n3545 Long Beach Blvd.\nLong Beach, CA 90807\n1 800 985 8533\n\nCustomer ID WT416\nSALESPERSON JOB\nBr dgett € Chow LC303\n\nQTy DESCRIPTION\n\n|\n\nDate: 3 1/2014\nInvoice #: |, 30757\n\npo #) (a3 t520! eee\n\n\n'

In [56]:
# Load the training data
with open('train_data12.json') as f:
    TRAIN_DATA = json.load(f)

# Remove None entries from annotations
cleaned_annotations = [annotation for annotation in TRAIN_DATA['annotations'] if annotation is not None]

# Update the train_data dictionary
TRAIN_DATA['annotations'] = cleaned_annotations

In [57]:
# Process the training data
for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        continue  # Skip None values
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin object
db.to_disk("./train_data12.spacy")

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 218.32it/s]


In [58]:
# Train the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_data12.spacy --paths.dev ./train_data12.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     81.80    0.00    0.00    0.00    0.00
 45     200        244.70   3122.59   98.67   98.67   98.67    0.99
100     400         22.91    166.61  100.00  100.00  100.00    1.00
165     600          0.00      0.00  100.00  100.00  100.00    1.00
250     800          0.00      0.00  100.00  100.00  100.00    1.00
350    1000          0.00      0.00  100.00  100.00  100.00    1.00
450    1200          0.00      0.00  100.00  100.00  100.00    1.00
620    1400          0.01      0.00  100.00  100.00  100.00    1.00
820    1600          4.48      1.38  100.00  100.00  100.00    1.00
1020    1800        114.48     46.45  100

In [59]:
# Load the trained NER model
nlp_ner = spacy.load("model-best")

In [60]:
doc = nlp_ner(extracted_text)

In [61]:
# Extract entities and create a DataFrame
entities = [(ent.label_, ent.text) for ent in doc.ents]
df = pd.DataFrame(entities, columns=['Field', 'Data'])
df

Unnamed: 0,Field,Data
0,COMPANY,LOGIC\n\nCORPORATION
1,COMPANY,Logic Corporation
2,ADDRESS,"6523 Russell Street, Long Beach, CA 90807"
3,PHONE NUMBER,1 562 988 1688
4,ADDRESS,3545 Long Beach Blvd
5,ADDRESS,"Long Beach, CA 90807"
6,PHONE NUMBER,1 800 985 8533
7,CUSTOMER ID,WT416
8,INVOICE NUMBER,": |,"
9,PO NUMBER,) (a3 t520!


In [62]:
# Specify the indices of the rows to be removed
indices_to_remove = [8,9]

# Remove the rows
df = df.drop(indices_to_remove)

# Reset the index if needed
df = df.reset_index(drop=True)
df

Unnamed: 0,Field,Data
0,COMPANY,LOGIC\n\nCORPORATION
1,COMPANY,Logic Corporation
2,ADDRESS,"6523 Russell Street, Long Beach, CA 90807"
3,PHONE NUMBER,1 562 988 1688
4,ADDRESS,3545 Long Beach Blvd
5,ADDRESS,"Long Beach, CA 90807"
6,PHONE NUMBER,1 800 985 8533
7,CUSTOMER ID,WT416


In [63]:
text_confidence_data = get_text_confidence_from_pdf(pdf_path)
df['Confidence'] = calculate_field_confidence(df, text_confidence_data)
df

Unnamed: 0,Field,Data,Confidence
0,COMPANY,LOGIC\n\nCORPORATION,95.5
1,COMPANY,Logic Corporation,94.5
2,ADDRESS,"6523 Russell Street, Long Beach, CA 90807",94.285714
3,PHONE NUMBER,1 562 988 1688,89.75
4,ADDRESS,3545 Long Beach Blvd,72.0
5,ADDRESS,"Long Beach, CA 90807",96.0
6,PHONE NUMBER,1 800 985 8533,90.25
7,CUSTOMER ID,WT416,78.0


In [64]:
df.to_csv('12.csv',index=False)