In [None]:
!pip3 install openai

In [None]:
import pandas as pd
from openai import OpenAI

## 1. Initial Text Extraction

#### Using PyMuPDF

In [None]:
import fitz  # Import the PyMuPDF library

def extract_text_pymupdf(pdf_path):
    """Return the text of every page of a PDF, read with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One string containing each page's ``get_text()`` output, in page order.
    """
    doc = fitz.open(pdf_path)  # Open the PDF file
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Close the document even when extracting a page raises.
        doc.close()

# Select the PDF to process: leave exactly one `pdf_path` assignment uncommented.
#pdf_path = '/Users/juliamarkusiewicz/Documents/research assistant work code/PDF_Mining_Julia/Jaarrekening-2022-DG-Press-HoldinG-B.V.pdf'
pdf_path = '/Users/juliamarkusiewicz/Documents/research assistant work code/PDF_Mining_Julia/Jaarrekening-Momo-Medical-Holding-B.V.-2022.pdf'
#pdf_path = '/Users/juliamarkusiewicz/Documents/research assistant work code/PDF_Mining_Julia/Jaarrekening-2022-Informed-IT-Holding-B.V.pdf'

# Extract once and reuse the result: the three names used by later cells
# all refer to the same (immutable) string, so there is no need to read
# the PDF three times.
raw_text = extract_text_pymupdf(pdf_path)
text = raw_text
text_data = raw_text
print(text)

#### Using PyPDF2

In [None]:
import PyPDF2

def extract_text_pypdf2(pdf_path):
    """Return the text of every page of a PDF, read with PyPDF2.

    The file is opened in binary mode and each page's ``extract_text()``
    result is followed by a newline, in page order.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Pages are read while the file is still open.
        return "".join(pdf_page.extract_text() + "\n" for pdf_page in pdf_pages)

# Example usage
# Note: this rebinds `pdf_path`, `text` and `raw_text`; `text_data` keeps the
# PyMuPDF extraction assigned earlier.
pdf_path = '/Users/juliamarkusiewicz/Documents/research assistant work code/PDF_Mining_Julia/Jaarrekening-2022-DG-Press-HoldinG-B.V.pdf'
# Extract once; both names refer to the same string.
text = extract_text_pypdf2(pdf_path)
raw_text = text
print(text)

### Table Extraction

In [None]:
from tabula import read_pdf

def extract_tables_tabula(pdf_path):
    """Print every table tabula finds in a PDF and return the tables.

    Args:
        pdf_path: Path of the PDF file to scan (all pages are read).

    Returns:
        The list of DataFrames produced by ``read_pdf``. Previously the
        function only printed them and returned None, so callers could not
        use the tables.
    """
    dfs = read_pdf(pdf_path, pages='all', multiple_tables=True)

    for i, df in enumerate(dfs):
        print(f"Table {i+1}:")
        print(df)  # Print each table
        # You can also save the DataFrame to a CSV or Excel
        # df.to_csv(f'table_{i+1}.csv') # Uncomment to save to CSV

    return dfs

# Example usage
pdf_path = '/Users/juliamarkusiewicz/Documents/research assistant work code/PDF_Mining_Julia/Jaarrekening-2022-DG-Press-HoldinG-B.V.pdf'
# Run the extraction once and bind the result to `tables` rather than `text`,
# so the extracted document text that later cells slice is not overwritten.
tables = extract_tables_tabula(pdf_path)

## 2. Extracting, Categorising, and Processing data 

In [None]:
#correct

# Character cap on an accumulated description: parse_adjusted_financial_text
# discards the description it is building once it grows past this length.
MAX_DESCRIPTION_LENGTH = 180
# Descriptions longer than this many characters are sent to the LLM to be
# replaced by a short category name.
LLM_length=100

def is_number(s: str) -> tuple:
    """Classify a line of extracted text as numeric or not.

    Surrounding whitespace and one pair of enclosing parentheses are removed,
    then every comma and dot is dropped. A lone "-" counts as zero.

    Args:
        s: One line of text.

    Returns:
        A ``(cleaned, is_num)`` pair. ``cleaned`` is always a string: the
        input without whitespace padding, enclosing parentheses, commas and
        dots ("0" for a lone dash). ``is_num`` is True only when ``cleaned``
        consists solely of ASCII digits, so ``int(cleaned)`` is safe.

    Note:
        Parentheses are stripped without negating the value, and decimal
        separators are removed together with thousands separators
        ("12.50" becomes "1250").
    """
    s = s.strip()
    if s == "-":
        # Return a string like every other branch; callers convert with int().
        return "0", True
    if s.startswith("(") and s.endswith(")"):
        s = s[1:-1]
    # NOTE: more cases here
    num = s.replace(",", "").replace(".", "")
    # str.isdigit() alone also accepts characters such as superscript digits,
    # which int() rejects; restrict to ASCII so a footnote marker is not
    # reported as a number.
    return num, num.isascii() and num.isdigit()

def is_year(s) -> bool:
    """Return True when ``int(s)`` lies between 1900 and 2100, both inclusive."""
    year = int(s)
    return year >= 1900 and year <= 2100
def ends_with_percent(s: str) -> bool:
    """Return True when the last character of ``s`` is a percent sign."""
    # Slicing (rather than indexing) keeps the empty string safe.
    return s[-1:] == "%"

def is_page_number(s: str) -> bool:
    """Return True when ``s``, ignoring surrounding whitespace, starts with "Page" or "Pagina"."""
    trimmed = s.strip()
    return trimmed.startswith(("Page", "Pagina"))

def parse_adjusted_financial_text(text):
    """Parse line-oriented financial-statement text into description/value rows.

    The text is read line by line. Non-numeric lines accumulate into a
    description; the first numeric line after it is held as the first value
    and the next numeric line completes the row. Lines ending in "%" and
    bare year tokens are ignored.

    Args:
        text: Extracted document text, one item per line.

    Returns:
        A ``(DataFrame, misc_text_data)`` pair. The DataFrame has the columns
        'Description' (stripped, lower-cased), 'Year 1 EUR' and 'Year 2 EUR'
        (ints). ``misc_text_data`` lists descriptions that were still pending
        at a page marker on a page where no complete row had been found.
    """
    parsed_data = []
    misc_text_data = []

    # True until a complete value pair has been recorded since the last page marker.
    no_numeric_data_on_page = True

    current_description = ""
    value_year1 = ""

    for line in text.strip().split("\n"):
        stripped = line.strip()
        maybe_num, is_num = is_number(line)

        # Skip year headers. Only a bare four-digit token counts as a year:
        # testing the separator-stripped value would also drop real amounts
        # such as "2.000" or "(1,950)".
        if is_num and len(stripped) == 4 and stripped.isascii() and stripped.isdigit() and is_year(stripped):
            continue
        if ends_with_percent(stripped):
            continue

        if is_num:
            if value_year1 == "":
                # First value of the pair: hold it until the second arrives.
                value_year1 = maybe_num
            else:
                parsed_data.append([current_description.strip().lower(), int(value_year1), int(maybe_num)])
                # Reset for the next row.
                current_description = ""
                value_year1 = ""
                no_numeric_data_on_page = False
            continue

        if is_page_number(line):
            # A page marker is never description text. Handling it before the
            # description branch keeps "Page N" from starting a description
            # when none is pending, and resets the per-page tracker on every
            # page.
            if current_description:
                if no_numeric_data_on_page:
                    misc_text_data.append(current_description)
                    current_description = ""
                if len(current_description) > MAX_DESCRIPTION_LENGTH:
                    current_description = ""
            no_numeric_data_on_page = True
            continue

        # We might find long stretches of text unrelated to data labeling;
        # those are handled by capping the description length here (the
        # alternative, extracting a category name with an LLM, happens later).
        if current_description:
            current_description += " " + stripped
            if len(current_description) > MAX_DESCRIPTION_LENGTH:
                current_description = ""
        else:
            current_description = line

    # Convert the parsed data into a DataFrame
    return pd.DataFrame(parsed_data, columns=['Description', 'Year 1 EUR', 'Year 2 EUR']), misc_text_data

In [None]:
# Parse the adjusted text data.
# `text_data` is the PyMuPDF extraction; unlike `text` and `raw_text` it is
# not reassigned by the later extraction cells.
df_parsed, misc_text_data = parse_adjusted_financial_text(text_data)

# Display the parsed DataFrame
display(df_parsed)
#print(df_parsed)

In [None]:
# TODO: find a different way to extract this information; also using the last 500 characters didn't work
# Extracting company information.
# OpenAI-compatible client pointed at a local endpoint.
# NOTE(review): port 11434 and the 'ollama' key suggest a local Ollama server - confirm.
client = OpenAI(base_url = 'http://localhost:11434/v1',api_key='ollama')

def prepare_prompt(text: str) -> list:
    """Build the chat messages that ask the model for company metadata.

    Returns three role/content dicts in order: the system instruction, the
    user message carrying ``text``, and an assistant message.
    """
    # prompt not final, to be adjusted
    # NOTE(review): the assistant text is identical to the one used for
    # category extraction; confirm it is intended for this prompt as well.
    instruction = " extract the name of the company, the two years that the financial statement is about (i.e 2021, 2022), in the order they appear, the currency, and the type of financial statement. Output 5 (not more) variables, separated by commas."
    turns = (
        ("system", instruction),
        ("user", text),
        ("assistant", "category name:"),
    )
    return [{"role": role, "content": content} for role, content in turns]


def output_company_information(text: str) -> str:
    """Send ``text`` to the chat model and return the content of its first choice."""
    completion = client.chat.completions.create(
        # TODO: try on smaller models
        model="gemma:7b",
        messages=prepare_prompt(text),
        temperature=0.4,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# From the first 500 characters of the extracted text, extract company name,
# the two years, currency, and the type of financial statement.
# `text_data` is used because it is the text df_parsed was parsed from;
# `text` is reassigned by the later extraction cells and may no longer hold
# that document's text.
company_information = output_company_information(text_data[:500])
# Keep only the first line of the model output.
company_information = company_information.split("\n")[0]
print(company_information)


# Make a new dataframe with the extracted company information.
company_info_columns = ['Company Name', 'Year 1', 'Year 2', 'Currency', 'Type of Financial Statement']
# Trim each field: the years are used as column labels further on, so stray
# spaces around them would end up in the column names.
company_information = [field.strip() for field in company_information.split(",")]
if len(company_information) != len(company_info_columns):
    # Fail with a message that shows what the model returned.
    raise ValueError(
        f"Expected {len(company_info_columns)} comma-separated fields from the model, "
        f"got {len(company_information)}: {company_information}"
    )
company_information_df = pd.DataFrame([company_information], columns=company_info_columns)
display(company_information_df)

In [None]:
# setup for replacing all outputs longer than XXX characters with a category name extracted by gemma

def prepare_messages(text: str) -> list:
    """Build the chat messages that ask the model for a financial category name.

    Returns the system instruction, the user message carrying ``text`` and an
    assistant message, in that order.
    """
    # prompt not final, to be adjusted
    system_turn = {
        "role": "system",
        "content": "This is a fragment of description from a financial statement. Extract a financial category name from the description. Output only the category and no other text or explanations. If there is no category or not applicable, output only: -.",
    }
    user_turn = {"role": "user", "content": text}
    assistant_turn = {"role": "assistant", "content": "category name:"}
    return [system_turn, user_turn, assistant_turn]

def output_category_name(text: str) -> str:
    """Ask the chat model for a category name and return it lower-cased and stripped."""
    completion = client.chat.completions.create(
        # TODO: try on smaller models
        model="gemma:7b",
        messages=prepare_messages(text),
        temperature=1,
    )
    raw_category = completion.choices[0].message.content
    # Lowercase first, then drop surrounding whitespace.
    return raw_category.lower().strip()


In [None]:
# Rename the 'Year 1 EUR' and 'Year 2 EUR' columns in df_parsed to the values
# under 'Year 1' and 'Year 2' in company_information_df.
year1_label = company_information_df['Year 1'][0]
year2_label = company_information_df['Year 2'][0]
df_parsed = df_parsed.rename(columns={'Year 1 EUR': year1_label, 'Year 2 EUR': year2_label})
df_parsed['Category'] = df_parsed['Description']

# Extract category names for descriptions longer than LLM_length.
# The upper bound is inclusive: a description of exactly
# MAX_DESCRIPTION_LENGTH characters is still a valid candidate.
# Iterating with .items() yields real index labels, which is what .at expects
# (enumerate positions only match labels on a default RangeIndex).
for idx, description in df_parsed['Description'].items():
    if LLM_length < len(description) <= MAX_DESCRIPTION_LENGTH:
        category_name = output_category_name(description)
        # tentative solution, to be deleted once the prompt is adjusted
        category_name = category_name.split("\n")[0]
        df_parsed.at[idx, 'Category'] = category_name
        print(description, "->", category_name)
  


In [None]:
# Show df_parsed, including the Category column filled in by the previous cell.
display(df_parsed)

### Mapping data to ledger

In [None]:
from financial_ledger import financial_ledger as ledger
import jellyfish as jf
from jellyfish import jaro_winkler_similarity as jws
from tqdm import tqdm

In [None]:
# Inspect the imported ledger mapping.
print(ledger)

In [None]:
# TODO: add description of Jaro-Winkler similarity

def add_matching_info_to_df(df, ledger, threshold=0.7):
    """Attach the best Jaro-Winkler ledger match to every row of ``df``.

    Each row's 'Description' is lower-cased and compared with every term in
    the 'English' and 'Dutch' lists of every ledger entry. The key of the
    entry holding the highest-scoring term becomes the match.

    Args:
        df: DataFrame with a 'Description' column; modified in place.
        ledger: Mapping of ledger key to an entry that may contain 'English'
            and/or 'Dutch' lists of terms.
        threshold: Score a match must exceed to be accepted (default 0.7).

    Returns:
        None. Adds 'Best Match_JW' (ledger key, or "No match found") and
        'Match Score_JW' (the best score, recorded even when rejected).
    """
    best_matches = []
    match_scores = []

    for description in df['Description']:
        description = description.lower()
        max_score = 0
        best_match = ""
        for key, ledger_entry in ledger.items():
            # Iterate through both 'English' and 'Dutch' lists if they exist
            for lang in ('English', 'Dutch'):
                if lang not in ledger_entry:
                    continue
                for term in ledger_entry[lang]:
                    # Use the jaro_winkler_similarity alias imported by this
                    # file; the older jellyfish name jaro_winkler is
                    # deprecated and missing from recent releases.
                    score = jws(description, term.lower())
                    if score > max_score:
                        max_score = score
                        best_match = key
        # The score is stored either way; only the label depends on the threshold.
        best_matches.append(best_match if max_score > threshold else "No match found")
        match_scores.append(max_score)

    # Add the match results to the DataFrame
    df['Best Match_JW'] = best_matches
    df['Match Score_JW'] = match_scores

add_matching_info_to_df(df_parsed, ledger)

# Now df_parsed contains two new columns: 'Best Match_JW' and 'Match Score_JW'
display(df_parsed)

# Display only the rows with no match found
# df_no_match = df_parsed[df_parsed['Best Match_JW'] == "No match found"]
# display(df_no_match)


In [None]:
# Used to display all rows of the dataframe when checking output. Not necessary
# to run, as it makes the output very long; mostly for testing purposes.

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Auto-detect the display width
# None means "no limit"; the former -1 spelling was deprecated and is rejected by current pandas.
pd.set_option('display.max_colwidth', None)  # Display full width of columns

display(df_parsed)

In [None]:
#testing cosine similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np


# Flatten every term from the 'English' and 'Dutch' lists of every ledger entry.
ledger_entries = [
    term
    for entry in ledger.values()
    for lang in entry
    if lang in ['English', 'Dutch']
    for term in entry[lang]
]

# The vocabulary is built from the categories followed by the ledger terms.
all_texts = list(df_parsed['Category']) + ledger_entries
vectorizer = TfidfVectorizer().fit(all_texts)

# Vectorize both sides with the shared vocabulary.
description_vectors = vectorizer.transform(df_parsed['Category'])
ledger_vectors = vectorizer.transform(ledger_entries)

# One row per category, one column per ledger term.
similarity_scores = cosine_similarity(description_vectors, ledger_vectors)

# Per row: position and value of the highest score. A match is accepted only
# when the score exceeds 0.7.
row_best_index = similarity_scores.argmax(axis=1)
row_best_score = similarity_scores.max(axis=1)
best_matches = [
    ledger_entries[position] if score > 0.7 else "No match found"
    for position, score in zip(row_best_index, row_best_score)
]
df_parsed['Best Match_Cosine'] = best_matches
df_parsed['Highest Match Score_Cosine'] = row_best_score
display(df_parsed)

# Display a summary of match scores to help decide on a threshold
#print(df_parsed[['Description', 'Best Match', 'Highest Match Score']].head(20))
# Adjust threshold based on inspection
#threshold = 0.7
#df_parsed['Best Match Adjusted'] = [ledger_entries[np.argmax(row)] if max(row) > threshold else "No match found" for row in similarity_scores]

# Review adjustments
#display(df_parsed[['Category', 'Best Match Adjusted', 'Highest Match Score']])

# output only the rows where the best match is not found
#no_match_found = df_parsed[df_parsed['Best Match Adjusted'] == 'No match found']
#display(no_match_found)


## 3. Creating output

In [None]:
# Delete every row of df_parsed whose Jaro-Winkler match or cosine match is
# "No match found" or "not applicable".
# TODO: check if when there is no match from one method, if there is one from the other

unmatched_labels = ["No match found", "not applicable"]
has_both_matches = ~(
    df_parsed['Best Match_JW'].isin(unmatched_labels)
    | df_parsed['Best Match_Cosine'].isin(unmatched_labels)
)
df_parsed = df_parsed[has_both_matches]

# For the remaining rows, make a new frame with the category name, the two year
# values and the match columns. `.copy()` makes df_output an independent frame,
# so the ledger columns added afterwards are written to it directly and do not
# trigger pandas' SettingWithCopyWarning.
df_output = df_parsed[['Category', company_information_df['Year 1'][0], company_information_df['Year 2'][0], 'Best Match_JW', 'Match Score_JW', 'Best Match_Cosine', 'Highest Match Score_Cosine']].copy()

# Based on the best match in the ledger, add balance, id, category, statement
# type and postencode from the ledger to each row.
def add_ledger_info_to_df(df, ledger):
    """Copy ledger fields onto ``df`` using each row's 'Best Match_JW' key.

    Args:
        df: DataFrame with a 'Best Match_JW' column; modified in place.
        ledger: Mapping of ledger key to an entry containing 'balance', 'id',
            'category', 'statement_type' and 'postencode'.

    Returns:
        None. Sets the columns 'Balance', 'ID', 'Category', 'Statement Type'
        and 'Postencode' on ``df`` (an existing 'Category' column is
        overwritten with the ledger category). Rows whose key is not in the
        ledger get None in every added column, so the column lengths always
        match the frame.
    """
    # Output column -> field name inside a ledger entry.
    ledger_fields = {
        'Balance': 'balance',
        'ID': 'id',
        'Category': 'category',
        'Statement Type': 'statement_type',
        'Postencode': 'postencode',
    }
    collected = {column: [] for column in ledger_fields}

    for best_match in df['Best Match_JW']:
        # Direct lookup instead of scanning every ledger entry per row.
        ledger_entry = ledger.get(best_match)
        for column, field in ledger_fields.items():
            collected[column].append(None if ledger_entry is None else ledger_entry[field])

    # Write to the frame that was passed in, not to a global.
    for column, values in collected.items():
        df[column] = values

# Enrich df_output with the ledger fields of each row's 'Best Match_JW' key, then show it.
add_ledger_info_to_df(df_output, ledger)
display(df_output)


## 4. Data Cleaning and Classifying

### Classifying Data 


In [None]:
def classify_table(df):
    """Label a table by the first known keyword found among its column names.

    Returns 'Inventory', 'Non-Current Assets', or 'Unknown' when no keyword
    is present in ``df.columns``.
    """
    # Example of simple keyword-based classification; earlier pairs take priority.
    keyword_labels = (
        ('Total of inventories', 'Inventory'),
        ('Total of non-current assets', 'Non-Current Assets'),
    )
    for keyword, label in keyword_labels:
        if keyword in df.columns:
            return label
    return 'Unknown'

## 5. Data Validation


In [None]:
# Consistency check ...

# Balance sheet validation rule: Assets = Liabilities + Equity
# NOTE(review): total_assets, total_liabilities and equity are not assigned in
# any visible cell of this notebook - they must be defined before this cell
# runs, otherwise it raises NameError. TODO confirm where they come from.
# NOTE(review): the comparison is exact equality; if the totals are floats a
# tolerance may be needed - confirm the value types.
if total_assets == total_liabilities + equity:
    print("The balance sheet balances.")
else:
    print("There is a discrepancy in the balance sheet.")
    