## pymupdf PDF miner script 
### Extact lines of text from pdf file by page number, convert it into a format for taxonomy update and merge mycobank data. Output saved to excel

In [None]:
pip install pymupdf

In [1]:
import pymupdf
import pandas as pd
import re
import numpy as np

doc = pymupdf.open("2024__The2024OutlineofFungiandfungus-liketaxa.pdf") 
#out = open("output.txt", "wb") # create a text output

def extract_text_from_page_span(doc, start_page, end_page):
    text = ""
    for page in doc.pages(start_page, end_page, 1): 
        #text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
        text += page.get_text(sort=True) #preserves the ws at the start of the line
        #text += page.get_text()  #collapses ws at the start of the line
    return text

def join_indented_lines(text):
    lines = text.splitlines()
    result = []
    previous_line = None

    for line in lines:
        #if line.startswith((' ', '\t')):  # Check for indentation (spaces or tabs)
        if line.startswith(' '):  # Check for indentation (spaces or tabs)
            if previous_line is not None:
                result[-1] = result[-1] + ' ' + line.lstrip()  # Append to previous
            else:
                result.append(line.lstrip()) #if the first line is indented, append it directly
        else:
            result.append(line)
            previous_line = line

    return '\n'.join(result)


start_page = 40
end_page = 332
extracted_text = extract_text_from_page_span(doc, start_page, end_page)
#extracted_text = re.sub(r",\s*\n", " ", extracted_text)  #finds lines ending in a comma and appends them to the previous line
text_lines = extracted_text.splitlines()
result_text = join_indented_lines(extracted_text)
#print(result_text)
revised_text= (re.sub(r"\s\d{4}", "", result_text)) 
#print(revised_text)

### Load text to pandas

In [2]:
pd.set_option('display.width', 2000) 
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', None)

text_lines = revised_text.splitlines()

def create_dataframe(text_lines):
    df = pd.DataFrame(text_lines, columns=['Text'])
    return df

df = create_dataframe(text_lines)
df = df.iloc[6:]
df.head(20)

Unnamed: 0,Text
6,"Phylum APHELIDIOMYCOTA Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov"
7,"Class Aphelidiomycetes Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov"
8,"Aphelidiales Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov*Fp78"
9,"Aphelidiaceae Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov"
10,Amoeboaphelidium Scherff. (5)
11,Aphelidium Zopf (10)
12,"Paraphelidium Karpov, Moreira, López-García (2)"
13,Protaphelidium Seliuk & Karpov (1)*Seliuk & Karpov
14,Pseudaphelidium Schweik. & Schnepf (1)
15,


#### Modify the extracted text now in pandas df

#### Extract rank from text

In [3]:
def add_column_if_list_member_found(df, column_to_check, list_to_search, new_column_name):
    for index, row in df.iterrows():
        for item in list_to_search:
            if item in row[column_to_check]:
                df.loc[index, new_column_name] = item
                df.loc[index, column_to_check] = row[column_to_check].replace(item, '').strip()
                break # Stop searching after the first match                
    return df
search_rank = ['Phylum', 'Class', 'Subphylum']

df = add_column_if_list_member_found(df, 'Text', search_rank, 'Rank')
print(df)

                                                                                                                    Text    Rank
6        APHELIDIOMYCOTA Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov  Phylum
7       Aphelidiomycetes Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov   Class
8      Aphelidiales Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov*Fp78     NaN
9          Aphelidiaceae Tedersoo, Sanchez-Ramirez, Kõljalg, Bahram, M. Döring, Schigel, T.W. May, M. Ryberg & Abarenkov     NaN
10                                                                                         Amoeboaphelidium Scherff. (5)     NaN
...                                                                                                                  ...     ...
15148                                                                 Unemaeea Tedersoo, nom. inv

### Find and populate Organism name column

In [24]:
#Try this
pattern = 'incertae sedis'
pattern2 = r"(.+?incertae sedis)"
df['OrgName'] = np.nan
df['Authority'] = np.nan

for index, row in df.iterrows():
    if isinstance(row['Text'], str) and 'incertae sedis' in row['Text']:
        #print(f"Row {index}: Contains 'incertae sedis': {row['Text']}")
        #works but the split needs to be adjusted
        #df[['OrgName', 'Authority']] = df['Text'].str.split('incertae sedis', n=1, expand=True)
        df[['OrgName', 'Authority']] = df['Text'].str.extract(pattern2, expand=False)
                                        

    else:
        #print(f"Row {index}: Does not contain 'incertae sedis': {row['Text']}")
        df[['OrgName', 'Authority']] = df['Text'].str.split(' ', n=1, expand=True)

    #df.loc[df['Text'].str.contains(r"(.*" + pattern + ")"), ['OrgName', 'Authority']] = df.loc[df['Text'].str.contains(r"(.*" + pattern + ")"), 'Text'].str.split((r"(.*" + pattern + ")"), expand=True)



#df['OrgName']= df['Text'].str.split(r"(.*" + pattern + ")", expand=False)
#df['OrgName']= df['Text'].str.extract(r"(.*" + pattern + ")", expand=False)


#Splitting name from authority
#df[['OrgName', 'Authority']] = df['Text'].str.split(' ', n=1, expand=True)
df


ValueError: Columns must be same length as key

In [None]:
#skip this
def add_organism_name(df, column_to_check, pattern, new_column_name):
    for index, row in df.iterrows():
        #for item in search:
        if pattern in row[column_to_check]:
            pass
            #df.loc[index, new_column_name] = df['Text'].str.extract(r"(.*" + pattern + ")", expand=False)
                #df.loc[index, column_to_check] = row[column_to_check].replace(item, '').strip()
        else:
            df.loc[index, new_column_name] = df['Text'].str.split(' ', n=1, expand=True)
    return df
    
pattern = 'incertae sedis'
df = add_organism_name(df, 'Text', pattern, 'OrgName')
df








pattern = r'incertae sedis'
#df['Name1'] = df['Text'].str.extract(pattern, expand=True)
#df['Name1']= df['Text'].str.extract(r"(.*" + pattern + ")", expand=False)

#Splitting name from authority
#df[['Name2', 'Authority in paper']] = df['Text'].str.split(' ', n=1, expand=True)

#df = df.drop('Text', axis=1)


In [None]:
#skip this
def process_data(df, column_name, pattern, new_column_name):
    """
    Searches for a pattern in a DataFrame column and adds a new column 
    based on whether the pattern is found.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): The name of the column to search within.
        pattern (str): The regular expression pattern to search for.
        new_column_name (str): The name of the new column to create.

    Returns:
        pd.DataFrame: The modified DataFrame with the new column.
    """
    def check_pattern(text):
        if isinstance(text, str) and re.search(pattern, text):
            #return "Pattern Found"
            #return df['Text'].str.extract(r"(.*" + pattern + ")", expand=False)
            df['OrgName']= df['Text'].str.extract(r"(.*" + pattern + ")", expand=False)
        else:
            #return "Pattern Not Found"
            #return df['Text'].str.split(' ', n=1, expand=True)
            df[['OrgName', 'Authority in paper']] = df['Text'].str.split(' ', n=1, expand=True)

    df[new_column_name] = df[column_name].apply(check_pattern)
    return df


pattern = r'incertae sedis'
new_column_name = "OrgName"
df = process_data(df, 'Text', pattern, new_column_name)
print(df)

In [None]:
def extract_pattern(df, text_column, pattern):
    """
    Creates a new column in a Pandas DataFrame where if a pattern is found in the text column, it extracts the match and all preceding text, 
    otherwise splits the first word of the text into two new columns.
    
    Args:
        df (pd.DataFrame): The input DataFrame.
        text_column (str): The name of the column containing the text to search.
        pattern (str): The pattern to search for in the text.
    
    Returns: 
        pd.DataFrame: The updated DataFrame with new columns.
    """
    
    df['OrgName'] = np.nan
    #df['first_word'] = np.nan
    df['Authority'] = np.nan
    
    for index, text in enumerate(df[text_column]):
        match = re.search(pattern, text)
        if match:
            start_index = match.start()
            df.loc[index, 'OrgName'] = text[:start_index + match.end()]
            df.loc[index, 'Authority'] = text[start_index + match.end:()]
        else:
            words = text.split()
            if len(words) > 0:
                df.loc[index, 'OrgName'] = words[0]
                df.loc[index, 'Authority'] = " ".join(words[1:])
    
    return df

pattern = r'incertae sedis'
extract_pattern(df, 'Text', pattern)

### Complete populating rank based on text patterns in Organism Name -mycotina=subphylum, -mycetes=class, -ales=order, -aceae=family

### Merge in Mycobank data

### Save final output to Excel

In [20]:
df.to_excel (r'C:\Users\mcveigh\Documents\PythonPC\PDFminertest.xlsx', index = False, header=True)

In [6]:
import pandas as pd

# Sample DataFrame
data = {'col': ['apple pie', 'banana', 'cherry tart', 'date pudding', 'elderberry']}
df2 = pd.DataFrame(data)

for index, row in df2.iterrows():
    if isinstance(row['col'], str) and 'berry' in row['col']:
        # Do something if 'col' is a string and contains 'berry'
        print(f"Row {index}: Contains 'berry': {row['col']}")
    else:
        # Do something else if 'col' is not a string or doesn't contain 'berry'
        print(f"Row {index}: Does not contain 'berry': {row['col']}")
        

Row 0: Does not contain 'berry': apple pie
Row 1: Does not contain 'berry': banana
Row 2: Does not contain 'berry': cherry tart
Row 3: Does not contain 'berry': date pudding
Row 4: Contains 'berry': elderberry
