## pymupdf PDF miner script 
### Extact lines of text from pdf file by page number, convert it into a format for taxonomy update and merge mycobank data. Output saved to excel

In [None]:
pip install pymupdf

In [1]:
import pymupdf
import pandas as pd
import re
import numpy as np

doc = pymupdf.open("2024__The2024OutlineofFungiandfungus-liketaxa.pdf") 
#out = open("output.txt", "wb") # create a text output

def extract_text_from_page_span(doc, start_page, end_page):
    text = ""
    for page in doc.pages(start_page, end_page, 1): 
        #text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
        text += page.get_text(sort=True) #preserves the ws at the start of the line
        #text += page.get_text()  #collapses ws at the start of the line
    return text

def join_indented_lines(text):
    lines = text.splitlines()
    result = []
    previous_line = None

    for line in lines:
        #if line.startswith((' ', '\t')):  # Check for indentation (spaces or tabs)
        if line.startswith(' '):  # Check for indentation (spaces or tabs)
            if previous_line is not None:
                result[-1] = result[-1] + ' ' + line.lstrip()  # Append to previous
            else:
                result.append(line.lstrip()) #if the first line is indented, append it directly
        else:
            result.append(line)
            previous_line = line

    return '\n'.join(result)


start_page = 44 #40
end_page = 332
extracted_text = extract_text_from_page_span(doc, start_page, end_page)
#extracted_text = re.sub(r",\s*\n", " ", extracted_text)  #finds lines ending in a comma and appends them to the previous line
text_lines = extracted_text.splitlines()
result_text = join_indented_lines(extracted_text)
#print(result_text)
revised_text= (re.sub(r"\s\d{4}", "", result_text)) 
#print(revised_text)

### Load text to pandas

In [2]:
pd.set_option('display.width', 2000) 
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', None)

text_lines = revised_text.splitlines()

def create_dataframe(text_lines):
    df = pd.DataFrame(text_lines, columns=['Text'])
    return df

df = create_dataframe(text_lines)
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)
#df = df.iloc[6:]
df.head(35)

Unnamed: 0,Text
0,"Johansoniaceae Doilom, Phookamsak & K.D. Hyde*FoF04619"
1,Johansonia Sacc. (13)*FoF07635
2,Orthobellus A.A. Silva & Cavalc. (3)*FoF01955
4,"Neoantennariellaceae Abdollahz. & Crous*Notes 554, 555, FoF12736"
5,"Cippumomyces Crous, Overton & Ricci (2)*Note 759, FoF15609"
6,"Fumiglobus D.R. Reynolds & G.S. Gilbert (9)*Note 555, FoF06947"
7,"Neoantennariella Abdollahz. & Crous (1)*Note 554, FoF11064"
8,"Neoasbolisia Abdollahz. & Crous (1)*Note 555, FoF11066"
10,"Piedraiaceae Viégas ex Cif., Bat. & S. Camposa*FoF06972"
11,Piedraia Fonseca. & Leãno (3)*FoF06972


#### Modify the extracted text now in pandas df

#### Extract rank from text

In [3]:
#df['OrgName'] = np.nan
#df['Authority'] = np.nan

def add_column_if_list_member_found(df, column_to_check, list_to_search, new_column_name):
    for index, row in df.iterrows():
        for item in list_to_search:
            if item in row[column_to_check]:
                df.loc[index, new_column_name] = item
                df.loc[index, column_to_check] = row[column_to_check].replace(item, '').strip()
                break # Stop searching after the first match                
    return df
search_rank = ['Phylum', 'Class', 'Subphylum']

df = add_column_if_list_member_found(df, 'Text', search_rank, 'Rank')
df.head(10)

Unnamed: 0,Text,Rank
0,"Johansoniaceae Doilom, Phookamsak & K.D. Hyde*FoF04619",
1,Johansonia Sacc. (13)*FoF07635,
2,Orthobellus A.A. Silva & Cavalc. (3)*FoF01955,
4,"Neoantennariellaceae Abdollahz. & Crous*Notes 554, 555, FoF12736",
5,"Cippumomyces Crous, Overton & Ricci (2)*Note 759, FoF15609",
6,"Fumiglobus D.R. Reynolds & G.S. Gilbert (9)*Note 555, FoF06947",
7,"Neoantennariella Abdollahz. & Crous (1)*Note 554, FoF11064",
8,"Neoasbolisia Abdollahz. & Crous (1)*Note 555, FoF11066",
10,"Piedraiaceae Viégas ex Cif., Bat. & S. Camposa*FoF06972",
11,Piedraia Fonseca. & Leãno (3)*FoF06972,


In [None]:
print(df.dtypes)

## Find and populate OrgName and Authority column
#### If Text contains 'incertae sedis' split text so that 'incertae sedis' and all text preceding it go in OrgName. Everything following 'incertae sedis' then goes into Authority
#### Else if Text does not contain 'incertae sedis' split Text at the first space so that the first word goes in OrgName and everything else goes in Authority


In [4]:
additional_cols = ['OrgName','Authority']
df2 = df.reindex(df.columns.tolist() + additional_cols, axis = 1)
df2.head(10)

Unnamed: 0,Text,Rank,OrgName,Authority
0,"Johansoniaceae Doilom, Phookamsak & K.D. Hyde*FoF04619",,,
1,Johansonia Sacc. (13)*FoF07635,,,
2,Orthobellus A.A. Silva & Cavalc. (3)*FoF01955,,,
4,"Neoantennariellaceae Abdollahz. & Crous*Notes 554, 555, FoF12736",,,
5,"Cippumomyces Crous, Overton & Ricci (2)*Note 759, FoF15609",,,
6,"Fumiglobus D.R. Reynolds & G.S. Gilbert (9)*Note 555, FoF06947",,,
7,"Neoantennariella Abdollahz. & Crous (1)*Note 554, FoF11064",,,
8,"Neoasbolisia Abdollahz. & Crous (1)*Note 555, FoF11066",,,
10,"Piedraiaceae Viégas ex Cif., Bat. & S. Camposa*FoF06972",,,
11,Piedraia Fonseca. & Leãno (3)*FoF06972,,,


In [5]:
# Use loc to conditionally split the column

#search_text = 'incertae sedis'
pattern2 = r"(.+?incertae sedis)"
condition = df2['Text'].str.contains('incertae sedis')


#search_subphylum = 'mycotina'
#search_class = 'mycetes'
#search_order = 'ales'
#search_family = 'aceae'

#subphylum_apply = 'subphylum'
#class_apply = 'class'
#order_apply = 'order'
#family_apply = 'family'
#new_column_name = 'Rank2'

for index, row in df2.iterrows():
    if isinstance(row['Text'], str) and 'incertae sedis' in row['Text']:
        #print(f"Row {index}: incertae sedis found {row['Text']}")
        df2['OrgName'] = df2['Text'].str.extract(pattern2, expand=False)
        df2['Authority'] = df2['Text'].str.extract(r'' + re.escape('incertae sedis') + r'(.*)', expand=False)
        #df2[new_column_name] = df2['OrgName'].apply(lambda x: subphylum_apply if search_subphylum in str(x) else '')

    elif isinstance(row['Text'], str) and 'incertae sedis' not in row['Text']:
        df2.loc[~condition, 'OrgName'] = df2.loc[~condition, 'Text'].str.split(' ', n=1, expand=True)[0]
        df2.loc[~condition, 'Authority'] = df2.loc[~condition, 'Text'].str.split(' ', n=1, expand=True)[1]

        
df2.head(25)


 'Notes']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df2.loc[~condition, 'OrgName'] = df2.loc[~condition, 'Text'].str.split(' ', n=1, expand=True)[0]
 'A.A. Silva & Cavalc. (3)*FoF01955' ...
 'Tedersoo, nom. inval. (1)*Tedersoo et al.'
 'Tedersoo, nom. inval. (1)*Tedersoo et al.'
 'on new genera and higher taxa']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df2.loc[~condition, 'Authority'] = df2.loc[~condition, 'Text'].str.split(' ', n=1, expand=True)[1]


Unnamed: 0,Text,Rank,OrgName,Authority
0,"Johansoniaceae Doilom, Phookamsak & K.D. Hyde*FoF04619",,Johansoniaceae,"Doilom, Phookamsak & K.D. Hyde*FoF04619"
1,Johansonia Sacc. (13)*FoF07635,,Johansonia,Sacc. (13)*FoF07635
2,Orthobellus A.A. Silva & Cavalc. (3)*FoF01955,,Orthobellus,A.A. Silva & Cavalc. (3)*FoF01955
4,"Neoantennariellaceae Abdollahz. & Crous*Notes 554, 555, FoF12736",,Neoantennariellaceae,"Abdollahz. & Crous*Notes 554, 555, FoF12736"
5,"Cippumomyces Crous, Overton & Ricci (2)*Note 759, FoF15609",,Cippumomyces,"Crous, Overton & Ricci (2)*Note 759, FoF15609"
6,"Fumiglobus D.R. Reynolds & G.S. Gilbert (9)*Note 555, FoF06947",,Fumiglobus,"D.R. Reynolds & G.S. Gilbert (9)*Note 555, FoF06947"
7,"Neoantennariella Abdollahz. & Crous (1)*Note 554, FoF11064",,Neoantennariella,"Abdollahz. & Crous (1)*Note 554, FoF11064"
8,"Neoasbolisia Abdollahz. & Crous (1)*Note 555, FoF11066",,Neoasbolisia,"Abdollahz. & Crous (1)*Note 555, FoF11066"
10,"Piedraiaceae Viégas ex Cif., Bat. & S. Camposa*FoF06972",,Piedraiaceae,"Viégas ex Cif., Bat. & S. Camposa*FoF06972"
11,Piedraia Fonseca. & Leãno (3)*FoF06972,,Piedraia,Fonseca. & Leãno (3)*FoF06972


### Complete populating rank based on text patterns found in Organism Name -mycotina=subphylum, -mycetes=class, -ales=order, -aceae=family

In [6]:
def assign_category(row, search_terms, category_mapping, target_column, new_column_name):
    """
    Assigns a category to a row based on matching text in a specified column.

    Args:
        row (pd.Series): A row of the DataFrame.
        search_terms (dict): Dictionary of categories and their corresponding search terms (regex patterns).
        category_mapping (dict): Dictionary mapping categories to desired output strings.
	    target_column (str): Name of the column to search within.
        new_column_name (str): Name of the new column to create.

    Returns:
        str: The assigned category string, or None if no match is found.
    """
    text = row[target_column]
    for category, patterns in search_terms.items():
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return category_mapping[category]
    return None

def categorize_dataframe(df, search_terms, category_mapping, target_column, new_column_name):
     """
    Applies the assign_category function to each row of the DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        search_terms (dict): Dictionary of categories and their corresponding search terms.
        category_mapping (dict): Dictionary mapping categories to desired output strings.
        target_column (str): Name of the column to search within.
        new_column_name (str): Name of the new column to create.

    Returns:
        pd.DataFrame: The DataFrame with the new category column.
    """
     df[new_column_name] = df.apply(assign_category, axis=1, search_terms=search_terms, category_mapping = category_mapping, target_column = target_column, new_column_name = new_column_name)
     return df


search_subphylum = 'mycotina'
search_class = 'mycetes'
search_order = 'ales'
search_family = 'aceae'

search_terms = {
    'subphylum': [r'mycotina'],
    'class': [r'mycetes'],
    'order': [r'ales'],
    'family': [r'aceae']
}

category_mapping = {
    'subphylum': 'subphylum',
    'class': 'class',
    'order': 'order',
    'family': 'family'
}

target_column = 'OrgName'
new_column_name = 'Rank2'

df2 = categorize_dataframe(df2, search_terms, category_mapping, target_column, new_column_name)
print(df2)

                                                                   Text Rank               OrgName                                      Authority   Rank2
0                Johansoniaceae Doilom, Phookamsak & K.D. Hyde*FoF04619  NaN        Johansoniaceae        Doilom, Phookamsak & K.D. Hyde*FoF04619  family
1                                        Johansonia Sacc. (13)*FoF07635  NaN            Johansonia                            Sacc. (13)*FoF07635    None
2                         Orthobellus A.A. Silva & Cavalc. (3)*FoF01955  NaN           Orthobellus              A.A. Silva & Cavalc. (3)*FoF01955    None
4      Neoantennariellaceae Abdollahz. & Crous*Notes 554, 555, FoF12736  NaN  Neoantennariellaceae    Abdollahz. & Crous*Notes 554, 555, FoF12736  family
5            Cippumomyces Crous, Overton & Ricci (2)*Note 759, FoF15609  NaN          Cippumomyces  Crous, Overton & Ricci (2)*Note 759, FoF15609    None
...                                                                 ...  ...

In [None]:
#df2.loc[df['Authority'].str.contains(search_text), ['synonym1', 'synonym2']] = df.loc[df['text_column'].str.contains(search_text), 'text_column'].str.split('', n=1, expand=True)


In [7]:
def split_column(df, column_name):
    """
    Splits a DataFrame column into multiple columns based on '=' delimiter.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): The name of the column to split.

    Returns:
        pd.DataFrame: The DataFrame with the split columns.
    """
    split_series = df[column_name].str.split("=", expand=True)
    num_cols = split_series.shape[1]
    new_col_names = [f'{column_name}_{i+1}' for i in range(num_cols)]
    split_series.columns = new_col_names
    df = pd.concat([df, split_series], axis=1)
    df = df.drop(column_name, axis=1)
    return df

# Example Usage
df3 = split_column(df2, 'Authority')
df3.head(25)

                                                                   Text Rank               OrgName   Rank2                                    Authority_1 Authority_2 Authority_3 Authority_4 Authority_5 Authority_6 Authority_7 Authority_8 Authority_9 Authority_10 Authority_11 Authority_12 Authority_13 Authority_14 Authority_15 Authority_16 Authority_17 Authority_18
0                Johansoniaceae Doilom, Phookamsak & K.D. Hyde*FoF04619  NaN        Johansoniaceae  family        Doilom, Phookamsak & K.D. Hyde*FoF04619        None        None        None        None        None        None        None        None         None         None         None         None         None         None         None         None         None
1                                        Johansonia Sacc. (13)*FoF07635  NaN            Johansonia    None                            Sacc. (13)*FoF07635        None        None        None        None        None        None        None        None         None    

### Merge in Mycobank data from RESP API

In [None]:
MBdata = (r'mycobank_combined.xlsx')
MBdata_df = pd.read_excel(MBdata, index_col=None)
#MBList_df = MBist_df.rename(columns={"ID": "id", "Taxon name": "name"}, inplace=True)
MBdata_df.set_index('id', inplace=True)
MBdata_df.head(25)

### Save final output to Excel

In [8]:
df3.to_excel (r'C:\Users\mcveigh\Documents\PythonPC\PDFminertest.xlsx', index = False, header=True)

### TEST code that can be ignored

In [None]:


# Sample DataFrame
data = {'col': ['apple pie', 'banana', 'cherry tart', 'date pudding', 'elderberry']}
df2 = pd.DataFrame(data)

for index, row in df2.iterrows():
    if isinstance(row['col'], str) and 'berry' in row['col']:
        # Do something if 'col' is a string and contains 'berry'
        print(f"Row {index}: Contains 'berry': {row['col']}")
    else:
        # Do something else if 'col' is not a string or doesn't contain 'berry'
        print(f"Row {index}: Does not contain 'berry': {row['col']}")
        

In [None]:
# Sample DataFrame
data = {'text_column': ['apple pie', 'banana', 'cherry tart', 'date']}
df = pd.DataFrame(data)

# Text to search for
search_text = 'pie'

# Create new columns 'first_part' and 'second_part'
df['first_part']  = ''
df['second_part']  = ''

# Use loc to conditionally split the column
df.loc[df['text_column'].str.contains(search_text), ['first_part', 'second_part']] = df.loc[df['text_column'].str.contains(search_text), 'text_column'].str.split('', n=1, expand=True)

df.loc[~df['text_column'].str.contains(search_text), 'second_part'] = df.loc[~df['text_column'].str.contains(search_text), 'text_column']

print(df)
