In [48]:
import pandas as pd
import re

def extract_data(text):
    """Extract applicant fields from one chunk of report text.

    Every pattern is searched case-insensitively and only the first
    occurrence is used. A field whose label is not found is returned as ''.

    Parameters
    ----------
    text : str
        Text to scan. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell) yields nine empty strings
        instead of raising TypeError inside ``re.search``.

    Returns
    -------
    tuple of 9 str
        (name, dob, high-school admission requirement, high-school GPA,
        bachelor equivalence, undergraduate GPA, master equivalence,
        graduate GPA, printed date), in that order.
    """
    if not isinstance(text, str):
        return ('',) * 9

    def _first_group(pattern):
        # Shared search-and-default step: first capture group, or '' when absent.
        match = re.search(pattern, text, re.IGNORECASE)
        return match.group(1).strip() if match else ''

    # Four whitespace-separated tokens following 'name:'
    name_match = re.search(r'name:\s*([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)', text, re.IGNORECASE)
    name = ' '.join(name_match.groups()) if name_match else ''

    # Single token following 'printed:'
    printed_date = _first_group(r'printed:\s*([^\s]+)')

    # GPA values: optional integer part, optional dot, at least one digit
    high_school = _first_group(r'high school:\s*([0-9]*\.?[0-9]+)')
    ug = _first_group(r'undergraduate:\s*([0-9]*\.?[0-9]+)')
    # The lookbehind stops 'graduate:' from matching the tail of
    # 'undergraduate:', which previously made the graduate GPA a copy of the
    # undergraduate GPA whenever the undergraduate label appeared first.
    g = _first_group(r'(?<!under)graduate:\s*([0-9]*\.?[0-9]+)')

    # Date of the form 'Month D, YYYY' following 'DOB:'
    dob = _first_group(r'DOB:\s*([A-Za-z]+\s+\d{1,2},\s+\d{4})')

    # Six tokens following each 'U.S. Equivalence:' label. The dots in 'U.S.'
    # are left unescaped, so each one matches any single character.
    b_eq = _first_group(r'High School Diploma U.S. Equivalence:\s*((\S+\s+){5}\S+)')
    m_eq = _first_group(r'ny, U.S. Equivalence:\s*((\S+\s+){5}\S+)')

    # Four tokens following 'admission requirement:'
    hs_eq = _first_group(r'admission requirement:\s*((\S+\s+){3}\S+)')

    return name, dob, hs_eq, high_school, b_eq, ug, m_eq, g, printed_date
    
def extract_data_from_pdf_text(input_csv, output_csv):
    """Collapse per-row extracted fields into a single-row summary CSV.

    Reads ``input_csv``, runs ``extract_data`` on every value of its 'text'
    column, and for each field keeps the first non-empty value found (in row
    order). The resulting one-row frame is printed and written to
    ``output_csv`` without the index.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file that must contain a 'text' column.
    output_csv : str or path-like
        Destination for the one-row summary CSV.

    Notes
    -----
    The previous implementation picked ``values[1]`` of each column's unique
    values. That raised IndexError when a column had exactly one unique value,
    and returned the wrong entry unless the first row happened to produce ''.
    Missing fields are '' rather than NaN, so the old ``dropna`` calls never
    removed anything; empty strings are now skipped explicitly.
    """
    # Output headers, in the same order as the tuple returned by extract_data.
    # The spelling of the third header is kept unchanged so the CSV schema
    # seen by existing consumers does not change.
    output_headers = [
        'Name',
        'DOB',
        'Admission Reuirement for High school Diploma',
        'High school GPA',
        'Undergraduate Degree',
        'Undergraduate GPA',
        'Graduate Degree',
        'Graduate GPA',
        'Printed Date',
    ]
    no_fields = ('',) * len(output_headers)

    # Read the input CSV file
    df = pd.read_csv(input_csv)

    # One tuple of fields per input row. Empty cells are read as NaN (not str)
    # and contribute no fields. A plain list also works for an empty frame.
    rows = [
        extract_data(text) if isinstance(text, str) else no_fields
        for text in df['text']
    ]

    # For each field keep the first non-empty value, or '' if none was found
    final_data = {
        header: next((row[position] for row in rows if row[position]), '')
        for position, header in enumerate(output_headers)
    }

    final_df = pd.DataFrame([final_data])
    print(final_df)
    # Write the new DataFrame to the output CSV file
    final_df.to_csv(output_csv, index=False)

In [49]:
# Raw strings keep backslashes literal, so a single backslash is the path
# separator; the doubled form put two backslashes into the path and the message.
input_csv = r'D:\output.csv'  # Input CSV generated by the previous script
output_csv = r'D:\extracted_data.csv'  # Output CSV holding the one-row summary of extracted fields

extract_data_from_pdf_text(input_csv, output_csv)
print(f"Extracted data saved to {output_csv}")

                               Name            DOB  \
0  Phaneendra Sai Sri DEVABHAKTHUNI  July 21, 1996   

  Admission Reuirement for High school Diploma High school GPA  \
0              completion of tenth grade-level            4.00   

                               Undergraduate Degree Undergraduate GPA  \
0  Bachelor of Technology in Mechanical Engineering              3.37   

                                 Graduate Degree Graduate GPA Printed Date  
0  Master of Science in Mechatronics Engineering         3.37    9/13/2021  
Extracted data saved to D:\\extracted_data.csv
