In [95]:
from PyPDF2 import PdfReader
import os
import pandas as pd
import numpy as np

file = "content/SE Result PDF.pdf"

In [96]:
if os.path.exists("ouput.txt"):
    os.remove("output.txt")
if os.path.exists("extracted_data.txt"):
    os.remove("extracted_data.txt")

In [97]:
from PyPDF2 import PdfReader


def extract_and_remove_spaces(file, page_range=None, output_file=None):
    """
    file: input pdf file
    page_range: a tuple (start, end) representing the range of pages to scrape,
                or None to extract all pages
    output_file: the output file where the modified data will be saved,
                 or None to not save to a file
    """
    try:
        reader = PdfReader(file)

        # Determine the range of pages to extract
        if page_range is None:
            start_page = 0
            end_page = len(reader.pages)
        else:
            start_page, end_page = page_range

        extracted_text = ""

        # Extract and process pages within the specified range
        for page_no in range(start_page, end_page):
            page = reader.pages[page_no]
            page_text = page.extract_text()

            # Remove leading spaces until a non-space character is encountered
            modified_text = "\n".join(line.lstrip() for line in page_text.splitlines())

            extracted_text += modified_text + "\n"

        # Output the modified text to the specified file or print it
        if output_file is not None:
            with open(output_file, "w") as f:
                f.write(extracted_text)
            print(f"Data extracted and saved to {output_file}")
        else:
            print(extracted_text)
    except Exception as e:
        print(f"An error occurred: {e}")


# Example usage:
input_pdf = file
page_range = (0, 4)  # Extract pages 1 to 3 (0-based index)
output_txt = "output.txt"
extract_and_remove_spaces(input_pdf, page_range, output_txt)

Data extracted and saved to output.txt


In [98]:
def remove_empty_lines(input_file):
    try:
        with open(input_file, "r") as file:
            lines = file.readlines()

        with open(input_file, "w") as file:
            for line in lines:
                if (
                    line.strip()
                ):  # Check if the line is not empty after stripping whitespace
                    file.write(line)

        print(f"Empty lines removed from {input_file}")
    except FileNotFoundError:
        print(f"File '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


# Example usage:
input_file = "output.txt"  # Replace with your input file path
remove_empty_lines(input_file)

Empty lines removed from output.txt


In [99]:
def process_string_block(string_block):
    # Convert the string block into a list, and remove empty strings
    string_list = [s for s in string_block.split(" ") if s]
    # Remove all the ":" characters
    string_list = [s.replace(":", "") for s in string_list]
    string_list = [s.replace("*", "") for s in string_list]
    # Remove all the empty strings
    string_list = [s for s in string_list if s]

    # Check if there are at least 4 items in the list
    if len(string_list) >= 4:
        # Get the -4th index item
        item_to_split = string_list[-4]
        # Separate the last 3 digits
        last_3_digits = item_to_split[-3:]
        # Remove the last 3 digits from the item
        item_without_last_3_digits = item_to_split[:-3]

        # Insert the modified item back into the list at the same place
        string_list[-4] = item_without_last_3_digits
        # Insert the last 3 digits as a new item after the modified item
        string_list.insert(-3, last_3_digits)

    string_list = [s for s in string_list if s]

    return string_list


# Example usage:
# string_block = "SEAT NO.: S190243001 NAME : AASHUTOSH SANJAYRAO GUNTURKAR           MOTHER : SHITAL SANJAYRAO GUNTURKARPRN :71907142K CLG.: DYPIT[24]"
string_block = "SEAT NO.: S190243002 NAME : ADMANE PARTH SUDHIR                     MOTHER : RUPALI SUDHIR ADMANE     PRN :72022819M CLG.: DYPIT[24]"
result_list = process_string_block(string_block)
print(result_list)

['SEAT', 'NO.', 'S190243002', 'NAME', 'ADMANE', 'PARTH', 'SUDHIR', 'MOTHER', 'RUPALI', 'SUDHIR', 'ADMANE', 'PRN', '72022819M', 'CLG.', 'DYPIT[24]']


In [100]:
def merge_name_mother_prn(result_list):
    # Find the index of "NAME" and "MOTHER"
    name_index = result_list.index("NAME")
    mother_index = result_list.index("MOTHER")

    # Find the indices between "NAME" and "MOTHER"
    name_to_mother_indices = list(range(name_index + 1, mother_index))
    name_to_mother_merged = " ".join(result_list[i] for i in name_to_mother_indices)
    result_list[name_index + 1 : mother_index] = [name_to_mother_merged]

    mother_index = result_list.index("MOTHER")
    prn_index = result_list.index("PRN")
    mother_to_prn_indices = list(range(mother_index + 1, prn_index))
    mother_to_prn_merged = " ".join(result_list[i] for i in mother_to_prn_indices)

    result_list[mother_index + 1 : prn_index] = [mother_to_prn_merged]

    return result_list


# Example usage:
# string_block = "SEAT NO.: S190243001 NAME : AASHUTOSH SANJAYRAO GUNTURKAR           MOTHER : SHITAL SANJAYRAO GUNTURKARPRN :71907142K CLG.: DYPIT[24]"
# string_block = "SEAT NO.: S190243002 NAME : ADMANE PARTH SUDHIR                     MOTHER : RUPALI SUDHIR ADMANE     PRN :72022819M CLG.: DYPIT[24]"
string_block = "SEAT NO.: S190243003 NAME : ALKA GUPTA                              MOTHER : URMILA GUPTA             PRN :72022830B CLG.: DYPIT[24]"
result_list = process_string_block(string_block)
merged_result = merge_name_mother_prn(result_list)
print(merged_result)

['SEAT', 'NO.', 'S190243003', 'NAME', 'ALKA GUPTA', 'MOTHER', 'URMILA GUPTA', 'PRN', '72022830B', 'CLG.', 'DYPIT[24]']


In [101]:
import pandas as pd


def extract_student_info(merged_result):
    student_info = {
        "name": None,
        "mother_name": None,
        "seat_number": None,
        "prn_number": None,
        "college": None,
    }

    for i in range(len(merged_result)):
        item = merged_result[i].strip()
        if item == "SEAT":
            student_info["seat_number"] = merged_result[i + 2].strip()
        elif item == "NAME":
            student_info["name"] = merged_result[i + 1]
        elif item == "MOTHER":
            student_info["mother_name"] = merged_result[i + 1]
        elif item == "PRN":
            student_info["prn_number"] = merged_result[i + 1]
        elif item == "CLG.":
            student_info["college"] = merged_result[i + 1]

    return student_info


def create_dataframe_from_list(merged_results_list):
    data = []
    student_info = {}

    for merged_result in merged_results_list:
        student_info.update(extract_student_info(merged_result))
        data.append(student_info)
        student_info = {}

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(data)
    return df


# Example usage:
# merged_results_list = [['SEAT', 'NO.', 'S190243001', 'NAME', 'AASHUTOSH SANJAYRAO GUNTURKAR', 'MOTHER', 'SHITAL SANJAYRAO GUNTURKAR', 'PRN', '71907142K', 'CLG.', 'DYPIT[24]']]

merged_results_list = [merged_result]
df = create_dataframe_from_list(merged_results_list)

# Display the DataFrame
df

Unnamed: 0,name,mother_name,seat_number,prn_number,college
0,ALKA GUPTA,URMILA GUPTA,S190243003,72022830B,DYPIT[24]


In [102]:
# Example usage:
# string_block = "204181 ELECTRONIC CIRCUITS              030/030  046/070  076/100    ---      ---      ---    76   03     B   08  24  --- ---"
string_block1 = "204181 ELECTRONIC CIRCUITS              030/030  046/070  076/100    ---      ---      ---    76   03     B   08  24  --- ---"
string_block2 = "204185 ELECTRONIC CIRCUIT LAB             ---      ---      ---      ---    027/050    ---    54   01     D   06  06  --- ---"
string_block3 = "204190A TECHNICAL ENGLISH FOR ENGG.        ---      ---      ---      ---      ---      ---    AC   00    AC   00  00  --- ---"
result_list = process_string_block(string_block3)
print(result_list)

['204190A', 'TECHNICAL', 'ENGLISH', 'FOR', 'ENGG.', '---', '---', '---', '---', '---', '---', 'AC', '00', 'AC', '00', '00', '---', '---']


In [103]:
def merge_subject_name(lst):
    # Find the start and end indices
    start_index = 1
    end_index = -13  # 13th item from the end, including the 0th item

    # Slice the list and join the elements
    merged_items = (
        " ".join(lst[start_index:end_index]) if start_index < len(lst) else ""
    )

    # Replace the merged items in the list
    lst[start_index:end_index] = [merged_items]

    return lst


# Example usage:
merged_result_list = merge_subject_name(result_list)
print(merged_result_list)

['204190A', 'TECHNICAL ENGLISH FOR ENGG.', '---', '---', '---', '---', '---', '---', 'AC', '00', 'AC', '00', '00', '---', '---']


In [105]:
with open("output.txt", "r") as file:
    # Read each line from the file
    for line in file:
        # Remove trailing newline characters and whitespace
        line = line.strip()
        
        # Process the line using your function
        processed_line = process_string_block(line)
        
        # Print the processed output
        print(processed_line)

['PAGE', '-', '1', 'SAVITRIBAI', 'PHULE', 'PUNE', 'UNIVERSITY', ',S.E.(2019', 'CREDIT', 'PAT.)', 'EXAMINATION,', 'APRIL/MAY', '2021', 'D', 'ATE', '23', 'AUG', '2021']
['COLLEGE', '[CEGP014270]', '-', 'DR.', 'D.Y.PATIL', 'IN', 'ST.', 'OF', 'TECH.', 'PUNE']
['BRANCH', 'C', 'ODE', '16-S.E.(2019', 'PAT.)(ELECTRONICS', '&TELECOM)']
['..................................................................................................................................']
['SEAT', 'NO.', 'S190243001', 'NAME', 'AASHUTOSH', 'SANJAYRAO', 'GUNTURKAR', 'MOTHER', 'SHITAL', 'SANJAYRAO', 'GUNTURKAR', 'PRN', '71907142K', 'CLG.', 'DYPIT[24]']
['COURSE', 'NAME', 'ISE', 'ESE', 'TOTAL', 'TW', 'PR', 'OR', 'Tot%', 'Crd', 'Grd', 'GP', 'CP', 'P&R', 'ORD']
['SEM.1', '............', '.......', '.......', '.......', '.......', '.......', '.......', '...', '...', '...', '...', '...', '...', '...']
['204181', 'ELECTRONIC', 'CIRCUITS', '030/030', '046/070', '076/100', '---', '---', '---', '76', '03', 'B', '08', '24', '--