# Definitions and Loading

In [1]:
from PyPDF2 import PdfReader
import os
import pandas as pd
import numpy as np

file = "contents/CEGP014270_F.E.(2019 PAT.) (2) (1).pdf"

In [2]:
if os.path.exists("ouput.txt"):
    os.remove("output.txt")
if os.path.exists("extracted_data.txt"):
    os.remove("extracted_data.txt")
if os.path.exists("data.json"):
    os.remove("data.json")

In [3]:
from PyPDF2 import PdfReader


def extract_and_remove_spaces(file, page_range=None, output_file=None):
    """
    file: input pdf file
    page_range: a tuple (start, end) representing the range of pages to scrape,
                or None to extract all pages
    output_file: the output file where the modified data will be saved,
                 or None to not save to a file
    """
    try:
        reader = PdfReader(file)

        # Determine the range of pages to extract
        if page_range is None:
            start_page = 0
            end_page = len(reader.pages)
        else:
            start_page, end_page = page_range

        extracted_text = ""

        # Extract and process pages within the specified range
        for page_no in range(start_page, end_page):
            page = reader.pages[page_no]
            page_text = page.extract_text()

            # Remove leading spaces until a non-space character is encountered
            modified_text = "\n".join(line.lstrip() for line in page_text.splitlines())

            extracted_text += modified_text + "\n"

        # Output the modified text to the specified file or print it
        if output_file is not None:
            with open(output_file, "w") as f:
                f.write(extracted_text)
            print(f"Data extracted and saved to {output_file}")
        else:
            print(extracted_text)
    except Exception as e:
        print(f"An error occurred: {e}")


# Example usage:
input_pdf = file
page_range = (0, 1153)  # Extract pages 1 to 3 (0-based index)
output_txt = "output.txt"
extract_and_remove_spaces(input_pdf, page_range, output_txt)

Data extracted and saved to output.txt


In [4]:
def process_string_block(string_block):
    # Convert the string block into a list, and remove empty strings
    string_list = [s for s in string_block.split(" ") if s]
    # Remove all the ":" characters
    string_list = [s.replace(":", "") for s in string_list]
    # Remove all the empty strings
    string_list = [s for s in string_list if s]

    # Check if there are at least 4 items in the list
    if len(string_list) >= 4:
        # Get the -4th index item
        item_to_split = string_list[-4]
        # Separate the last 3 digits
        last_3_digits = item_to_split[-3:]
        # Remove the last 3 digits from the item
        item_without_last_3_digits = item_to_split[:-3]

        # Insert the modified item back into the list at the same place
        string_list[-4] = item_without_last_3_digits
        # Insert the last 3 digits as a new item after the modified item
        string_list.insert(-3, last_3_digits)

    string_list = [s for s in string_list if s]

    return string_list

In [5]:
def merge_name_mother_prn(result_list):
    # Find the index of "NAME" and "MOTHER"
    name_index = result_list.index("NAME")
    mother_index = result_list.index("MOTHER")

    # Find the indices between "NAME" and "MOTHER"
    name_to_mother_indices = list(range(name_index + 1, mother_index))
    name_to_mother_merged = " ".join(result_list[i] for i in name_to_mother_indices)
    result_list[name_index + 1 : mother_index] = [name_to_mother_merged]

    mother_index = result_list.index("MOTHER")
    prn_index = result_list.index("PRN")
    mother_to_prn_indices = list(range(mother_index + 1, prn_index))
    mother_to_prn_merged = " ".join(result_list[i] for i in mother_to_prn_indices)

    result_list[mother_index + 1 : prn_index] = [mother_to_prn_merged]

    return result_list

In [6]:
def extract_student_info(merged_result):
    student_info = {
        "name": None,
        "mother": None,
        "seat_no": None,
        "prn": None,
    }

    for i in range(len(merged_result)):
        item = merged_result[i].strip()
        if item == "SEAT":
            student_info["seat_no"] = merged_result[i + 2].strip()
        elif item == "NAME":
            student_info["name"] = merged_result[i + 1]
        elif item == "MOTHER":
            student_info["mother"] = merged_result[i + 1]
        elif item == "PRN":
            student_info["prn"] = merged_result[i + 1]

    return student_info

In [7]:
def merge_subject_name(string_block):
    # count the index of the list
    def count_index(string_block):
        count = 0
        for i in string_block:
            count = count + 1
        return count
    if count_index(string_block) == 16:
        #merge index 1 and 2 with a space in between and remove index 2
        string_block[1] = string_block[1] + " " + string_block[2]
        del string_block[2]
        print(string_block)
    elif count_index(string_block) ==17:
        #merge index 1,2 and 3 with a space in between and remove index 2 and 3
        string_block[1] = string_block[1] + " " + string_block[2] + " " + string_block[3]
        del string_block[2]
        del string_block[2]
    elif count_index(string_block) == 18:
        #merge index 1,2,3 and 4 with a space in between and remove index 2,3 and 4
        string_block[1] = string_block[1] + " " + string_block[2] + " " + string_block[3] + " " + string_block[4]
        del string_block[2]
        del string_block[2]
        del string_block[2]
    elif count_index(string_block) == 19:
        #merge index 1,2,3,4 and 5 with a space in between and remove index 2,3,4 and 5
        string_block[1] = string_block[1] + " " + string_block[2] + " " + string_block[3] + " " + string_block[4] + " " + string_block[5]
        del string_block[2]
        del string_block[2]
        del string_block[2]
        del string_block[2]
        
    return string_block

In [8]:
# read output.txt file and delete all lines staring with "COURSE NAME"
with open("output.txt", "r") as f:
    lines = f.readlines()
    with open("extracted_data.txt", "w") as f:
        for line in lines:
            if not line.startswith("COURSE NAME"):
                f.write(line)

In [9]:
def parse_gpa_text(gpa_text):
    gpa_text_list = gpa_text.split(",")
    gpa_text_list = [s.strip() for s in gpa_text_list]
    gpa_text_list = [s.replace(":", "") for s in gpa_text_list]
    gpa_text_list = [s.split(" ") for s in gpa_text_list]
    gpa_text_list = [s for sublist in gpa_text_list for s in sublist]
    gpa_text_list = [s for s in gpa_text_list if s]

    gpa_dict = {}
    sgpa = gpa_text_list[3]
    credits = gpa_text_list[-1]
    gpa_dict["sgpa"] = sgpa
    gpa_dict["credits"] = credits

    return gpa_dict

# Blocks

In [10]:
blocks = []
with open('extracted_data.txt', 'r') as file:
    block = []
    for line in file:
        if line.startswith('SEAT'):
            block = [line]
        elif line.startswith('FIRST'):
            block.append(line)
            blocks.append(''.join(block))
            block = []
        else:
            block.append(line)

In [11]:
second_block = blocks[2]

def split_semesters(text):
    sem_start_indices = [i for i, line in enumerate(text.split('\n')) if line.strip().startswith('SEM.:')]
    sem_start_indices.append(None)  # add sentinel to handle last chunk
    semesters = [text.split('\n')[sem_start_indices[i]:sem_start_indices[i+1]] for i in range(len(sem_start_indices)-1)]
    return semesters


semesters = split_semesters(second_block)
semesters[0].pop(-1)
semesters[1].pop(-1)

block_buffer = second_block.split('\n')[0]
extract_student_info(merge_name_mother_prn(process_string_block(block_buffer)))

def process_semesters_block(semesters_block):
    return [merge_subject_name(process_string_block(line)) for line in semesters_block]

semester_one = process_semesters_block(semesters[0])

# go through each item of the list and remove the first occurence of "*" from the string
for i in range(len(semesters[1])):
    semesters[1][i] = semesters[1][i].replace("*", "", 1)

semester_two = process_semesters_block(semesters[1])
semester_one.pop(0)
semester_two.pop(0)
semester_two.pop(-1)

def process_subjects(semester_one):
    subjects = []

    for subject_info in semester_one:
        subject_dict = {
            "subject_code": subject_info[0],
            "subject_name": subject_info[1],
            "ise": subject_info[2],
            "ese": subject_info[3],
            "total": subject_info[4],
            "tw": subject_info[5],
            "pr": subject_info[6],
            "or": subject_info[7],
            "tot%": subject_info[8],
            "crd": subject_info[9],
            "grd": subject_info[10],
            "gp": subject_info[11],
            "cp": subject_info[12],
            "p&r": subject_info[13],
            "ord": subject_info[14]
        }
        subjects.append(subject_dict)
    return subjects

subjects_sem_1 = process_subjects(semester_one)
subjects_sem_2 = process_subjects(semester_two)

['107009', 'ENGINEERING CHEMISTRY', '029/030', '069/070', '098/100', '---', '---', '---', '98', '04', 'O', '10', '40', '---', '---']
['107009', 'ENGINEERING CHEMISTRY', '---', '---', '---', '---', '023/025', '---', '92', '01', 'O', '10', '10', '---', '---']
['101007', 'ENVIRONMENTAL STUDIES-I', '---', '---', '---', '---', '---', '---', 'AC', '00', 'AC', '00', '00', '---', '---']
['101011', 'ENGINEERING MECHANICS', '030/030', '044/070', '074/100', '---', '---', '---', '74', '03', 'B', '08', '24', '---', '---']
['101011', 'ENGINEERING MECHANICS', '---', '---', '---', '---', '020/025', '---', '80', '01', 'A', '09', '09', '---', '---']
['102012', 'ENGINEERING GRAPHICS', '---', '031/050', '031/50', '---', '---', '---', '62', '01', 'C', '07', '07', '---', '---']
['102012', 'ENGINEERING GRAPHICS', '---', '---', '---', '022/025', '---', '---', '88', '01', 'A', '09', '09', '---', '---']
['107002', 'ENGINEERING PHYSICS', '023/030', '034/070', '057/100', '---', '---', '---', '57', '04', 'D', '06'

In [12]:
def get_subjects_from_block(block_items):
    second_block = block_items
    semesters = split_semesters(second_block)
    semesters[0].pop(-1)
    semesters[1].pop(-1)

    block_buffer = second_block.split('\n')[0]
    extract_student_info(merge_name_mother_prn(process_string_block(block_buffer)))

    semester_one = process_semesters_block(semesters[0])

    for i in range(len(semesters[1])):
        semesters[1][i] = semesters[1][i].replace("*", "", 1)

    semester_two = process_semesters_block(semesters[1])
    semester_one.pop(0)
    semester_two.pop(0)
    semester_two.pop(-1)

    subjects_sem_1 = process_subjects(semester_one)
    subjects_sem_2 = process_subjects(semester_two)

    return {"sem1": subjects_sem_1, "sem2": subjects_sem_2}

In [13]:
abc = get_subjects_from_block(blocks[0]) #works

['107009', 'ENGINEERING CHEMISTRY', '027/030', '053/070', '080/100', '---', '---', '---', '80', '04', 'A', '09', '36', '---', '---']
['107009', 'ENGINEERING CHEMISTRY', '---', '---', '---', '---', '020/025', '---', '80', '01', 'A', '09', '09', '---', '---']
['101007', 'ENVIRONMENTAL STUDIES-I', '---', '---', '---', '---', '---', '---', 'AC', '00', 'AC', '00', '00', '---', '---']
['101011', 'ENGINEERING MECHANICS', '012/030', '012/070', '024/100', '---', '---', '---', 'FF', '03', 'F', '00', '00', '---', '---']
['101011', 'ENGINEERING MECHANICS', '---', '---', '---', '---', '018/025', '---', '72', '01', 'B', '08', '08', '---', '---']
['102012', 'ENGINEERING GRAPHICS', '---', '014/050', '014/50', '---', '---', '---', 'FF', '01', 'F', '00', '00', '---', '---']
['102012', 'ENGINEERING GRAPHICS', '---', '---', '---', '019/025', '---', '---', '76', '01', 'B', '08', '08', '---', '---']
['107002', 'ENGINEERING PHYSICS', '014/030', '018/070', '032/100', '---', '---', '---', 'FF', '04', 'F', '00'

In [14]:
def get_student(block_items):
    first_line = block_items.split('\n')[0]
    block_buffer = first_line
    extract_student_info(merge_name_mother_prn(process_string_block(block_buffer)))
    return extract_student_info(merge_name_mother_prn(process_string_block(block_buffer)))


In [15]:
get_student(blocks[0])

{'name': 'KATKAR SOURABH SUDAM',
 'mother': 'SEEMA',
 'seat_no': 'F190240001',
 'prn': '72287243H'}

In [16]:
def get_gpa(block_items):
    gpa_text = block_items.split('\n')[-2]
    return parse_gpa_text(gpa_text)

In [17]:
get_gpa(blocks[0])

{'sgpa': '--', 'credits': '33'}

In [18]:
def run_loop(blocks):
    students = []
    for block in blocks:
        info = get_student(block)
        marks = get_subjects_from_block(block)
        gpa = get_gpa(block)
        
        dictionary = {**info, **marks, **gpa}
        students.append(dictionary)

    return students

output = run_loop(blocks)

['107009', 'ENGINEERING CHEMISTRY', '027/030', '053/070', '080/100', '---', '---', '---', '80', '04', 'A', '09', '36', '---', '---']
['107009', 'ENGINEERING CHEMISTRY', '---', '---', '---', '---', '020/025', '---', '80', '01', 'A', '09', '09', '---', '---']
['101007', 'ENVIRONMENTAL STUDIES-I', '---', '---', '---', '---', '---', '---', 'AC', '00', 'AC', '00', '00', '---', '---']
['101011', 'ENGINEERING MECHANICS', '012/030', '012/070', '024/100', '---', '---', '---', 'FF', '03', 'F', '00', '00', '---', '---']
['101011', 'ENGINEERING MECHANICS', '---', '---', '---', '---', '018/025', '---', '72', '01', 'B', '08', '08', '---', '---']
['102012', 'ENGINEERING GRAPHICS', '---', '014/050', '014/50', '---', '---', '---', 'FF', '01', 'F', '00', '00', '---', '---']
['102012', 'ENGINEERING GRAPHICS', '---', '---', '---', '019/025', '---', '---', '76', '01', 'B', '08', '08', '---', '---']
['107002', 'ENGINEERING PHYSICS', '014/030', '018/070', '032/100', '---', '---', '---', 'FF', '04', 'F', '00'

In [19]:
# store output as a json file
import json
with open('fe.json', 'w') as outfile:
    json.dump(output, outfile)